data-annotations 2.3.0__tar.gz → 2.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {data_annotations-2.3.0 → data_annotations-2.5.0}/PKG-INFO +162 -14
  2. {data_annotations-2.3.0 → data_annotations-2.5.0}/README.md +159 -12
  3. {data_annotations-2.3.0 → data_annotations-2.5.0}/pyproject.toml +3 -3
  4. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/annotations/models.py +2 -2
  5. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/cli.py +3 -1
  6. data_annotations-2.5.0/src/data_annotations/cli_app/annotate/__init__.py +492 -0
  7. data_annotations-2.5.0/src/data_annotations/cli_app/annotate/helpers.py +605 -0
  8. data_annotations-2.5.0/src/data_annotations/cli_app/answers.py +405 -0
  9. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/common.py +50 -4
  10. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/prompts.py +21 -6
  11. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/provenance_commands.py +48 -11
  12. data_annotations-2.5.0/src/data_annotations/cli_app/publish.py +98 -0
  13. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/__init__.py +6 -0
  14. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/models.py +19 -2
  15. data_annotations-2.5.0/src/data_annotations/provenance/recovery/__init__.py +102 -0
  16. data_annotations-2.5.0/src/data_annotations/provenance/recovery/chain.py +312 -0
  17. data_annotations-2.5.0/src/data_annotations/provenance/recovery/manifest.py +179 -0
  18. data_annotations-2.5.0/src/data_annotations/provenance/recovery/matching.py +263 -0
  19. data_annotations-2.5.0/src/data_annotations/provenance/recovery/sources.py +507 -0
  20. data_annotations-2.5.0/src/data_annotations/provenance/recovery/types.py +32 -0
  21. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/writers.py +23 -0
  22. data_annotations-2.5.0/src/data_annotations/publish.py +546 -0
  23. data_annotations-2.3.0/src/data_annotations/cli_app/annotate.py +0 -483
  24. data_annotations-2.3.0/src/data_annotations/provenance/recovery.py +0 -926
  25. {data_annotations-2.3.0 → data_annotations-2.5.0}/LICENSE +0 -0
  26. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/__init__.py +0 -0
  27. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/_decorators.py +0 -0
  28. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/annotations/__init__.py +0 -0
  29. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/annotations/decorators.py +0 -0
  30. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/annotations/writers.py +0 -0
  31. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/__init__.py +0 -0
  32. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/description/__init__.py +0 -0
  33. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/description/decorators.py +0 -0
  34. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/description/models.py +0 -0
  35. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/description/writers.py +0 -0
  36. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/decorators.py +0 -0
  37. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/git.py +0 -0
  38. {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/runtime.py +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-annotations
3
- Version: 2.3.0
4
- Summary: Annotate generated data artifacts
3
+ Version: 2.5.0
4
+ Summary: Annotate data artifacts with provenance and descriptions
5
5
  Keywords: annotations,data,metadata,provenance,reproducibility
6
6
  Author: Rodrigo C. G. Pena
7
7
  Author-email: Rodrigo C. G. Pena <rodrigo.cerqueiragonzalezpena@unibas.ch>
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
18
18
  Classifier: Topic :: Scientific/Engineering
19
19
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
20
  Requires-Dist: pydantic>=2.13.1
21
+ Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
21
22
  Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
22
23
  Requires-Dist: typer>=0.16.0 ; extra == 'cli'
23
24
  Requires-Python: >=3.12
@@ -101,6 +102,8 @@ Every annotation document includes provenance with:
101
102
  - The script path relative to the Git repo root when it can be determined
102
103
  - Git commit, branch, dirty state, canonical repository remote, exact tags, and
103
104
  `git describe` output when available
105
+ - A source-code reference for recovery, derived from Git metadata when possible
106
+ or supplied explicitly for archives, individual files, and DOI/URI records
104
107
  - The current `SLURM_JOB_ID` when available
105
108
  - Structured snapshots for recorded local inputs, including file checksums,
106
109
  directory content digests, and upstream annotation sidecar references when
@@ -109,8 +112,8 @@ Every annotation document includes provenance with:
109
112
  You can also attach your own parameters, input file paths, and function names.
110
113
  Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
111
114
  such as `s3://...` or `https://...` are preserved as provided.
112
- Git tags and `git_describe` are human-friendly hints only; `git_sha` remains the
113
- source of truth for reproducibility, matching, and source checkout.
115
+ Git tags and `git_describe` are human-friendly hints only. For Git sources,
116
+ `git_sha` and `source_code.revision` identify the recoverable code state.
114
117
 
115
118
  ## Quick Start
116
119
 
@@ -536,8 +539,9 @@ per call.
536
539
  Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
537
540
  matches an annotation document. Use `analyze_provenance_chain(...)` when you also
538
541
  want to verify recorded inputs and recursively follow upstream annotation
539
- sidecars. Use `checkout_manifest_source(...)` to recover the recorded code state
540
- from Git metadata.
542
+ sidecars. Use `recover_manifest_source(...)` to recover the recorded source code
543
+ from Git metadata, a recorded source archive, or a recorded source file.
544
+ `checkout_manifest_source(...)` remains available as a compatibility alias.
541
545
 
542
546
  ```python
543
547
  from pathlib import Path
@@ -545,7 +549,7 @@ from pathlib import Path
545
549
  from data_annotations.provenance import (
546
550
  analyze_provenance_chain,
547
551
  artifact_matches_manifest,
548
- checkout_manifest_source,
552
+ recover_manifest_source,
549
553
  )
550
554
 
551
555
  annotation_path = Path("outputs/participants.csv.annotation.json")
@@ -554,7 +558,7 @@ artifact_path = Path("downloads/participants.csv")
554
558
  if artifact_matches_manifest(artifact_path, annotation_path):
555
559
  chain = analyze_provenance_chain(artifact_path)
556
560
  print(chain.status)
557
- recovered = checkout_manifest_source(annotation_path)
561
+ recovered = recover_manifest_source(annotation_path)
558
562
  print(recovered.checkout_path)
559
563
  print(recovered.script_path)
560
564
  ```
@@ -571,9 +575,9 @@ still attach provenance and description after the fact.
571
575
 
572
576
  Post-hoc descriptions can still be very useful, but the quality of post-hoc
573
577
  provenance depends on how exact the supplied answers are. In particular, fields
574
- such as the generating script, command, function, Git commit, repository path,
575
- Git tags, `git describe` output, inputs, and parameters are only as reliable as
576
- the information entered during annotation.
578
+ such as the generating script, command, function, source-code URI, Git commit,
579
+ repository path, Git tags, `git describe` output, inputs, and parameters are
580
+ only as reliable as the information entered during annotation.
577
581
 
578
582
  ## CLI Workflow
579
583
 
@@ -599,14 +603,115 @@ These commands prompt for missing details, write `*.annotation.json` or `data-an
599
603
  and optionally derive README sidecars. Post-hoc records are marked with
600
604
  `capture_mode="post_hoc"`.
601
605
 
606
+ For shell workflows, you can move the prompt answers into a YAML file and run
607
+ the command non-interactively:
608
+
609
+ ```bash
610
+ data-annotations annotate file path/to/participants.csv --answers participants.yaml
611
+ data-annotations annotate directory path/to/run-001 --answers run-001.yaml
612
+ data-annotations annotate answers check participants.yaml
613
+ ```
614
+
615
+ When `--answers` is provided, `--no-interactive` is the default. Use
616
+ `--interactive` if you want the YAML file to provide defaults and still prompt
617
+ for missing required values. If the YAML file includes `target`, the positional
618
+ target may be omitted; when both are provided, they must resolve to the same
619
+ path. Environment variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded
620
+ inside string values, and validation fails if a referenced variable is not set.
621
+ The `answers check` helper requires `target` so it can infer whether the answers
622
+ describe a file or a directory.
623
+
624
+ File answers can use top-level prompt-style keys:
625
+
626
+ ```yaml
627
+ target: path/to/participants.csv
628
+ title: Participant Cohort
629
+ summary: Participant-level cohort assignments.
630
+ kind: dataset
631
+
632
+ inputs:
633
+ - ${DATA_ROOT}/raw/participants.csv
634
+
635
+ params:
636
+ split: validation
637
+
638
+ provenance:
639
+ command: bash scripts/build_participants.sh
640
+ script: scripts/build_participants.sh
641
+ git_sha: deadbeef
642
+ source_code:
643
+ kind: archive
644
+ uri: https://doi.org/10.5281/zenodo.12345
645
+ download_uri: https://zenodo.org/records/12345/files/source.zip
646
+ path: scripts/build_participants.sh
647
+ sha256: 0000000000000000000000000000000000000000000000000000000000000000
648
+
649
+ fields:
650
+ - name: participant_id
651
+ summary: Stable participant identifier.
652
+ data_type: string
653
+ required: true
654
+ nullable: false
655
+
656
+ primary_key:
657
+ - participant_id
658
+ ```
659
+
660
+ Directory answers use an explicit inventory. Paths in `artifacts`,
661
+ `artifact_groups.paths`, and `child_bundles` are relative to the annotated
662
+ directory unless absolute:
663
+
664
+ ```yaml
665
+ target: path/to/run-001
666
+ title: Processing outputs
667
+ summary: Files produced by the shell processing workflow.
668
+
669
+ provenance:
670
+ command: bash process_from_instrument.sh
671
+ script: process_from_instrument.sh
672
+
673
+ artifacts:
674
+ - path: processed.csv
675
+ kind: dataset
676
+ title: Processed instrument output
677
+ summary: Normalized output from the processing script.
678
+
679
+ artifact_groups:
680
+ - title: Diagnostic plots
681
+ kind: plot
682
+ selector: plots/*.png
683
+ paths:
684
+ - plots/qc-1.png
685
+ - plots/qc-2.png
686
+
687
+ child_bundles:
688
+ - path: model
689
+ annotation_path: model/data-annotations.json
690
+ ```
691
+
692
+ Answers files may also use schema-style aliases such as `subject.path`,
693
+ `subject.kind`, `description.title`, `description.summary`,
694
+ `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
695
+ and `provenance.params`.
696
+
697
+ For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
698
+ `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
699
+ sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
700
+ the generating script inside the recovered source. DOI or landing-page-only
701
+ references can be recorded with `kind: uri`, but they are not directly
702
+ recoverable unless a direct archive or file `download_uri` is also recorded.
703
+
602
704
  When group selectors are provided, the CLI expands them to concrete member paths
603
705
  at annotation time. Grouped files are tracked in `subject.produced_files[]` but
604
706
  are skipped by the per-file prompt flow, so you do not have to answer the same
605
707
  questions for every matching file.
606
708
 
607
- For post-hoc provenance, use repeatable `--git-tag` and optional
608
- `--git-describe` when you know the original code state. These values are stored
609
- as human-readable hints; `--git-sha` remains the field used for recovery.
709
+ For post-hoc provenance, use `--source-kind`, `--source-uri`,
710
+ `--source-download-uri`, `--source-path`, `--source-revision`, and
711
+ `--source-sha256` when the generating code is recoverable from a Git remote,
712
+ source archive, source file, or reference URI. Use repeatable `--git-tag` and
713
+ optional `--git-describe` when you know the original Git state; these values are
714
+ stored as human-readable hints.
610
715
 
611
716
  For provenance inspection and source recovery:
612
717
 
@@ -617,6 +722,14 @@ data-annotations provenance chain path/to/artifact --full-paths
617
722
  data-annotations provenance checkout path/to/artifact
618
723
  ```
619
724
 
725
+ Command `checkout` recovers the recorded source code. For Git sources, it clones
726
+ the recorded remote and checks out the recorded revision. For archive and file
727
+ sources, it downloads or copies the recorded object, verifies `sha256` when
728
+ present, and resolves the generating script path when recorded. Reference-only
729
+ URI sources are preserved in the annotation but are not directly recoverable.
730
+ The command prompts before downloading source code and defaults to No; use
731
+ `--force` when running a trusted provenance checkout non-interactively.
732
+
620
733
  Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
621
734
  directories, prints a verification summary, and suggests the exact `checkout`
622
735
  command to run next when Git recovery metadata is available.
@@ -626,6 +739,27 @@ those inputs. Its default output shows a compact relative-path tree and lists
626
739
  stale, missing, or unverifiable nodes first; use `--full-paths` when you need
627
740
  absolute paths.
628
741
 
742
+ For publication workflows, create a sanitized copy of an annotated artifact tree:
743
+
744
+ ```bash
745
+ data-annotations publish path/to/run-001 path/to/publish-bundle
746
+ data-annotations publish path/to/run-001 path/to/publish-bundle \
747
+ --prefix /private/raw/study-a='$INPUT_ROOT'
748
+ data-annotations publish path/to/run-001 path/to/publish-metadata \
749
+ --annotations-only
750
+ data-annotations publish path/to/run-001 path/to/publish-bundle --dry-run
751
+ ```
752
+
753
+ Command `publish` recursively discovers file annotations (`*.annotation.json`) and
754
+ directory annotations (`data-annotations.json`), writes a mirrored publish bundle,
755
+ and regenerates README sidecars from sanitized annotation JSON. Paths under the
756
+ source directory are rewritten to `$ARTIFACT_ROOT/...`; additional `--prefix`
757
+ mappings rewrite other private path roots. Hostname, username, and SLURM job ID
758
+ are redacted by default. Git remote URLs are preserved unless
759
+ `--redact-git-remote` is provided. Strict mode is enabled by default and fails if
760
+ any local absolute path remains after sanitization; use `--no-strict` only after
761
+ reviewing `--dry-run` output.
762
+
629
763
  If `data-annotations provenance --help` does not list `chain`, your shell is
630
764
  resolving an older installed command. From a source checkout, use
631
765
  `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
@@ -656,6 +790,7 @@ uv run data-annotations annotate directory path/to/run-001
656
790
  uv run data-annotations provenance match path/to/participants.csv
657
791
  uv run data-annotations provenance chain path/to/participants.csv
658
792
  uv run data-annotations provenance checkout path/to/participants.csv
793
+ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
659
794
  ```
660
795
 
661
796
  ## API Overview
@@ -708,6 +843,8 @@ uv run data-annotations provenance checkout path/to/participants.csv
708
843
  - `ProducedFile`
709
844
  - `ChildBundle`
710
845
  - `InputArtifact`
846
+ - `SourceCodeKind`
847
+ - `SourceCodeReference`
711
848
  - `BaseProvenance`
712
849
  - `FileManifest`
713
850
  - `DirectoryManifest`
@@ -725,8 +862,16 @@ uv run data-annotations provenance checkout path/to/participants.csv
725
862
  - `analyze_provenance_chain(...)`
726
863
  - `provenance_chain_is_fresh(...)`
727
864
  - `artifact_matches_manifest(...)`
865
+ - `recover_manifest_source(...)`
728
866
  - `checkout_manifest_source(...)`
729
867
 
868
+ ### Publish Functions
869
+
870
+ - `discover_annotation_paths(...)`
871
+ - `sanitize_annotation_document(...)`
872
+ - `sanitize_annotation_path(...)`
873
+ - `publish_directory(...)`
874
+
730
875
  ## Examples
731
876
 
732
877
  Runnable examples live in `examples/` and mirror the README workflows.
@@ -741,6 +886,7 @@ uv run python examples/record_file_description.py
741
886
  uv run python examples/record_directory_description.py
742
887
  uv run python examples/annotate_file.py
743
888
  uv run python examples/annotate_directory.py
889
+ uv run python examples/annotate_file_answers_cli.py
744
890
  uv run python examples/write_file_manifest.py
745
891
  uv run python examples/write_directory_manifest.py
746
892
  uv run python examples/write_file_description.py
@@ -749,6 +895,8 @@ uv run python examples/provenance_chain.py
749
895
  uv run python examples/provenance_chain_cli.py
750
896
  uv run python examples/recover_provenance.py
751
897
  uv run python examples/recover_provenance_cli.py
898
+ uv run python examples/recover_archive_source.py
899
+ uv run python examples/publish_cli.py
752
900
  ```
753
901
 
754
902
  Each example writes its outputs to a fresh temporary directory and prints the
@@ -72,6 +72,8 @@ Every annotation document includes provenance with:
72
72
  - The script path relative to the Git repo root when it can be determined
73
73
  - Git commit, branch, dirty state, canonical repository remote, exact tags, and
74
74
  `git describe` output when available
75
+ - A source-code reference for recovery, derived from Git metadata when possible
76
+ or supplied explicitly for archives, individual files, and DOI/URI records
75
77
  - The current `SLURM_JOB_ID` when available
76
78
  - Structured snapshots for recorded local inputs, including file checksums,
77
79
  directory content digests, and upstream annotation sidecar references when
@@ -80,8 +82,8 @@ Every annotation document includes provenance with:
80
82
  You can also attach your own parameters, input file paths, and function names.
81
83
  Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
82
84
  such as `s3://...` or `https://...` are preserved as provided.
83
- Git tags and `git_describe` are human-friendly hints only; `git_sha` remains the
84
- source of truth for reproducibility, matching, and source checkout.
85
+ Git tags and `git_describe` are human-friendly hints only. For Git sources,
86
+ `git_sha` and `source_code.revision` identify the recoverable code state.
85
87
 
86
88
  ## Quick Start
87
89
 
@@ -507,8 +509,9 @@ per call.
507
509
  Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
508
510
  matches an annotation document. Use `analyze_provenance_chain(...)` when you also
509
511
  want to verify recorded inputs and recursively follow upstream annotation
510
- sidecars. Use `checkout_manifest_source(...)` to recover the recorded code state
511
- from Git metadata.
512
+ sidecars. Use `recover_manifest_source(...)` to recover the recorded source code
513
+ from Git metadata, a recorded source archive, or a recorded source file.
514
+ `checkout_manifest_source(...)` remains available as a compatibility alias.
512
515
 
513
516
  ```python
514
517
  from pathlib import Path
@@ -516,7 +519,7 @@ from pathlib import Path
516
519
  from data_annotations.provenance import (
517
520
  analyze_provenance_chain,
518
521
  artifact_matches_manifest,
519
- checkout_manifest_source,
522
+ recover_manifest_source,
520
523
  )
521
524
 
522
525
  annotation_path = Path("outputs/participants.csv.annotation.json")
@@ -525,7 +528,7 @@ artifact_path = Path("downloads/participants.csv")
525
528
  if artifact_matches_manifest(artifact_path, annotation_path):
526
529
  chain = analyze_provenance_chain(artifact_path)
527
530
  print(chain.status)
528
- recovered = checkout_manifest_source(annotation_path)
531
+ recovered = recover_manifest_source(annotation_path)
529
532
  print(recovered.checkout_path)
530
533
  print(recovered.script_path)
531
534
  ```
@@ -542,9 +545,9 @@ still attach provenance and description after the fact.
542
545
 
543
546
  Post-hoc descriptions can still be very useful, but the quality of post-hoc
544
547
  provenance depends on how exact the supplied answers are. In particular, fields
545
- such as the generating script, command, function, Git commit, repository path,
546
- Git tags, `git describe` output, inputs, and parameters are only as reliable as
547
- the information entered during annotation.
548
+ such as the generating script, command, function, source-code URI, Git commit,
549
+ repository path, Git tags, `git describe` output, inputs, and parameters are
550
+ only as reliable as the information entered during annotation.
548
551
 
549
552
  ## CLI Workflow
550
553
 
@@ -570,14 +573,115 @@ These commands prompt for missing details, write `*.annotation.json` or `data-an
570
573
  and optionally derive README sidecars. Post-hoc records are marked with
571
574
  `capture_mode="post_hoc"`.
572
575
 
576
+ For shell workflows, you can move the prompt answers into a YAML file and run
577
+ the command non-interactively:
578
+
579
+ ```bash
580
+ data-annotations annotate file path/to/participants.csv --answers participants.yaml
581
+ data-annotations annotate directory path/to/run-001 --answers run-001.yaml
582
+ data-annotations annotate answers check participants.yaml
583
+ ```
584
+
585
+ When `--answers` is provided, `--no-interactive` is the default. Use
586
+ `--interactive` if you want the YAML file to provide defaults and still prompt
587
+ for missing required values. If the YAML file includes `target`, the positional
588
+ target may be omitted; when both are provided, they must resolve to the same
589
+ path. Environment variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded
590
+ inside string values, and validation fails if a referenced variable is not set.
591
+ The `answers check` helper requires `target` so it can infer whether the answers
592
+ describe a file or a directory.
593
+
594
+ File answers can use top-level prompt-style keys:
595
+
596
+ ```yaml
597
+ target: path/to/participants.csv
598
+ title: Participant Cohort
599
+ summary: Participant-level cohort assignments.
600
+ kind: dataset
601
+
602
+ inputs:
603
+ - ${DATA_ROOT}/raw/participants.csv
604
+
605
+ params:
606
+ split: validation
607
+
608
+ provenance:
609
+ command: bash scripts/build_participants.sh
610
+ script: scripts/build_participants.sh
611
+ git_sha: deadbeef
612
+ source_code:
613
+ kind: archive
614
+ uri: https://doi.org/10.5281/zenodo.12345
615
+ download_uri: https://zenodo.org/records/12345/files/source.zip
616
+ path: scripts/build_participants.sh
617
+ sha256: 0000000000000000000000000000000000000000000000000000000000000000
618
+
619
+ fields:
620
+ - name: participant_id
621
+ summary: Stable participant identifier.
622
+ data_type: string
623
+ required: true
624
+ nullable: false
625
+
626
+ primary_key:
627
+ - participant_id
628
+ ```
629
+
630
+ Directory answers use an explicit inventory. Paths in `artifacts`,
631
+ `artifact_groups.paths`, and `child_bundles` are relative to the annotated
632
+ directory unless absolute:
633
+
634
+ ```yaml
635
+ target: path/to/run-001
636
+ title: Processing outputs
637
+ summary: Files produced by the shell processing workflow.
638
+
639
+ provenance:
640
+ command: bash process_from_instrument.sh
641
+ script: process_from_instrument.sh
642
+
643
+ artifacts:
644
+ - path: processed.csv
645
+ kind: dataset
646
+ title: Processed instrument output
647
+ summary: Normalized output from the processing script.
648
+
649
+ artifact_groups:
650
+ - title: Diagnostic plots
651
+ kind: plot
652
+ selector: plots/*.png
653
+ paths:
654
+ - plots/qc-1.png
655
+ - plots/qc-2.png
656
+
657
+ child_bundles:
658
+ - path: model
659
+ annotation_path: model/data-annotations.json
660
+ ```
661
+
662
+ Answers files may also use schema-style aliases such as `subject.path`,
663
+ `subject.kind`, `description.title`, `description.summary`,
664
+ `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
665
+ and `provenance.params`.
666
+
667
+ For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
668
+ `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
669
+ sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
670
+ the generating script inside the recovered source. DOI or landing-page-only
671
+ references can be recorded with `kind: uri`, but they are not directly
672
+ recoverable unless a direct archive or file `download_uri` is also recorded.
673
+
573
674
  When group selectors are provided, the CLI expands them to concrete member paths
574
675
  at annotation time. Grouped files are tracked in `subject.produced_files[]` but
575
676
  are skipped by the per-file prompt flow, so you do not have to answer the same
576
677
  questions for every matching file.
577
678
 
578
- For post-hoc provenance, use repeatable `--git-tag` and optional
579
- `--git-describe` when you know the original code state. These values are stored
580
- as human-readable hints; `--git-sha` remains the field used for recovery.
679
+ For post-hoc provenance, use `--source-kind`, `--source-uri`,
680
+ `--source-download-uri`, `--source-path`, `--source-revision`, and
681
+ `--source-sha256` when the generating code is recoverable from a Git remote,
682
+ source archive, source file, or reference URI. Use repeatable `--git-tag` and
683
+ optional `--git-describe` when you know the original Git state; these values are
684
+ stored as human-readable hints.
581
685
 
582
686
  For provenance inspection and source recovery:
583
687
 
@@ -588,6 +692,14 @@ data-annotations provenance chain path/to/artifact --full-paths
588
692
  data-annotations provenance checkout path/to/artifact
589
693
  ```
590
694
 
695
+ Command `checkout` recovers the recorded source code. For Git sources, it clones
696
+ the recorded remote and checks out the recorded revision. For archive and file
697
+ sources, it downloads or copies the recorded object, verifies `sha256` when
698
+ present, and resolves the generating script path when recorded. Reference-only
699
+ URI sources are preserved in the annotation but are not directly recoverable.
700
+ The command prompts before downloading source code and defaults to No; use
701
+ `--force` when running a trusted provenance checkout non-interactively.
702
+
591
703
  Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
592
704
  directories, prints a verification summary, and suggests the exact `checkout`
593
705
  command to run next when Git recovery metadata is available.
@@ -597,6 +709,27 @@ those inputs. Its default output shows a compact relative-path tree and lists
597
709
  stale, missing, or unverifiable nodes first; use `--full-paths` when you need
598
710
  absolute paths.
599
711
 
712
+ For publication workflows, create a sanitized copy of an annotated artifact tree:
713
+
714
+ ```bash
715
+ data-annotations publish path/to/run-001 path/to/publish-bundle
716
+ data-annotations publish path/to/run-001 path/to/publish-bundle \
717
+ --prefix /private/raw/study-a='$INPUT_ROOT'
718
+ data-annotations publish path/to/run-001 path/to/publish-metadata \
719
+ --annotations-only
720
+ data-annotations publish path/to/run-001 path/to/publish-bundle --dry-run
721
+ ```
722
+
723
+ Command `publish` recursively discovers file annotations (`*.annotation.json`) and
724
+ directory annotations (`data-annotations.json`), writes a mirrored publish bundle,
725
+ and regenerates README sidecars from sanitized annotation JSON. Paths under the
726
+ source directory are rewritten to `$ARTIFACT_ROOT/...`; additional `--prefix`
727
+ mappings rewrite other private path roots. Hostname, username, and SLURM job ID
728
+ are redacted by default. Git remote URLs are preserved unless
729
+ `--redact-git-remote` is provided. Strict mode is enabled by default and fails if
730
+ any local absolute path remains after sanitization; use `--no-strict` only after
731
+ reviewing `--dry-run` output.
732
+
600
733
  If `data-annotations provenance --help` does not list `chain`, your shell is
601
734
  resolving an older installed command. From a source checkout, use
602
735
  `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
@@ -627,6 +760,7 @@ uv run data-annotations annotate directory path/to/run-001
627
760
  uv run data-annotations provenance match path/to/participants.csv
628
761
  uv run data-annotations provenance chain path/to/participants.csv
629
762
  uv run data-annotations provenance checkout path/to/participants.csv
763
+ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
630
764
  ```
631
765
 
632
766
  ## API Overview
@@ -679,6 +813,8 @@ uv run data-annotations provenance checkout path/to/participants.csv
679
813
  - `ProducedFile`
680
814
  - `ChildBundle`
681
815
  - `InputArtifact`
816
+ - `SourceCodeKind`
817
+ - `SourceCodeReference`
682
818
  - `BaseProvenance`
683
819
  - `FileManifest`
684
820
  - `DirectoryManifest`
@@ -696,8 +832,16 @@ uv run data-annotations provenance checkout path/to/participants.csv
696
832
  - `analyze_provenance_chain(...)`
697
833
  - `provenance_chain_is_fresh(...)`
698
834
  - `artifact_matches_manifest(...)`
835
+ - `recover_manifest_source(...)`
699
836
  - `checkout_manifest_source(...)`
700
837
 
838
+ ### Publish Functions
839
+
840
+ - `discover_annotation_paths(...)`
841
+ - `sanitize_annotation_document(...)`
842
+ - `sanitize_annotation_path(...)`
843
+ - `publish_directory(...)`
844
+
701
845
  ## Examples
702
846
 
703
847
  Runnable examples live in `examples/` and mirror the README workflows.
@@ -712,6 +856,7 @@ uv run python examples/record_file_description.py
712
856
  uv run python examples/record_directory_description.py
713
857
  uv run python examples/annotate_file.py
714
858
  uv run python examples/annotate_directory.py
859
+ uv run python examples/annotate_file_answers_cli.py
715
860
  uv run python examples/write_file_manifest.py
716
861
  uv run python examples/write_directory_manifest.py
717
862
  uv run python examples/write_file_description.py
@@ -720,6 +865,8 @@ uv run python examples/provenance_chain.py
720
865
  uv run python examples/provenance_chain_cli.py
721
866
  uv run python examples/recover_provenance.py
722
867
  uv run python examples/recover_provenance_cli.py
868
+ uv run python examples/recover_archive_source.py
869
+ uv run python examples/publish_cli.py
723
870
  ```
724
871
 
725
872
  Each example writes its outputs to a fresh temporary directory and prints the
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "data-annotations"
3
- version = "2.3.0"
4
- description = "Annotate generated data artifacts"
3
+ version = "2.5.0"
4
+ description = "Annotate data artifacts with provenance and descriptions"
5
5
  readme = "README.md"
6
6
  authors = [
7
7
  { name = "Rodrigo C. G. Pena", email = "rodrigo.cerqueiragonzalezpena@unibas.ch" },
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
30
30
  Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
31
31
 
32
32
  [project.optional-dependencies]
33
- cli = ["questionary>=2.1.1", "typer>=0.16.0"]
33
+ cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
34
34
 
35
35
  [project.scripts]
36
36
  data-annotations = "data_annotations.cli:main"
@@ -22,14 +22,14 @@ class DirectoryArtifactSubject(BaseModel):
22
22
 
23
23
 
24
24
  class FileAnnotationDocument(BaseModel):
25
- annotation_version: Literal["5"] = "5"
25
+ annotation_version: Literal["6"] = "6"
26
26
  subject: FileArtifactSubject
27
27
  provenance: BaseProvenance
28
28
  description: FileDescription
29
29
 
30
30
 
31
31
  class DirectoryAnnotationDocument(BaseModel):
32
- annotation_version: Literal["5"] = "5"
32
+ annotation_version: Literal["6"] = "6"
33
33
  subject: DirectoryArtifactSubject
34
34
  provenance: BaseProvenance
35
35
  description: DirectoryDescription
@@ -1,7 +1,7 @@
1
1
  from typing import Any
2
2
 
3
3
  _CLI_IMPORT_ERROR: ModuleNotFoundError | None = None
4
- _CLI_OPTIONAL_DEPENDENCIES = {"questionary", "typer"}
4
+ _CLI_OPTIONAL_DEPENDENCIES = {"questionary", "typer", "yaml"}
5
5
  app: Any = None
6
6
 
7
7
  try:
@@ -9,6 +9,7 @@ try:
9
9
 
10
10
  from data_annotations.cli_app.annotate import annotate_app
11
11
  from data_annotations.cli_app.provenance_commands import provenance_app
12
+ from data_annotations.cli_app.publish import publish_command
12
13
  except ModuleNotFoundError as exc:
13
14
  if exc.name not in _CLI_OPTIONAL_DEPENDENCIES:
14
15
  raise
@@ -17,6 +18,7 @@ else:
17
18
  app = typer.Typer(no_args_is_help=True)
18
19
  app.add_typer(annotate_app, name="annotate")
19
20
  app.add_typer(provenance_app, name="provenance")
21
+ app.command("publish")(publish_command)
20
22
 
21
23
 
22
24
  def main() -> None: