data-annotations 2.4.0__tar.gz → 2.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {data_annotations-2.4.0 → data_annotations-2.6.0}/PKG-INFO +152 -41
  2. {data_annotations-2.4.0 → data_annotations-2.6.0}/README.md +150 -39
  3. {data_annotations-2.4.0 → data_annotations-2.6.0}/pyproject.toml +2 -2
  4. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/_decorators.py +39 -4
  5. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/annotations/decorators.py +18 -1
  6. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/annotations/models.py +2 -2
  7. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/annotations/writers.py +44 -1
  8. data_annotations-2.6.0/src/data_annotations/cli_app/annotate/__init__.py +524 -0
  9. data_annotations-2.4.0/src/data_annotations/cli_app/annotate.py → data_annotations-2.6.0/src/data_annotations/cli_app/annotate/helpers.py +121 -439
  10. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/answers.py +17 -2
  11. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/common.py +106 -4
  12. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/provenance_commands.py +43 -17
  13. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/publish.py +7 -1
  14. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/__init__.py +10 -0
  15. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/decorators.py +17 -1
  16. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/models.py +20 -2
  17. data_annotations-2.6.0/src/data_annotations/provenance/recovery/__init__.py +102 -0
  18. data_annotations-2.6.0/src/data_annotations/provenance/recovery/chain.py +361 -0
  19. data_annotations-2.6.0/src/data_annotations/provenance/recovery/manifest.py +179 -0
  20. data_annotations-2.6.0/src/data_annotations/provenance/recovery/matching.py +324 -0
  21. data_annotations-2.6.0/src/data_annotations/provenance/recovery/sources.py +507 -0
  22. data_annotations-2.6.0/src/data_annotations/provenance/recovery/types.py +32 -0
  23. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/writers.py +223 -18
  24. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/publish.py +14 -0
  25. data_annotations-2.4.0/src/data_annotations/provenance/recovery.py +0 -926
  26. {data_annotations-2.4.0 → data_annotations-2.6.0}/LICENSE +0 -0
  27. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/__init__.py +0 -0
  28. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/annotations/__init__.py +0 -0
  29. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli.py +0 -0
  30. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/__init__.py +0 -0
  31. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/prompts.py +0 -0
  32. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/description/__init__.py +0 -0
  33. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/description/decorators.py +0 -0
  34. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/description/models.py +0 -0
  35. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/description/writers.py +0 -0
  36. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/git.py +0 -0
  37. {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/runtime.py +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-annotations
3
- Version: 2.4.0
4
- Summary: Annotate generated data artifacts
3
+ Version: 2.6.0
4
+ Summary: Annotate data artifacts with provenance and descriptions
5
5
  Keywords: annotations,data,metadata,provenance,reproducibility
6
6
  Author: Rodrigo C. G. Pena
7
7
  Author-email: Rodrigo C. G. Pena <rodrigo.cerqueiragonzalezpena@unibas.ch>
@@ -102,16 +102,23 @@ Every annotation document includes provenance with:
102
102
  - The script path relative to the Git repo root when it can be determined
103
103
  - Git commit, branch, dirty state, canonical repository remote, exact tags, and
104
104
  `git describe` output when available
105
+ - A source-code reference for recovery, derived from Git metadata when possible
106
+ or supplied explicitly for archives, individual files, and DOI/URI records
105
107
  - The current `SLURM_JOB_ID` when available
106
108
  - Structured snapshots for recorded local inputs, including file checksums,
107
109
  directory content digests, and upstream annotation sidecar references when
108
110
  present
109
111
 
112
+ Local file hashing defaults to checksum policy `auto`: existing files are hashed
113
+ only when their size is at most `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
114
+ their `sha256` or directory `content_digest` is left unset unless you provide a
115
+ precomputed checksum yourself.
116
+
110
117
  You can also attach your own parameters, input file paths, and function names.
111
118
  Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
112
119
  such as `s3://...` or `https://...` are preserved as provided.
113
- Git tags and `git_describe` are human-friendly hints only; `git_sha` remains the
114
- source of truth for reproducibility, matching, and source checkout.
120
+ Git tags and `git_describe` are human-friendly hints only. For Git sources,
121
+ `git_sha` and `source_code.revision` identify the recoverable code state.
115
122
 
116
123
  ## Quick Start
117
124
 
@@ -500,6 +507,75 @@ README.
500
507
  If you want the direct writer approach instead, use `write_file_manifest(...)` and
501
508
  `write_directory_manifest(...)` (see `examples/`).
502
509
 
510
+ ## Checksum Policy
511
+
512
+ All provenance and annotation entry points that hash local files support the same
513
+ policy controls:
514
+
515
+ - `checksum_policy="auto"`: hash existing local files only when they are at or
516
+ below `max_checksum_bytes`. This is the default, and
517
+ `max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
518
+ - `checksum_policy="always"`: hash existing local files regardless of size.
519
+ - `checksum_policy="never"`: never hash local files automatically. Checksums are
520
+ recorded only when you supply them explicitly.
521
+
522
+ When a checksum is skipped, JSON sidecars keep the same schema and simply store
523
+ `sha256: null`. Directory `content_digest` is also left unset when any tracked
524
+ member file lacks a checksum.
525
+
526
+ You can change the policy from Python:
527
+
528
+ ```python
529
+ from data_annotations.annotations import annotate_file
530
+ from data_annotations.provenance import write_file_manifest
531
+
532
+ write_file_manifest(
533
+ "outputs/summary.txt",
534
+ checksum_policy="always",
535
+ )
536
+
537
+ annotate_file(
538
+ "outputs/summary.txt",
539
+ title="Run Summary",
540
+ summary="Post-hoc summary.",
541
+ artifact_sha256="precomputed-sha256",
542
+ checksum_policy="never",
543
+ )
544
+ ```
545
+
546
+ You can also inject precomputed checksums directly:
547
+
548
+ - File APIs: pass `artifact_sha256=...`.
549
+ - File or directory APIs: pass `checksum_overrides={path: sha256}`. For
550
+ directory outputs, keys can be relative to the output directory or absolute
551
+ paths.
552
+ - Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
553
+ `record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
554
+ same checksum-policy arguments.
555
+
556
+ From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
557
+ repeatable `--checksum PATH=SHA256`:
558
+
559
+ ```bash
560
+ data-annotations annotate file path/to/summary.txt \
561
+ --title "Run Summary" \
562
+ --summary "Post-hoc summary." \
563
+ --kind report \
564
+ --checksum-policy never \
565
+ --sha256 0123456789abcdef...
566
+
567
+ data-annotations annotate directory path/to/run-001 \
568
+ --title "Processing outputs" \
569
+ --summary "Directory-level outputs." \
570
+ --checksum-policy never \
571
+ --checksum processed.csv=0123456789abcdef...
572
+
573
+ data-annotations provenance chain path/to/run-001 \
574
+ --checksum-policy always
575
+ ```
576
+
577
+ For a complete runnable workflow, see `examples/checksum_policy.py`.
578
+
503
579
  ## Description Layer
504
580
 
505
581
  The `data_annotations.description` sub-package provides the structured description
@@ -537,8 +613,9 @@ per call.
537
613
  Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
538
614
  matches an annotation document. Use `analyze_provenance_chain(...)` when you also
539
615
  want to verify recorded inputs and recursively follow upstream annotation
540
- sidecars. Use `checkout_manifest_source(...)` to recover the recorded code state
541
- from Git metadata.
616
+ sidecars. Use `recover_manifest_source(...)` to recover the recorded source code
617
+ from Git metadata, a recorded source archive, or a recorded source file.
618
+ `checkout_manifest_source(...)` remains available as a compatibility alias.
542
619
 
543
620
  ```python
544
621
  from pathlib import Path
@@ -546,7 +623,7 @@ from pathlib import Path
546
623
  from data_annotations.provenance import (
547
624
  analyze_provenance_chain,
548
625
  artifact_matches_manifest,
549
- checkout_manifest_source,
626
+ recover_manifest_source,
550
627
  )
551
628
 
552
629
  annotation_path = Path("outputs/participants.csv.annotation.json")
@@ -555,7 +632,7 @@ artifact_path = Path("downloads/participants.csv")
555
632
  if artifact_matches_manifest(artifact_path, annotation_path):
556
633
  chain = analyze_provenance_chain(artifact_path)
557
634
  print(chain.status)
558
- recovered = checkout_manifest_source(annotation_path)
635
+ recovered = recover_manifest_source(annotation_path)
559
636
  print(recovered.checkout_path)
560
637
  print(recovered.script_path)
561
638
  ```
@@ -572,9 +649,9 @@ still attach provenance and description after the fact.
572
649
 
573
650
  Post-hoc descriptions can still be very useful, but the quality of post-hoc
574
651
  provenance depends on how exact the supplied answers are. In particular, fields
575
- such as the generating script, command, function, Git commit, repository path,
576
- Git tags, `git describe` output, inputs, and parameters are only as reliable as
577
- the information entered during annotation.
652
+ such as the generating script, command, function, source-code URI, Git commit,
653
+ repository path, Git tags, `git describe` output, inputs, and parameters are
654
+ only as reliable as the information entered during annotation.
578
655
 
579
656
  ## CLI Workflow
580
657
 
@@ -625,27 +702,34 @@ target: path/to/participants.csv
625
702
  title: Participant Cohort
626
703
  summary: Participant-level cohort assignments.
627
704
  kind: dataset
705
+ sha256: 0123456789abcdef...
628
706
 
629
707
  inputs:
630
- - ${DATA_ROOT}/raw/participants.csv
708
+ - ${DATA_ROOT}/raw/participants.csv
631
709
 
632
710
  params:
633
- split: validation
711
+ split: validation
634
712
 
635
713
  provenance:
636
- command: bash scripts/build_participants.sh
637
- script: scripts/build_participants.sh
638
- git_sha: deadbeef
714
+ command: bash scripts/build_participants.sh
715
+ script: scripts/build_participants.sh
716
+ git_sha: deadbeef
717
+ source_code:
718
+ kind: archive
719
+ uri: https://doi.org/10.5281/zenodo.12345
720
+ download_uri: https://zenodo.org/records/12345/files/source.zip
721
+ path: scripts/build_participants.sh
722
+ sha256: 0000000000000000000000000000000000000000000000000000000000000000
639
723
 
640
724
  fields:
641
- - name: participant_id
642
- summary: Stable participant identifier.
643
- data_type: string
644
- required: true
645
- nullable: false
725
+ - name: participant_id
726
+ summary: Stable participant identifier.
727
+ data_type: string
728
+ required: true
729
+ nullable: false
646
730
 
647
731
  primary_key:
648
- - participant_id
732
+ - participant_id
649
733
  ```
650
734
 
651
735
  Directory answers use an explicit inventory. Paths in `artifacts`,
@@ -658,26 +742,29 @@ title: Processing outputs
658
742
  summary: Files produced by the shell processing workflow.
659
743
 
660
744
  provenance:
661
- command: bash process_from_instrument.sh
662
- script: process_from_instrument.sh
745
+ command: bash process_from_instrument.sh
746
+ script: process_from_instrument.sh
747
+
748
+ checksums:
749
+ processed.csv: 0123456789abcdef...
663
750
 
664
751
  artifacts:
665
- - path: processed.csv
666
- kind: dataset
667
- title: Processed instrument output
668
- summary: Normalized output from the processing script.
752
+ - path: processed.csv
753
+ kind: dataset
754
+ title: Processed instrument output
755
+ summary: Normalized output from the processing script.
669
756
 
670
757
  artifact_groups:
671
- - title: Diagnostic plots
672
- kind: plot
673
- selector: plots/*.png
674
- paths:
675
- - plots/qc-1.png
676
- - plots/qc-2.png
758
+ - title: Diagnostic plots
759
+ kind: plot
760
+ selector: plots/*.png
761
+ paths:
762
+ - plots/qc-1.png
763
+ - plots/qc-2.png
677
764
 
678
765
  child_bundles:
679
- - path: model
680
- annotation_path: model/data-annotations.json
766
+ - path: model
767
+ annotation_path: model/data-annotations.json
681
768
  ```
682
769
 
683
770
  Answers files may also use schema-style aliases such as `subject.path`,
@@ -685,14 +772,24 @@ Answers files may also use schema-style aliases such as `subject.path`,
685
772
  `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
686
773
  and `provenance.params`.
687
774
 
775
+ For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
776
+ `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
777
+ sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
778
+ the generating script inside the recovered source. DOI or landing-page-only
779
+ references can be recorded with `kind: uri`, but they are not directly
780
+ recoverable unless a direct archive or file `download_uri` is also recorded.
781
+
688
782
  When group selectors are provided, the CLI expands them to concrete member paths
689
783
  at annotation time. Grouped files are tracked in `subject.produced_files[]` but
690
784
  are skipped by the per-file prompt flow, so you do not have to answer the same
691
785
  questions for every matching file.
692
786
 
693
- For post-hoc provenance, use repeatable `--git-tag` and optional
694
- `--git-describe` when you know the original code state. These values are stored
695
- as human-readable hints; `--git-sha` remains the field used for recovery.
787
+ For post-hoc provenance, use `--source-kind`, `--source-uri`,
788
+ `--source-download-uri`, `--source-path`, `--source-revision`, and
789
+ `--source-sha256` when the generating code is identified by a Git remote,
790
+ source archive, source file, or reference URI. Use repeatable `--git-tag` and
791
+ optional `--git-describe` when you know the original Git state; these values are
792
+ stored as human-readable hints.
696
793
 
697
794
  For provenance inspection and source recovery:
698
795
 
@@ -703,8 +800,12 @@ data-annotations provenance chain path/to/artifact --full-paths
703
800
  data-annotations provenance checkout path/to/artifact
704
801
  ```
705
802
 
706
- Command `checkout` downloads the recorded Git remote and checks out the recorded
707
- commit. It prompts before downloading source code and defaults to No; use
803
+ Command `checkout` recovers the recorded source code. For Git sources, it clones
804
+ the recorded remote and checks out the recorded revision. For archive and file
805
+ sources, it downloads or copies the recorded object, verifies `sha256` when
806
+ present, and resolves the generating script path when recorded. Reference-only
807
+ URI sources are preserved in the annotation but are not directly recoverable.
808
+ The command prompts before downloading source code and defaults to No; use
708
809
  `--force` when running trusted provenance checkout non-interactively.
709
810
 
710
811
  Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
@@ -742,6 +843,11 @@ resolving an older installed command. From a source checkout, use
742
843
  `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
743
844
  updated source before using the bare `data-annotations` command.
744
845
 
846
+ Both `match` and `chain` also accept `--checksum-policy` and
847
+ `--max-checksum-bytes`. Use `--checksum-policy always` when you want full
848
+ verification of large local files, and leave the default `auto` when you prefer
849
+ to avoid long checksum passes on very large artifacts.
850
+
745
851
  ### Run With `uvx`
746
852
 
747
853
  ```bash
@@ -820,6 +926,8 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
820
926
  - `ProducedFile`
821
927
  - `ChildBundle`
822
928
  - `InputArtifact`
929
+ - `SourceCodeKind`
930
+ - `SourceCodeReference`
823
931
  - `BaseProvenance`
824
932
  - `FileManifest`
825
933
  - `DirectoryManifest`
@@ -837,6 +945,7 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
837
945
  - `analyze_provenance_chain(...)`
838
946
  - `provenance_chain_is_fresh(...)`
839
947
  - `artifact_matches_manifest(...)`
948
+ - `recover_manifest_source(...)`
840
949
  - `checkout_manifest_source(...)`
841
950
 
842
951
  ### Publish Functions
@@ -860,6 +969,7 @@ uv run python examples/record_file_description.py
860
969
  uv run python examples/record_directory_description.py
861
970
  uv run python examples/annotate_file.py
862
971
  uv run python examples/annotate_directory.py
972
+ uv run python examples/checksum_policy.py
863
973
  uv run python examples/annotate_file_answers_cli.py
864
974
  uv run python examples/write_file_manifest.py
865
975
  uv run python examples/write_directory_manifest.py
@@ -869,6 +979,7 @@ uv run python examples/provenance_chain.py
869
979
  uv run python examples/provenance_chain_cli.py
870
980
  uv run python examples/recover_provenance.py
871
981
  uv run python examples/recover_provenance_cli.py
982
+ uv run python examples/recover_archive_source.py
872
983
  uv run python examples/publish_cli.py
873
984
  ```
874
985
 
@@ -72,16 +72,23 @@ Every annotation document includes provenance with:
72
72
  - The script path relative to the Git repo root when it can be determined
73
73
  - Git commit, branch, dirty state, canonical repository remote, exact tags, and
74
74
  `git describe` output when available
75
+ - A source-code reference for recovery, derived from Git metadata when possible
76
+ or supplied explicitly for archives, individual files, and DOI/URI records
75
77
  - The current `SLURM_JOB_ID` when available
76
78
  - Structured snapshots for recorded local inputs, including file checksums,
77
79
  directory content digests, and upstream annotation sidecar references when
78
80
  present
79
81
 
82
+ Local file hashing defaults to checksum policy `auto`: existing files are hashed
83
+ only when their size is at most `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
84
+ their `sha256` or directory `content_digest` is left unset unless you provide a
85
+ precomputed checksum yourself.
86
+
80
87
  You can also attach your own parameters, input file paths, and function names.
81
88
  Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
82
89
  such as `s3://...` or `https://...` are preserved as provided.
83
- Git tags and `git_describe` are human-friendly hints only; `git_sha` remains the
84
- source of truth for reproducibility, matching, and source checkout.
90
+ Git tags and `git_describe` are human-friendly hints only. For Git sources,
91
+ `git_sha` and `source_code.revision` identify the recoverable code state.
85
92
 
86
93
  ## Quick Start
87
94
 
@@ -470,6 +477,75 @@ README.
470
477
  If you want the direct writer approach instead, use `write_file_manifest(...)` and
471
478
  `write_directory_manifest(...)` (see `examples/`).
472
479
 
480
+ ## Checksum Policy
481
+
482
+ All provenance and annotation entry points that hash local files support the same
483
+ policy controls:
484
+
485
+ - `checksum_policy="auto"`: hash existing local files only when they are at or
486
+ below `max_checksum_bytes`. This is the default, and
487
+ `max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
488
+ - `checksum_policy="always"`: hash existing local files regardless of size.
489
+ - `checksum_policy="never"`: never hash local files automatically. Checksums are
490
+ recorded only when you supply them explicitly.
491
+
492
+ When a checksum is skipped, JSON sidecars keep the same schema and simply store
493
+ `sha256: null`. Directory `content_digest` is also left unset when any tracked
494
+ member file lacks a checksum.
495
+
496
+ You can change the policy from Python:
497
+
498
+ ```python
499
+ from data_annotations.annotations import annotate_file
500
+ from data_annotations.provenance import write_file_manifest
501
+
502
+ write_file_manifest(
503
+ "outputs/summary.txt",
504
+ checksum_policy="always",
505
+ )
506
+
507
+ annotate_file(
508
+ "outputs/summary.txt",
509
+ title="Run Summary",
510
+ summary="Post-hoc summary.",
511
+ artifact_sha256="precomputed-sha256",
512
+ checksum_policy="never",
513
+ )
514
+ ```
515
+
516
+ You can also inject precomputed checksums directly:
517
+
518
+ - File APIs: pass `artifact_sha256=...`.
519
+ - File or directory APIs: pass `checksum_overrides={path: sha256}`. For
520
+ directory outputs, keys can be relative to the output directory or absolute
521
+ paths.
522
+ - Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
523
+ `record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
524
+ same checksum-policy arguments.
525
+
526
+ From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
527
+ repeatable `--checksum PATH=SHA256`:
528
+
529
+ ```bash
530
+ data-annotations annotate file path/to/summary.txt \
531
+ --title "Run Summary" \
532
+ --summary "Post-hoc summary." \
533
+ --kind report \
534
+ --checksum-policy never \
535
+ --sha256 0123456789abcdef...
536
+
537
+ data-annotations annotate directory path/to/run-001 \
538
+ --title "Processing outputs" \
539
+ --summary "Directory-level outputs." \
540
+ --checksum-policy never \
541
+ --checksum processed.csv=0123456789abcdef...
542
+
543
+ data-annotations provenance chain path/to/run-001 \
544
+ --checksum-policy always
545
+ ```
546
+
547
+ For a complete runnable workflow, see `examples/checksum_policy.py`.
548
+
473
549
  ## Description Layer
474
550
 
475
551
  The `data_annotations.description` sub-package provides the structured description
@@ -507,8 +583,9 @@ per call.
507
583
  Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
508
584
  matches an annotation document. Use `analyze_provenance_chain(...)` when you also
509
585
  want to verify recorded inputs and recursively follow upstream annotation
510
- sidecars. Use `checkout_manifest_source(...)` to recover the recorded code state
511
- from Git metadata.
586
+ sidecars. Use `recover_manifest_source(...)` to recover the recorded source code
587
+ from Git metadata, a recorded source archive, or a recorded source file.
588
+ `checkout_manifest_source(...)` remains available as a compatibility alias.
512
589
 
513
590
  ```python
514
591
  from pathlib import Path
@@ -516,7 +593,7 @@ from pathlib import Path
516
593
  from data_annotations.provenance import (
517
594
  analyze_provenance_chain,
518
595
  artifact_matches_manifest,
519
- checkout_manifest_source,
596
+ recover_manifest_source,
520
597
  )
521
598
 
522
599
  annotation_path = Path("outputs/participants.csv.annotation.json")
@@ -525,7 +602,7 @@ artifact_path = Path("downloads/participants.csv")
525
602
  if artifact_matches_manifest(artifact_path, annotation_path):
526
603
  chain = analyze_provenance_chain(artifact_path)
527
604
  print(chain.status)
528
- recovered = checkout_manifest_source(annotation_path)
605
+ recovered = recover_manifest_source(annotation_path)
529
606
  print(recovered.checkout_path)
530
607
  print(recovered.script_path)
531
608
  ```
@@ -542,9 +619,9 @@ still attach provenance and description after the fact.
542
619
 
543
620
  Post-hoc descriptions can still be very useful, but the quality of post-hoc
544
621
  provenance depends on how exact the supplied answers are. In particular, fields
545
- such as the generating script, command, function, Git commit, repository path,
546
- Git tags, `git describe` output, inputs, and parameters are only as reliable as
547
- the information entered during annotation.
622
+ such as the generating script, command, function, source-code URI, Git commit,
623
+ repository path, Git tags, `git describe` output, inputs, and parameters are
624
+ only as reliable as the information entered during annotation.
548
625
 
549
626
  ## CLI Workflow
550
627
 
@@ -595,27 +672,34 @@ target: path/to/participants.csv
595
672
  title: Participant Cohort
596
673
  summary: Participant-level cohort assignments.
597
674
  kind: dataset
675
+ sha256: 0123456789abcdef...
598
676
 
599
677
  inputs:
600
- - ${DATA_ROOT}/raw/participants.csv
678
+ - ${DATA_ROOT}/raw/participants.csv
601
679
 
602
680
  params:
603
- split: validation
681
+ split: validation
604
682
 
605
683
  provenance:
606
- command: bash scripts/build_participants.sh
607
- script: scripts/build_participants.sh
608
- git_sha: deadbeef
684
+ command: bash scripts/build_participants.sh
685
+ script: scripts/build_participants.sh
686
+ git_sha: deadbeef
687
+ source_code:
688
+ kind: archive
689
+ uri: https://doi.org/10.5281/zenodo.12345
690
+ download_uri: https://zenodo.org/records/12345/files/source.zip
691
+ path: scripts/build_participants.sh
692
+ sha256: 0000000000000000000000000000000000000000000000000000000000000000
609
693
 
610
694
  fields:
611
- - name: participant_id
612
- summary: Stable participant identifier.
613
- data_type: string
614
- required: true
615
- nullable: false
695
+ - name: participant_id
696
+ summary: Stable participant identifier.
697
+ data_type: string
698
+ required: true
699
+ nullable: false
616
700
 
617
701
  primary_key:
618
- - participant_id
702
+ - participant_id
619
703
  ```
620
704
 
621
705
  Directory answers use an explicit inventory. Paths in `artifacts`,
@@ -628,26 +712,29 @@ title: Processing outputs
628
712
  summary: Files produced by the shell processing workflow.
629
713
 
630
714
  provenance:
631
- command: bash process_from_instrument.sh
632
- script: process_from_instrument.sh
715
+ command: bash process_from_instrument.sh
716
+ script: process_from_instrument.sh
717
+
718
+ checksums:
719
+ processed.csv: 0123456789abcdef...
633
720
 
634
721
  artifacts:
635
- - path: processed.csv
636
- kind: dataset
637
- title: Processed instrument output
638
- summary: Normalized output from the processing script.
722
+ - path: processed.csv
723
+ kind: dataset
724
+ title: Processed instrument output
725
+ summary: Normalized output from the processing script.
639
726
 
640
727
  artifact_groups:
641
- - title: Diagnostic plots
642
- kind: plot
643
- selector: plots/*.png
644
- paths:
645
- - plots/qc-1.png
646
- - plots/qc-2.png
728
+ - title: Diagnostic plots
729
+ kind: plot
730
+ selector: plots/*.png
731
+ paths:
732
+ - plots/qc-1.png
733
+ - plots/qc-2.png
647
734
 
648
735
  child_bundles:
649
- - path: model
650
- annotation_path: model/data-annotations.json
736
+ - path: model
737
+ annotation_path: model/data-annotations.json
651
738
  ```
652
739
 
653
740
  Answers files may also use schema-style aliases such as `subject.path`,
@@ -655,14 +742,24 @@ Answers files may also use schema-style aliases such as `subject.path`,
655
742
  `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
656
743
  and `provenance.params`.
657
744
 
745
+ For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
746
+ `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
747
+ sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
748
+ the generating script inside the recovered source. DOI or landing-page-only
749
+ references can be recorded with `kind: uri`, but they are not directly
750
+ recoverable unless a direct archive or file `download_uri` is also recorded.
751
+
658
752
  When group selectors are provided, the CLI expands them to concrete member paths
659
753
  at annotation time. Grouped files are tracked in `subject.produced_files[]` but
660
754
  are skipped by the per-file prompt flow, so you do not have to answer the same
661
755
  questions for every matching file.
662
756
 
663
- For post-hoc provenance, use repeatable `--git-tag` and optional
664
- `--git-describe` when you know the original code state. These values are stored
665
- as human-readable hints; `--git-sha` remains the field used for recovery.
757
+ For post-hoc provenance, use `--source-kind`, `--source-uri`,
758
+ `--source-download-uri`, `--source-path`, `--source-revision`, and
759
+ `--source-sha256` when the generating code is identified by a Git remote,
760
+ source archive, source file, or reference URI. Use repeatable `--git-tag` and
761
+ optional `--git-describe` when you know the original Git state; these values are
762
+ stored as human-readable hints.
666
763
 
667
764
  For provenance inspection and source recovery:
668
765
 
@@ -673,8 +770,12 @@ data-annotations provenance chain path/to/artifact --full-paths
673
770
  data-annotations provenance checkout path/to/artifact
674
771
  ```
675
772
 
676
- Command `checkout` downloads the recorded Git remote and checks out the recorded
677
- commit. It prompts before downloading source code and defaults to No; use
773
+ Command `checkout` recovers the recorded source code. For Git sources, it clones
774
+ the recorded remote and checks out the recorded revision. For archive and file
775
+ sources, it downloads or copies the recorded object, verifies `sha256` when
776
+ present, and resolves the generating script path when recorded. Reference-only
777
+ URI sources are preserved in the annotation but are not directly recoverable.
778
+ The command prompts before downloading source code and defaults to No; use
678
779
  `--force` when running trusted provenance checkout non-interactively.
679
780
 
680
781
  Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
@@ -712,6 +813,11 @@ resolving an older installed command. From a source checkout, use
712
813
  `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
713
814
  updated source before using the bare `data-annotations` command.
714
815
 
816
+ Both `match` and `chain` also accept `--checksum-policy` and
817
+ `--max-checksum-bytes`. Use `--checksum-policy always` when you want full
818
+ verification of large local files, and leave the default `auto` when you prefer
819
+ to avoid long checksum passes on very large artifacts.
820
+
715
821
  ### Run With `uvx`
716
822
 
717
823
  ```bash
@@ -790,6 +896,8 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
790
896
  - `ProducedFile`
791
897
  - `ChildBundle`
792
898
  - `InputArtifact`
899
+ - `SourceCodeKind`
900
+ - `SourceCodeReference`
793
901
  - `BaseProvenance`
794
902
  - `FileManifest`
795
903
  - `DirectoryManifest`
@@ -807,6 +915,7 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
807
915
  - `analyze_provenance_chain(...)`
808
916
  - `provenance_chain_is_fresh(...)`
809
917
  - `artifact_matches_manifest(...)`
918
+ - `recover_manifest_source(...)`
810
919
  - `checkout_manifest_source(...)`
811
920
 
812
921
  ### Publish Functions
@@ -830,6 +939,7 @@ uv run python examples/record_file_description.py
830
939
  uv run python examples/record_directory_description.py
831
940
  uv run python examples/annotate_file.py
832
941
  uv run python examples/annotate_directory.py
942
+ uv run python examples/checksum_policy.py
833
943
  uv run python examples/annotate_file_answers_cli.py
834
944
  uv run python examples/write_file_manifest.py
835
945
  uv run python examples/write_directory_manifest.py
@@ -839,6 +949,7 @@ uv run python examples/provenance_chain.py
839
949
  uv run python examples/provenance_chain_cli.py
840
950
  uv run python examples/recover_provenance.py
841
951
  uv run python examples/recover_provenance_cli.py
952
+ uv run python examples/recover_archive_source.py
842
953
  uv run python examples/publish_cli.py
843
954
  ```
844
955
 
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "data-annotations"
3
- version = "2.4.0"
4
- description = "Annotate generated data artifacts"
3
+ version = "2.6.0"
4
+ description = "Annotate data artifacts with provenance and descriptions"
5
5
  readme = "README.md"
6
6
  authors = [
7
7
  { name = "Rodrigo C. G. Pena", email = "rodrigo.cerqueiragonzalezpena@unibas.ch" },