data-annotations 2.4.0__tar.gz → 2.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_annotations-2.4.0 → data_annotations-2.6.0}/PKG-INFO +152 -41
- {data_annotations-2.4.0 → data_annotations-2.6.0}/README.md +150 -39
- {data_annotations-2.4.0 → data_annotations-2.6.0}/pyproject.toml +2 -2
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/_decorators.py +39 -4
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/annotations/decorators.py +18 -1
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/annotations/models.py +2 -2
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/annotations/writers.py +44 -1
- data_annotations-2.6.0/src/data_annotations/cli_app/annotate/__init__.py +524 -0
- data_annotations-2.4.0/src/data_annotations/cli_app/annotate.py → data_annotations-2.6.0/src/data_annotations/cli_app/annotate/helpers.py +121 -439
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/answers.py +17 -2
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/common.py +106 -4
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/provenance_commands.py +43 -17
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/publish.py +7 -1
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/__init__.py +10 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/decorators.py +17 -1
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/models.py +20 -2
- data_annotations-2.6.0/src/data_annotations/provenance/recovery/__init__.py +102 -0
- data_annotations-2.6.0/src/data_annotations/provenance/recovery/chain.py +361 -0
- data_annotations-2.6.0/src/data_annotations/provenance/recovery/manifest.py +179 -0
- data_annotations-2.6.0/src/data_annotations/provenance/recovery/matching.py +324 -0
- data_annotations-2.6.0/src/data_annotations/provenance/recovery/sources.py +507 -0
- data_annotations-2.6.0/src/data_annotations/provenance/recovery/types.py +32 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/writers.py +223 -18
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/publish.py +14 -0
- data_annotations-2.4.0/src/data_annotations/provenance/recovery.py +0 -926
- {data_annotations-2.4.0 → data_annotations-2.6.0}/LICENSE +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/__init__.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/annotations/__init__.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/__init__.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/prompts.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/description/__init__.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/description/decorators.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/description/models.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/description/writers.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/git.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.6.0}/src/data_annotations/provenance/runtime.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-annotations
|
|
3
|
-
Version: 2.4.0
|
|
4
|
-
Summary: Annotate
|
|
3
|
+
Version: 2.6.0
|
|
4
|
+
Summary: Annotate data artifacts with provenance and descriptions
|
|
5
5
|
Keywords: annotations,data,metadata,provenance,reproducibility
|
|
6
6
|
Author: Rodrigo C. G. Pena
|
|
7
7
|
Author-email: Rodrigo C. G. Pena <rodrigo.cerqueiragonzalezpena@unibas.ch>
|
|
@@ -102,16 +102,23 @@ Every annotation document includes provenance with:
|
|
|
102
102
|
- The script path relative to the Git repo root when it can be determined
|
|
103
103
|
- Git commit, branch, dirty state, canonical repository remote, exact tags, and
|
|
104
104
|
`git describe` output when available
|
|
105
|
+
- A source-code reference for recovery, derived from Git metadata when possible
|
|
106
|
+
or supplied explicitly for archives, individual files, and DOI/URI records
|
|
105
107
|
- The current `SLURM_JOB_ID` when available
|
|
106
108
|
- Structured snapshots for recorded local inputs, including file checksums,
|
|
107
109
|
directory content digests, and upstream annotation sidecar references when
|
|
108
110
|
present
|
|
109
111
|
|
|
112
|
+
Local file hashing defaults to checksum policy `auto`: existing files are hashed
|
|
113
|
+
only up to `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
|
|
114
|
+
their `sha256` or directory `content_digest` is left unset unless you provide a
|
|
115
|
+
precomputed checksum yourself.
|
|
116
|
+
|
|
110
117
|
You can also attach your own parameters, input file paths, and function names.
|
|
111
118
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
112
119
|
such as `s3://...` or `https://...` are preserved as provided.
|
|
113
|
-
Git tags and `git_describe` are human-friendly hints only
|
|
114
|
-
|
|
120
|
+
Git tags and `git_describe` are human-friendly hints only. For Git sources,
|
|
121
|
+
`git_sha` and `source_code.revision` identify the recoverable code state.
|
|
115
122
|
|
|
116
123
|
## Quick Start
|
|
117
124
|
|
|
@@ -500,6 +507,75 @@ README.
|
|
|
500
507
|
If you want the direct writer approach instead, use `write_file_manifest(...)` and
|
|
501
508
|
`write_directory_manifest(...)` (see `examples/`).
|
|
502
509
|
|
|
510
|
+
## Checksum Policy
|
|
511
|
+
|
|
512
|
+
All provenance and annotation entry points that hash local files support the same
|
|
513
|
+
policy controls:
|
|
514
|
+
|
|
515
|
+
- `checksum_policy="auto"`: hash existing local files only when they are at or
|
|
516
|
+
below `max_checksum_bytes`. This is the default, and
|
|
517
|
+
`max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
|
|
518
|
+
- `checksum_policy="always"`: hash existing local files regardless of size.
|
|
519
|
+
- `checksum_policy="never"`: never hash local files automatically. Checksums are
|
|
520
|
+
recorded only when you supply them explicitly.
|
|
521
|
+
|
|
522
|
+
When a checksum is skipped, JSON sidecars keep the same schema and simply store
|
|
523
|
+
`sha256: null`. Directory `content_digest` is also left unset when any tracked
|
|
524
|
+
member file lacks a checksum.
|
|
525
|
+
|
|
526
|
+
You can change the policy from Python:
|
|
527
|
+
|
|
528
|
+
```python
|
|
529
|
+
from data_annotations.annotations import annotate_file
|
|
530
|
+
from data_annotations.provenance import write_file_manifest
|
|
531
|
+
|
|
532
|
+
write_file_manifest(
|
|
533
|
+
"outputs/summary.txt",
|
|
534
|
+
checksum_policy="always",
|
|
535
|
+
)
|
|
536
|
+
|
|
537
|
+
annotate_file(
|
|
538
|
+
"outputs/summary.txt",
|
|
539
|
+
title="Run Summary",
|
|
540
|
+
summary="Post-hoc summary.",
|
|
541
|
+
artifact_sha256="precomputed-sha256",
|
|
542
|
+
checksum_policy="never",
|
|
543
|
+
)
|
|
544
|
+
```
|
|
545
|
+
|
|
546
|
+
You can also inject precomputed checksums directly:
|
|
547
|
+
|
|
548
|
+
- File APIs: pass `artifact_sha256=...`.
|
|
549
|
+
- File or directory APIs: pass `checksum_overrides={path: sha256}`. For
|
|
550
|
+
directory outputs, keys can be relative to the output directory or absolute
|
|
551
|
+
paths.
|
|
552
|
+
- Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
|
|
553
|
+
`record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
|
|
554
|
+
same checksum-policy arguments.
|
|
555
|
+
|
|
556
|
+
From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
|
|
557
|
+
repeatable `--checksum PATH=SHA256`:
|
|
558
|
+
|
|
559
|
+
```bash
|
|
560
|
+
data-annotations annotate file path/to/summary.txt \
|
|
561
|
+
--title "Run Summary" \
|
|
562
|
+
--summary "Post-hoc summary." \
|
|
563
|
+
--kind report \
|
|
564
|
+
--checksum-policy never \
|
|
565
|
+
--sha256 0123456789abcdef...
|
|
566
|
+
|
|
567
|
+
data-annotations annotate directory path/to/run-001 \
|
|
568
|
+
--title "Processing outputs" \
|
|
569
|
+
--summary "Directory-level outputs." \
|
|
570
|
+
--checksum-policy never \
|
|
571
|
+
--checksum processed.csv=0123456789abcdef...
|
|
572
|
+
|
|
573
|
+
data-annotations provenance chain path/to/run-001 \
|
|
574
|
+
--checksum-policy always
|
|
575
|
+
```
|
|
576
|
+
|
|
577
|
+
For a complete runnable workflow, see `examples/checksum_policy.py`.
|
|
578
|
+
|
|
503
579
|
## Description Layer
|
|
504
580
|
|
|
505
581
|
The `data_annotations.description` sub-package provides the structured description
|
|
@@ -537,8 +613,9 @@ per call.
|
|
|
537
613
|
Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
|
|
538
614
|
matches an annotation document. Use `analyze_provenance_chain(...)` when you also
|
|
539
615
|
want to verify recorded inputs and recursively follow upstream annotation
|
|
540
|
-
sidecars. Use `checkout_manifest_source(...)` to recover the recorded source code
|
|
541
|
-
from Git metadata.
|
|
616
|
+
sidecars. Use `recover_manifest_source(...)` to recover the recorded source code
|
|
617
|
+
from Git metadata, a recorded source archive, or a recorded source file.
|
|
618
|
+
`checkout_manifest_source(...)` remains available as a compatibility alias.
|
|
542
619
|
|
|
543
620
|
```python
|
|
544
621
|
from pathlib import Path
|
|
@@ -546,7 +623,7 @@ from pathlib import Path
|
|
|
546
623
|
from data_annotations.provenance import (
|
|
547
624
|
analyze_provenance_chain,
|
|
548
625
|
artifact_matches_manifest,
|
|
549
|
-
|
|
626
|
+
recover_manifest_source,
|
|
550
627
|
)
|
|
551
628
|
|
|
552
629
|
annotation_path = Path("outputs/participants.csv.annotation.json")
|
|
@@ -555,7 +632,7 @@ artifact_path = Path("downloads/participants.csv")
|
|
|
555
632
|
if artifact_matches_manifest(artifact_path, annotation_path):
|
|
556
633
|
chain = analyze_provenance_chain(artifact_path)
|
|
557
634
|
print(chain.status)
|
|
558
|
-
recovered = checkout_manifest_source(annotation_path)
|
|
635
|
+
recovered = recover_manifest_source(annotation_path)
|
|
559
636
|
print(recovered.checkout_path)
|
|
560
637
|
print(recovered.script_path)
|
|
561
638
|
```
|
|
@@ -572,9 +649,9 @@ still attach provenance and description after the fact.
|
|
|
572
649
|
|
|
573
650
|
Post-hoc descriptions can still be very useful, but the quality of post-hoc
|
|
574
651
|
provenance depends on how exact the supplied answers are. In particular, fields
|
|
575
|
-
such as the generating script, command, function,
|
|
576
|
-
Git tags, `git describe` output, inputs, and parameters are only as reliable as
|
|
577
|
-
the information entered during annotation.
|
|
652
|
+
such as the generating script, command, function, source-code URI, Git commit,
|
|
653
|
+
repository path, Git tags, `git describe` output, inputs, and parameters are
|
|
654
|
+
only as reliable as the information entered during annotation.
|
|
578
655
|
|
|
579
656
|
## CLI Workflow
|
|
580
657
|
|
|
@@ -625,27 +702,34 @@ target: path/to/participants.csv
|
|
|
625
702
|
title: Participant Cohort
|
|
626
703
|
summary: Participant-level cohort assignments.
|
|
627
704
|
kind: dataset
|
|
705
|
+
sha256: 0123456789abcdef...
|
|
628
706
|
|
|
629
707
|
inputs:
|
|
630
|
-
|
|
708
|
+
- ${DATA_ROOT}/raw/participants.csv
|
|
631
709
|
|
|
632
710
|
params:
|
|
633
|
-
|
|
711
|
+
split: validation
|
|
634
712
|
|
|
635
713
|
provenance:
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
714
|
+
command: bash scripts/build_participants.sh
|
|
715
|
+
script: scripts/build_participants.sh
|
|
716
|
+
git_sha: deadbeef
|
|
717
|
+
source_code:
|
|
718
|
+
kind: archive
|
|
719
|
+
uri: https://doi.org/10.5281/zenodo.12345
|
|
720
|
+
download_uri: https://zenodo.org/records/12345/files/source.zip
|
|
721
|
+
path: scripts/build_participants.sh
|
|
722
|
+
sha256: 0000000000000000000000000000000000000000000000000000000000000000
|
|
639
723
|
|
|
640
724
|
fields:
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
725
|
+
- name: participant_id
|
|
726
|
+
summary: Stable participant identifier.
|
|
727
|
+
data_type: string
|
|
728
|
+
required: true
|
|
729
|
+
nullable: false
|
|
646
730
|
|
|
647
731
|
primary_key:
|
|
648
|
-
|
|
732
|
+
- participant_id
|
|
649
733
|
```
|
|
650
734
|
|
|
651
735
|
Directory answers use an explicit inventory. Paths in `artifacts`,
|
|
@@ -658,26 +742,29 @@ title: Processing outputs
|
|
|
658
742
|
summary: Files produced by the shell processing workflow.
|
|
659
743
|
|
|
660
744
|
provenance:
|
|
661
|
-
|
|
662
|
-
|
|
745
|
+
command: bash process_from_instrument.sh
|
|
746
|
+
script: process_from_instrument.sh
|
|
747
|
+
|
|
748
|
+
checksums:
|
|
749
|
+
processed.csv: 0123456789abcdef...
|
|
663
750
|
|
|
664
751
|
artifacts:
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
752
|
+
- path: processed.csv
|
|
753
|
+
kind: dataset
|
|
754
|
+
title: Processed instrument output
|
|
755
|
+
summary: Normalized output from the processing script.
|
|
669
756
|
|
|
670
757
|
artifact_groups:
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
758
|
+
- title: Diagnostic plots
|
|
759
|
+
kind: plot
|
|
760
|
+
selector: plots/*.png
|
|
761
|
+
paths:
|
|
762
|
+
- plots/qc-1.png
|
|
763
|
+
- plots/qc-2.png
|
|
677
764
|
|
|
678
765
|
child_bundles:
|
|
679
|
-
|
|
680
|
-
|
|
766
|
+
- path: model
|
|
767
|
+
annotation_path: model/data-annotations.json
|
|
681
768
|
```
|
|
682
769
|
|
|
683
770
|
Answers files may also use schema-style aliases such as `subject.path`,
|
|
@@ -685,14 +772,24 @@ Answers files may also use schema-style aliases such as `subject.path`,
|
|
|
685
772
|
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
686
773
|
and `provenance.params`.
|
|
687
774
|
|
|
775
|
+
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
776
|
+
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
777
|
+
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
778
|
+
the generating script inside the recovered source. DOI or landing-page-only
|
|
779
|
+
references can be recorded with `kind: uri`, but they are not directly
|
|
780
|
+
recoverable unless a direct archive or file `download_uri` is also recorded.
|
|
781
|
+
|
|
688
782
|
When group selectors are provided, the CLI expands them to concrete member paths
|
|
689
783
|
at annotation time. Grouped files are tracked in `subject.produced_files[]` but
|
|
690
784
|
are skipped by the per-file prompt flow, so you do not have to answer the same
|
|
691
785
|
questions for every matching file.
|
|
692
786
|
|
|
693
|
-
For post-hoc provenance, use
|
|
694
|
-
`--
|
|
695
|
-
|
|
787
|
+
For post-hoc provenance, use `--source-kind`, `--source-uri`,
|
|
788
|
+
`--source-download-uri`, `--source-path`, `--source-revision`, and
|
|
789
|
+
`--source-sha256` when the generating code is recoverable from a Git remote,
|
|
790
|
+
source archive, source file, or reference URI. Use repeatable `--git-tag` and
|
|
791
|
+
optional `--git-describe` when you know the original Git state; these values are
|
|
792
|
+
stored as human-readable hints.
|
|
696
793
|
|
|
697
794
|
For provenance inspection and source recovery:
|
|
698
795
|
|
|
@@ -703,8 +800,12 @@ data-annotations provenance chain path/to/artifact --full-paths
|
|
|
703
800
|
data-annotations provenance checkout path/to/artifact
|
|
704
801
|
```
|
|
705
802
|
|
|
706
|
-
Command `checkout`
|
|
707
|
-
|
|
803
|
+
Command `checkout` recovers the recorded source code. For Git sources, it clones
|
|
804
|
+
the recorded remote and checks out the recorded revision. For archive and file
|
|
805
|
+
sources, it downloads or copies the recorded object, verifies `sha256` when
|
|
806
|
+
present, and resolves the generating script path when recorded. Reference-only
|
|
807
|
+
URI sources are preserved in the annotation but are not directly recoverable.
|
|
808
|
+
The command prompts before downloading source code and defaults to No; use
|
|
708
809
|
`--force` when running trusted provenance checkout non-interactively.
|
|
709
810
|
|
|
710
811
|
Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
|
|
@@ -742,6 +843,11 @@ resolving an older installed command. From a source checkout, use
|
|
|
742
843
|
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
743
844
|
updated source before using the bare `data-annotations` command.
|
|
744
845
|
|
|
846
|
+
Both `match` and `chain` also accept `--checksum-policy` and
|
|
847
|
+
`--max-checksum-bytes`. Use `--checksum-policy always` when you want full
|
|
848
|
+
verification of large local files, and leave the default `auto` when you prefer
|
|
849
|
+
to avoid long checksum passes on very large artifacts.
|
|
850
|
+
|
|
745
851
|
### Run With `uvx`
|
|
746
852
|
|
|
747
853
|
```bash
|
|
@@ -820,6 +926,8 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
|
820
926
|
- `ProducedFile`
|
|
821
927
|
- `ChildBundle`
|
|
822
928
|
- `InputArtifact`
|
|
929
|
+
- `SourceCodeKind`
|
|
930
|
+
- `SourceCodeReference`
|
|
823
931
|
- `BaseProvenance`
|
|
824
932
|
- `FileManifest`
|
|
825
933
|
- `DirectoryManifest`
|
|
@@ -837,6 +945,7 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
|
837
945
|
- `analyze_provenance_chain(...)`
|
|
838
946
|
- `provenance_chain_is_fresh(...)`
|
|
839
947
|
- `artifact_matches_manifest(...)`
|
|
948
|
+
- `recover_manifest_source(...)`
|
|
840
949
|
- `checkout_manifest_source(...)`
|
|
841
950
|
|
|
842
951
|
### Publish Functions
|
|
@@ -860,6 +969,7 @@ uv run python examples/record_file_description.py
|
|
|
860
969
|
uv run python examples/record_directory_description.py
|
|
861
970
|
uv run python examples/annotate_file.py
|
|
862
971
|
uv run python examples/annotate_directory.py
|
|
972
|
+
uv run python examples/checksum_policy.py
|
|
863
973
|
uv run python examples/annotate_file_answers_cli.py
|
|
864
974
|
uv run python examples/write_file_manifest.py
|
|
865
975
|
uv run python examples/write_directory_manifest.py
|
|
@@ -869,6 +979,7 @@ uv run python examples/provenance_chain.py
|
|
|
869
979
|
uv run python examples/provenance_chain_cli.py
|
|
870
980
|
uv run python examples/recover_provenance.py
|
|
871
981
|
uv run python examples/recover_provenance_cli.py
|
|
982
|
+
uv run python examples/recover_archive_source.py
|
|
872
983
|
uv run python examples/publish_cli.py
|
|
873
984
|
```
|
|
874
985
|
|
|
@@ -72,16 +72,23 @@ Every annotation document includes provenance with:
|
|
|
72
72
|
- The script path relative to the Git repo root when it can be determined
|
|
73
73
|
- Git commit, branch, dirty state, canonical repository remote, exact tags, and
|
|
74
74
|
`git describe` output when available
|
|
75
|
+
- A source-code reference for recovery, derived from Git metadata when possible
|
|
76
|
+
or supplied explicitly for archives, individual files, and DOI/URI records
|
|
75
77
|
- The current `SLURM_JOB_ID` when available
|
|
76
78
|
- Structured snapshots for recorded local inputs, including file checksums,
|
|
77
79
|
directory content digests, and upstream annotation sidecar references when
|
|
78
80
|
present
|
|
79
81
|
|
|
82
|
+
Local file hashing defaults to checksum policy `auto`: existing files are hashed
|
|
83
|
+
only up to `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
|
|
84
|
+
their `sha256` or directory `content_digest` is left unset unless you provide a
|
|
85
|
+
precomputed checksum yourself.
|
|
86
|
+
|
|
80
87
|
You can also attach your own parameters, input file paths, and function names.
|
|
81
88
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
82
89
|
such as `s3://...` or `https://...` are preserved as provided.
|
|
83
|
-
Git tags and `git_describe` are human-friendly hints only
|
|
84
|
-
|
|
90
|
+
Git tags and `git_describe` are human-friendly hints only. For Git sources,
|
|
91
|
+
`git_sha` and `source_code.revision` identify the recoverable code state.
|
|
85
92
|
|
|
86
93
|
## Quick Start
|
|
87
94
|
|
|
@@ -470,6 +477,75 @@ README.
|
|
|
470
477
|
If you want the direct writer approach instead, use `write_file_manifest(...)` and
|
|
471
478
|
`write_directory_manifest(...)` (see `examples/`).
|
|
472
479
|
|
|
480
|
+
## Checksum Policy
|
|
481
|
+
|
|
482
|
+
All provenance and annotation entry points that hash local files support the same
|
|
483
|
+
policy controls:
|
|
484
|
+
|
|
485
|
+
- `checksum_policy="auto"`: hash existing local files only when they are at or
|
|
486
|
+
below `max_checksum_bytes`. This is the default, and
|
|
487
|
+
`max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
|
|
488
|
+
- `checksum_policy="always"`: hash existing local files regardless of size.
|
|
489
|
+
- `checksum_policy="never"`: never hash local files automatically. Checksums are
|
|
490
|
+
recorded only when you supply them explicitly.
|
|
491
|
+
|
|
492
|
+
When a checksum is skipped, JSON sidecars keep the same schema and simply store
|
|
493
|
+
`sha256: null`. Directory `content_digest` is also left unset when any tracked
|
|
494
|
+
member file lacks a checksum.
|
|
495
|
+
|
|
496
|
+
You can change the policy from Python:
|
|
497
|
+
|
|
498
|
+
```python
|
|
499
|
+
from data_annotations.annotations import annotate_file
|
|
500
|
+
from data_annotations.provenance import write_file_manifest
|
|
501
|
+
|
|
502
|
+
write_file_manifest(
|
|
503
|
+
"outputs/summary.txt",
|
|
504
|
+
checksum_policy="always",
|
|
505
|
+
)
|
|
506
|
+
|
|
507
|
+
annotate_file(
|
|
508
|
+
"outputs/summary.txt",
|
|
509
|
+
title="Run Summary",
|
|
510
|
+
summary="Post-hoc summary.",
|
|
511
|
+
artifact_sha256="precomputed-sha256",
|
|
512
|
+
checksum_policy="never",
|
|
513
|
+
)
|
|
514
|
+
```
|
|
515
|
+
|
|
516
|
+
You can also inject precomputed checksums directly:
|
|
517
|
+
|
|
518
|
+
- File APIs: pass `artifact_sha256=...`.
|
|
519
|
+
- File or directory APIs: pass `checksum_overrides={path: sha256}`. For
|
|
520
|
+
directory outputs, keys can be relative to the output directory or absolute
|
|
521
|
+
paths.
|
|
522
|
+
- Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
|
|
523
|
+
`record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
|
|
524
|
+
same checksum-policy arguments.
|
|
525
|
+
|
|
526
|
+
From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
|
|
527
|
+
repeatable `--checksum PATH=SHA256`:
|
|
528
|
+
|
|
529
|
+
```bash
|
|
530
|
+
data-annotations annotate file path/to/summary.txt \
|
|
531
|
+
--title "Run Summary" \
|
|
532
|
+
--summary "Post-hoc summary." \
|
|
533
|
+
--kind report \
|
|
534
|
+
--checksum-policy never \
|
|
535
|
+
--sha256 0123456789abcdef...
|
|
536
|
+
|
|
537
|
+
data-annotations annotate directory path/to/run-001 \
|
|
538
|
+
--title "Processing outputs" \
|
|
539
|
+
--summary "Directory-level outputs." \
|
|
540
|
+
--checksum-policy never \
|
|
541
|
+
--checksum processed.csv=0123456789abcdef...
|
|
542
|
+
|
|
543
|
+
data-annotations provenance chain path/to/run-001 \
|
|
544
|
+
--checksum-policy always
|
|
545
|
+
```
|
|
546
|
+
|
|
547
|
+
For a complete runnable workflow, see `examples/checksum_policy.py`.
|
|
548
|
+
|
|
473
549
|
## Description Layer
|
|
474
550
|
|
|
475
551
|
The `data_annotations.description` sub-package provides the structured description
|
|
@@ -507,8 +583,9 @@ per call.
|
|
|
507
583
|
Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
|
|
508
584
|
matches an annotation document. Use `analyze_provenance_chain(...)` when you also
|
|
509
585
|
want to verify recorded inputs and recursively follow upstream annotation
|
|
510
|
-
sidecars. Use `checkout_manifest_source(...)` to recover the recorded source code
|
|
511
|
-
from Git metadata.
|
|
586
|
+
sidecars. Use `recover_manifest_source(...)` to recover the recorded source code
|
|
587
|
+
from Git metadata, a recorded source archive, or a recorded source file.
|
|
588
|
+
`checkout_manifest_source(...)` remains available as a compatibility alias.
|
|
512
589
|
|
|
513
590
|
```python
|
|
514
591
|
from pathlib import Path
|
|
@@ -516,7 +593,7 @@ from pathlib import Path
|
|
|
516
593
|
from data_annotations.provenance import (
|
|
517
594
|
analyze_provenance_chain,
|
|
518
595
|
artifact_matches_manifest,
|
|
519
|
-
|
|
596
|
+
recover_manifest_source,
|
|
520
597
|
)
|
|
521
598
|
|
|
522
599
|
annotation_path = Path("outputs/participants.csv.annotation.json")
|
|
@@ -525,7 +602,7 @@ artifact_path = Path("downloads/participants.csv")
|
|
|
525
602
|
if artifact_matches_manifest(artifact_path, annotation_path):
|
|
526
603
|
chain = analyze_provenance_chain(artifact_path)
|
|
527
604
|
print(chain.status)
|
|
528
|
-
recovered = checkout_manifest_source(annotation_path)
|
|
605
|
+
recovered = recover_manifest_source(annotation_path)
|
|
529
606
|
print(recovered.checkout_path)
|
|
530
607
|
print(recovered.script_path)
|
|
531
608
|
```
|
|
@@ -542,9 +619,9 @@ still attach provenance and description after the fact.
|
|
|
542
619
|
|
|
543
620
|
Post-hoc descriptions can still be very useful, but the quality of post-hoc
|
|
544
621
|
provenance depends on how exact the supplied answers are. In particular, fields
|
|
545
|
-
such as the generating script, command, function,
|
|
546
|
-
Git tags, `git describe` output, inputs, and parameters are only as reliable as
|
|
547
|
-
the information entered during annotation.
|
|
622
|
+
such as the generating script, command, function, source-code URI, Git commit,
|
|
623
|
+
repository path, Git tags, `git describe` output, inputs, and parameters are
|
|
624
|
+
only as reliable as the information entered during annotation.
|
|
548
625
|
|
|
549
626
|
## CLI Workflow
|
|
550
627
|
|
|
@@ -595,27 +672,34 @@ target: path/to/participants.csv
|
|
|
595
672
|
title: Participant Cohort
|
|
596
673
|
summary: Participant-level cohort assignments.
|
|
597
674
|
kind: dataset
|
|
675
|
+
sha256: 0123456789abcdef...
|
|
598
676
|
|
|
599
677
|
inputs:
|
|
600
|
-
|
|
678
|
+
- ${DATA_ROOT}/raw/participants.csv
|
|
601
679
|
|
|
602
680
|
params:
|
|
603
|
-
|
|
681
|
+
split: validation
|
|
604
682
|
|
|
605
683
|
provenance:
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
684
|
+
command: bash scripts/build_participants.sh
|
|
685
|
+
script: scripts/build_participants.sh
|
|
686
|
+
git_sha: deadbeef
|
|
687
|
+
source_code:
|
|
688
|
+
kind: archive
|
|
689
|
+
uri: https://doi.org/10.5281/zenodo.12345
|
|
690
|
+
download_uri: https://zenodo.org/records/12345/files/source.zip
|
|
691
|
+
path: scripts/build_participants.sh
|
|
692
|
+
sha256: 0000000000000000000000000000000000000000000000000000000000000000
|
|
609
693
|
|
|
610
694
|
fields:
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
695
|
+
- name: participant_id
|
|
696
|
+
summary: Stable participant identifier.
|
|
697
|
+
data_type: string
|
|
698
|
+
required: true
|
|
699
|
+
nullable: false
|
|
616
700
|
|
|
617
701
|
primary_key:
|
|
618
|
-
|
|
702
|
+
- participant_id
|
|
619
703
|
```
|
|
620
704
|
|
|
621
705
|
Directory answers use an explicit inventory. Paths in `artifacts`,
|
|
@@ -628,26 +712,29 @@ title: Processing outputs
|
|
|
628
712
|
summary: Files produced by the shell processing workflow.
|
|
629
713
|
|
|
630
714
|
provenance:
|
|
631
|
-
|
|
632
|
-
|
|
715
|
+
command: bash process_from_instrument.sh
|
|
716
|
+
script: process_from_instrument.sh
|
|
717
|
+
|
|
718
|
+
checksums:
|
|
719
|
+
processed.csv: 0123456789abcdef...
|
|
633
720
|
|
|
634
721
|
artifacts:
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
722
|
+
- path: processed.csv
|
|
723
|
+
kind: dataset
|
|
724
|
+
title: Processed instrument output
|
|
725
|
+
summary: Normalized output from the processing script.
|
|
639
726
|
|
|
640
727
|
artifact_groups:
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
728
|
+
- title: Diagnostic plots
|
|
729
|
+
kind: plot
|
|
730
|
+
selector: plots/*.png
|
|
731
|
+
paths:
|
|
732
|
+
- plots/qc-1.png
|
|
733
|
+
- plots/qc-2.png
|
|
647
734
|
|
|
648
735
|
child_bundles:
|
|
649
|
-
|
|
650
|
-
|
|
736
|
+
- path: model
|
|
737
|
+
annotation_path: model/data-annotations.json
|
|
651
738
|
```
|
|
652
739
|
|
|
653
740
|
Answers files may also use schema-style aliases such as `subject.path`,
|
|
@@ -655,14 +742,24 @@ Answers files may also use schema-style aliases such as `subject.path`,
|
|
|
655
742
|
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
656
743
|
and `provenance.params`.
|
|
657
744
|
|
|
745
|
+
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
746
|
+
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
747
|
+
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
748
|
+
the generating script inside the recovered source. DOI or landing-page-only
|
|
749
|
+
references can be recorded with `kind: uri`, but they are not directly
|
|
750
|
+
recoverable unless a direct archive or file `download_uri` is also recorded.
|
|
751
|
+
|
|
658
752
|
When group selectors are provided, the CLI expands them to concrete member paths
|
|
659
753
|
at annotation time. Grouped files are tracked in `subject.produced_files[]` but
|
|
660
754
|
are skipped by the per-file prompt flow, so you do not have to answer the same
|
|
661
755
|
questions for every matching file.
|
|
662
756
|
|
|
663
|
-
For post-hoc provenance, use
|
|
664
|
-
`--
|
|
665
|
-
|
|
757
|
+
For post-hoc provenance, use `--source-kind`, `--source-uri`,
|
|
758
|
+
`--source-download-uri`, `--source-path`, `--source-revision`, and
|
|
759
|
+
`--source-sha256` when the generating code is recoverable from a Git remote,
|
|
760
|
+
source archive, source file, or reference URI. Use repeatable `--git-tag` and
|
|
761
|
+
optional `--git-describe` when you know the original Git state; these values are
|
|
762
|
+
stored as human-readable hints.
|
|
666
763
|
|
|
667
764
|
For provenance inspection and source recovery:
|
|
668
765
|
|
|
@@ -673,8 +770,12 @@ data-annotations provenance chain path/to/artifact --full-paths
|
|
|
673
770
|
data-annotations provenance checkout path/to/artifact
|
|
674
771
|
```
|
|
675
772
|
|
|
676
|
-
Command `checkout`
|
|
677
|
-
|
|
773
|
+
Command `checkout` recovers the recorded source code. For Git sources, it clones
|
|
774
|
+
the recorded remote and checks out the recorded revision. For archive and file
|
|
775
|
+
sources, it downloads or copies the recorded object, verifies `sha256` when
|
|
776
|
+
present, and resolves the generating script path when recorded. Reference-only
|
|
777
|
+
URI sources are preserved in the annotation but are not directly recoverable.
|
|
778
|
+
The command prompts before downloading source code and defaults to No; use
|
|
678
779
|
`--force` when running trusted provenance checkout non-interactively.
|
|
679
780
|
|
|
680
781
|
Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
|
|
@@ -712,6 +813,11 @@ resolving an older installed command. From a source checkout, use
|
|
|
712
813
|
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
713
814
|
updated source before using the bare `data-annotations` command.
|
|
714
815
|
|
|
816
|
+
Both `match` and `chain` also accept `--checksum-policy` and
|
|
817
|
+
`--max-checksum-bytes`. Use `--checksum-policy always` when you want full
|
|
818
|
+
verification of large local files, and leave the default `auto` when you prefer
|
|
819
|
+
to avoid long checksum passes on very large artifacts.
|
|
820
|
+
|
|
715
821
|
### Run With `uvx`
|
|
716
822
|
|
|
717
823
|
```bash
|
|
@@ -790,6 +896,8 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
|
790
896
|
- `ProducedFile`
|
|
791
897
|
- `ChildBundle`
|
|
792
898
|
- `InputArtifact`
|
|
899
|
+
- `SourceCodeKind`
|
|
900
|
+
- `SourceCodeReference`
|
|
793
901
|
- `BaseProvenance`
|
|
794
902
|
- `FileManifest`
|
|
795
903
|
- `DirectoryManifest`
|
|
@@ -807,6 +915,7 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
|
807
915
|
- `analyze_provenance_chain(...)`
|
|
808
916
|
- `provenance_chain_is_fresh(...)`
|
|
809
917
|
- `artifact_matches_manifest(...)`
|
|
918
|
+
- `recover_manifest_source(...)`
|
|
810
919
|
- `checkout_manifest_source(...)`
|
|
811
920
|
|
|
812
921
|
### Publish Functions
|
|
@@ -830,6 +939,7 @@ uv run python examples/record_file_description.py
|
|
|
830
939
|
uv run python examples/record_directory_description.py
|
|
831
940
|
uv run python examples/annotate_file.py
|
|
832
941
|
uv run python examples/annotate_directory.py
|
|
942
|
+
uv run python examples/checksum_policy.py
|
|
833
943
|
uv run python examples/annotate_file_answers_cli.py
|
|
834
944
|
uv run python examples/write_file_manifest.py
|
|
835
945
|
uv run python examples/write_directory_manifest.py
|
|
@@ -839,6 +949,7 @@ uv run python examples/provenance_chain.py
|
|
|
839
949
|
uv run python examples/provenance_chain_cli.py
|
|
840
950
|
uv run python examples/recover_provenance.py
|
|
841
951
|
uv run python examples/recover_provenance_cli.py
|
|
952
|
+
uv run python examples/recover_archive_source.py
|
|
842
953
|
uv run python examples/publish_cli.py
|
|
843
954
|
```
|
|
844
955
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data-annotations"
|
|
3
|
-
version = "2.4.0"
|
|
4
|
-
description = "Annotate
|
|
3
|
+
version = "2.6.0"
|
|
4
|
+
description = "Annotate data artifacts with provenance and descriptions"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
7
7
|
{ name = "Rodrigo C. G. Pena", email = "rodrigo.cerqueiragonzalezpena@unibas.ch" },
|