data-annotations 2.4.0__tar.gz → 2.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_annotations-2.4.0 → data_annotations-2.5.0}/PKG-INFO +68 -41
- {data_annotations-2.4.0 → data_annotations-2.5.0}/README.md +66 -39
- {data_annotations-2.4.0 → data_annotations-2.5.0}/pyproject.toml +2 -2
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/annotations/models.py +2 -2
- data_annotations-2.5.0/src/data_annotations/cli_app/annotate/__init__.py +492 -0
- data_annotations-2.4.0/src/data_annotations/cli_app/annotate.py → data_annotations-2.5.0/src/data_annotations/cli_app/annotate/helpers.py +105 -439
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/answers.py +3 -1
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/common.py +50 -4
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/provenance_commands.py +28 -16
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/publish.py +7 -1
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/provenance/__init__.py +6 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/provenance/models.py +19 -2
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/__init__.py +102 -0
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/chain.py +312 -0
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/manifest.py +179 -0
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/matching.py +263 -0
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/sources.py +507 -0
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/types.py +32 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/provenance/writers.py +23 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/publish.py +14 -0
- data_annotations-2.4.0/src/data_annotations/provenance/recovery.py +0 -926
- {data_annotations-2.4.0 → data_annotations-2.5.0}/LICENSE +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/__init__.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/_decorators.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/annotations/__init__.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/annotations/decorators.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/annotations/writers.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/cli.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/__init__.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/prompts.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/description/__init__.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/description/decorators.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/description/models.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/description/writers.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/provenance/decorators.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/provenance/git.py +0 -0
- {data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/provenance/runtime.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-annotations
|
|
3
|
-
Version: 2.4.0
|
|
4
|
-
Summary: Annotate
|
|
3
|
+
Version: 2.5.0
|
|
4
|
+
Summary: Annotate data artifacts with provenance and descriptions
|
|
5
5
|
Keywords: annotations,data,metadata,provenance,reproducibility
|
|
6
6
|
Author: Rodrigo C. G. Pena
|
|
7
7
|
Author-email: Rodrigo C. G. Pena <rodrigo.cerqueiragonzalezpena@unibas.ch>
|
|
@@ -102,6 +102,8 @@ Every annotation document includes provenance with:
|
|
|
102
102
|
- The script path relative to the Git repo root when it can be determined
|
|
103
103
|
- Git commit, branch, dirty state, canonical repository remote, exact tags, and
|
|
104
104
|
`git describe` output when available
|
|
105
|
+
- A source-code reference for recovery, derived from Git metadata when possible
|
|
106
|
+
or supplied explicitly for archives, individual files, and DOI/URI records
|
|
105
107
|
- The current `SLURM_JOB_ID` when available
|
|
106
108
|
- Structured snapshots for recorded local inputs, including file checksums,
|
|
107
109
|
directory content digests, and upstream annotation sidecar references when
|
|
@@ -110,8 +112,8 @@ Every annotation document includes provenance with:
|
|
|
110
112
|
You can also attach your own parameters, input file paths, and function names.
|
|
111
113
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
112
114
|
such as `s3://...` or `https://...` are preserved as provided.
|
|
113
|
-
Git tags and `git_describe` are human-friendly hints only
|
|
114
|
-
|
|
115
|
+
Git tags and `git_describe` are human-friendly hints only. For Git sources,
|
|
116
|
+
`git_sha` and `source_code.revision` identify the recoverable code state.
|
|
115
117
|
|
|
116
118
|
## Quick Start
|
|
117
119
|
|
|
@@ -537,8 +539,9 @@ per call.
|
|
|
537
539
|
Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
|
|
538
540
|
matches an annotation document. Use `analyze_provenance_chain(...)` when you also
|
|
539
541
|
want to verify recorded inputs and recursively follow upstream annotation
|
|
540
|
-
sidecars. Use `checkout_manifest_source(...)` to recover the recorded source code
|
|
541
|
-
from Git metadata.
|
|
542
|
+
sidecars. Use `recover_manifest_source(...)` to recover the recorded source code
|
|
543
|
+
from Git metadata, a recorded source archive, or a recorded source file.
|
|
544
|
+
`checkout_manifest_source(...)` remains available as a compatibility alias.
|
|
542
545
|
|
|
543
546
|
```python
|
|
544
547
|
from pathlib import Path
|
|
@@ -546,7 +549,7 @@ from pathlib import Path
|
|
|
546
549
|
from data_annotations.provenance import (
|
|
547
550
|
analyze_provenance_chain,
|
|
548
551
|
artifact_matches_manifest,
|
|
549
|
-
|
|
552
|
+
recover_manifest_source,
|
|
550
553
|
)
|
|
551
554
|
|
|
552
555
|
annotation_path = Path("outputs/participants.csv.annotation.json")
|
|
@@ -555,7 +558,7 @@ artifact_path = Path("downloads/participants.csv")
|
|
|
555
558
|
if artifact_matches_manifest(artifact_path, annotation_path):
|
|
556
559
|
chain = analyze_provenance_chain(artifact_path)
|
|
557
560
|
print(chain.status)
|
|
558
|
-
recovered = checkout_manifest_source(annotation_path)
|
|
561
|
+
recovered = recover_manifest_source(annotation_path)
|
|
559
562
|
print(recovered.checkout_path)
|
|
560
563
|
print(recovered.script_path)
|
|
561
564
|
```
|
|
@@ -572,9 +575,9 @@ still attach provenance and description after the fact.
|
|
|
572
575
|
|
|
573
576
|
Post-hoc descriptions can still be very useful, but the quality of post-hoc
|
|
574
577
|
provenance depends on how exact the supplied answers are. In particular, fields
|
|
575
|
-
such as the generating script, command, function,
|
|
576
|
-
Git tags, `git describe` output, inputs, and parameters are
|
|
577
|
-
the information entered during annotation.
|
|
578
|
+
such as the generating script, command, function, source-code URI, Git commit,
|
|
579
|
+
repository path, Git tags, `git describe` output, inputs, and parameters are
|
|
580
|
+
only as reliable as the information entered during annotation.
|
|
578
581
|
|
|
579
582
|
## CLI Workflow
|
|
580
583
|
|
|
@@ -627,25 +630,31 @@ summary: Participant-level cohort assignments.
|
|
|
627
630
|
kind: dataset
|
|
628
631
|
|
|
629
632
|
inputs:
|
|
630
|
-
|
|
633
|
+
- ${DATA_ROOT}/raw/participants.csv
|
|
631
634
|
|
|
632
635
|
params:
|
|
633
|
-
|
|
636
|
+
split: validation
|
|
634
637
|
|
|
635
638
|
provenance:
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
+
command: bash scripts/build_participants.sh
|
|
640
|
+
script: scripts/build_participants.sh
|
|
641
|
+
git_sha: deadbeef
|
|
642
|
+
source_code:
|
|
643
|
+
kind: archive
|
|
644
|
+
uri: https://doi.org/10.5281/zenodo.12345
|
|
645
|
+
download_uri: https://zenodo.org/records/12345/files/source.zip
|
|
646
|
+
path: scripts/build_participants.sh
|
|
647
|
+
sha256: 0000000000000000000000000000000000000000000000000000000000000000
|
|
639
648
|
|
|
640
649
|
fields:
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
650
|
+
- name: participant_id
|
|
651
|
+
summary: Stable participant identifier.
|
|
652
|
+
data_type: string
|
|
653
|
+
required: true
|
|
654
|
+
nullable: false
|
|
646
655
|
|
|
647
656
|
primary_key:
|
|
648
|
-
|
|
657
|
+
- participant_id
|
|
649
658
|
```
|
|
650
659
|
|
|
651
660
|
Directory answers use an explicit inventory. Paths in `artifacts`,
|
|
@@ -658,26 +667,26 @@ title: Processing outputs
|
|
|
658
667
|
summary: Files produced by the shell processing workflow.
|
|
659
668
|
|
|
660
669
|
provenance:
|
|
661
|
-
|
|
662
|
-
|
|
670
|
+
command: bash process_from_instrument.sh
|
|
671
|
+
script: process_from_instrument.sh
|
|
663
672
|
|
|
664
673
|
artifacts:
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
674
|
+
- path: processed.csv
|
|
675
|
+
kind: dataset
|
|
676
|
+
title: Processed instrument output
|
|
677
|
+
summary: Normalized output from the processing script.
|
|
669
678
|
|
|
670
679
|
artifact_groups:
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
680
|
+
- title: Diagnostic plots
|
|
681
|
+
kind: plot
|
|
682
|
+
selector: plots/*.png
|
|
683
|
+
paths:
|
|
684
|
+
- plots/qc-1.png
|
|
685
|
+
- plots/qc-2.png
|
|
677
686
|
|
|
678
687
|
child_bundles:
|
|
679
|
-
|
|
680
|
-
|
|
688
|
+
- path: model
|
|
689
|
+
annotation_path: model/data-annotations.json
|
|
681
690
|
```
|
|
682
691
|
|
|
683
692
|
Answers files may also use schema-style aliases such as `subject.path`,
|
|
@@ -685,14 +694,24 @@ Answers files may also use schema-style aliases such as `subject.path`,
|
|
|
685
694
|
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
686
695
|
and `provenance.params`.
|
|
687
696
|
|
|
697
|
+
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
698
|
+
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
699
|
+
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
700
|
+
the generating script inside the recovered source. DOI or landing-page-only
|
|
701
|
+
references can be recorded with `kind: uri`, but they are not directly
|
|
702
|
+
recoverable unless a direct archive or file `download_uri` is also recorded.
|
|
703
|
+
|
|
688
704
|
When group selectors are provided, the CLI expands them to concrete member paths
|
|
689
705
|
at annotation time. Grouped files are tracked in `subject.produced_files[]` but
|
|
690
706
|
are skipped by the per-file prompt flow, so you do not have to answer the same
|
|
691
707
|
questions for every matching file.
|
|
692
708
|
|
|
693
|
-
For post-hoc provenance, use
|
|
694
|
-
`--
|
|
695
|
-
|
|
709
|
+
For post-hoc provenance, use `--source-kind`, `--source-uri`,
|
|
710
|
+
`--source-download-uri`, `--source-path`, `--source-revision`, and
|
|
711
|
+
`--source-sha256` when the generating code is recoverable from a Git remote,
|
|
712
|
+
source archive, source file, or reference URI. Use repeatable `--git-tag` and
|
|
713
|
+
optional `--git-describe` when you know the original Git state; these values are
|
|
714
|
+
stored as human-readable hints.
|
|
696
715
|
|
|
697
716
|
For provenance inspection and source recovery:
|
|
698
717
|
|
|
@@ -703,8 +722,12 @@ data-annotations provenance chain path/to/artifact --full-paths
|
|
|
703
722
|
data-annotations provenance checkout path/to/artifact
|
|
704
723
|
```
|
|
705
724
|
|
|
706
|
-
Command `checkout`
|
|
707
|
-
|
|
725
|
+
Command `checkout` recovers the recorded source code. For Git sources, it clones
|
|
726
|
+
the recorded remote and checks out the recorded revision. For archive and file
|
|
727
|
+
sources, it downloads or copies the recorded object, verifies `sha256` when
|
|
728
|
+
present, and resolves the generating script path when recorded. Reference-only
|
|
729
|
+
URI sources are preserved in the annotation but are not directly recoverable.
|
|
730
|
+
The command prompts before downloading source code and defaults to No; use
|
|
708
731
|
`--force` when running trusted provenance checkout non-interactively.
|
|
709
732
|
|
|
710
733
|
Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
|
|
@@ -820,6 +843,8 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
|
820
843
|
- `ProducedFile`
|
|
821
844
|
- `ChildBundle`
|
|
822
845
|
- `InputArtifact`
|
|
846
|
+
- `SourceCodeKind`
|
|
847
|
+
- `SourceCodeReference`
|
|
823
848
|
- `BaseProvenance`
|
|
824
849
|
- `FileManifest`
|
|
825
850
|
- `DirectoryManifest`
|
|
@@ -837,6 +862,7 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
|
837
862
|
- `analyze_provenance_chain(...)`
|
|
838
863
|
- `provenance_chain_is_fresh(...)`
|
|
839
864
|
- `artifact_matches_manifest(...)`
|
|
865
|
+
- `recover_manifest_source(...)`
|
|
840
866
|
- `checkout_manifest_source(...)`
|
|
841
867
|
|
|
842
868
|
### Publish Functions
|
|
@@ -869,6 +895,7 @@ uv run python examples/provenance_chain.py
|
|
|
869
895
|
uv run python examples/provenance_chain_cli.py
|
|
870
896
|
uv run python examples/recover_provenance.py
|
|
871
897
|
uv run python examples/recover_provenance_cli.py
|
|
898
|
+
uv run python examples/recover_archive_source.py
|
|
872
899
|
uv run python examples/publish_cli.py
|
|
873
900
|
```
|
|
874
901
|
|
|
@@ -72,6 +72,8 @@ Every annotation document includes provenance with:
|
|
|
72
72
|
- The script path relative to the Git repo root when it can be determined
|
|
73
73
|
- Git commit, branch, dirty state, canonical repository remote, exact tags, and
|
|
74
74
|
`git describe` output when available
|
|
75
|
+
- A source-code reference for recovery, derived from Git metadata when possible
|
|
76
|
+
or supplied explicitly for archives, individual files, and DOI/URI records
|
|
75
77
|
- The current `SLURM_JOB_ID` when available
|
|
76
78
|
- Structured snapshots for recorded local inputs, including file checksums,
|
|
77
79
|
directory content digests, and upstream annotation sidecar references when
|
|
@@ -80,8 +82,8 @@ Every annotation document includes provenance with:
|
|
|
80
82
|
You can also attach your own parameters, input file paths, and function names.
|
|
81
83
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
82
84
|
such as `s3://...` or `https://...` are preserved as provided.
|
|
83
|
-
Git tags and `git_describe` are human-friendly hints only
|
|
84
|
-
|
|
85
|
+
Git tags and `git_describe` are human-friendly hints only. For Git sources,
|
|
86
|
+
`git_sha` and `source_code.revision` identify the recoverable code state.
|
|
85
87
|
|
|
86
88
|
## Quick Start
|
|
87
89
|
|
|
@@ -507,8 +509,9 @@ per call.
|
|
|
507
509
|
Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
|
|
508
510
|
matches an annotation document. Use `analyze_provenance_chain(...)` when you also
|
|
509
511
|
want to verify recorded inputs and recursively follow upstream annotation
|
|
510
|
-
sidecars. Use `checkout_manifest_source(...)` to recover the recorded source code
|
|
511
|
-
from Git metadata.
|
|
512
|
+
sidecars. Use `recover_manifest_source(...)` to recover the recorded source code
|
|
513
|
+
from Git metadata, a recorded source archive, or a recorded source file.
|
|
514
|
+
`checkout_manifest_source(...)` remains available as a compatibility alias.
|
|
512
515
|
|
|
513
516
|
```python
|
|
514
517
|
from pathlib import Path
|
|
@@ -516,7 +519,7 @@ from pathlib import Path
|
|
|
516
519
|
from data_annotations.provenance import (
|
|
517
520
|
analyze_provenance_chain,
|
|
518
521
|
artifact_matches_manifest,
|
|
519
|
-
|
|
522
|
+
recover_manifest_source,
|
|
520
523
|
)
|
|
521
524
|
|
|
522
525
|
annotation_path = Path("outputs/participants.csv.annotation.json")
|
|
@@ -525,7 +528,7 @@ artifact_path = Path("downloads/participants.csv")
|
|
|
525
528
|
if artifact_matches_manifest(artifact_path, annotation_path):
|
|
526
529
|
chain = analyze_provenance_chain(artifact_path)
|
|
527
530
|
print(chain.status)
|
|
528
|
-
recovered = checkout_manifest_source(annotation_path)
|
|
531
|
+
recovered = recover_manifest_source(annotation_path)
|
|
529
532
|
print(recovered.checkout_path)
|
|
530
533
|
print(recovered.script_path)
|
|
531
534
|
```
|
|
@@ -542,9 +545,9 @@ still attach provenance and description after the fact.
|
|
|
542
545
|
|
|
543
546
|
Post-hoc descriptions can still be very useful, but the quality of post-hoc
|
|
544
547
|
provenance depends on how exact the supplied answers are. In particular, fields
|
|
545
|
-
such as the generating script, command, function,
|
|
546
|
-
Git tags, `git describe` output, inputs, and parameters are
|
|
547
|
-
the information entered during annotation.
|
|
548
|
+
such as the generating script, command, function, source-code URI, Git commit,
|
|
549
|
+
repository path, Git tags, `git describe` output, inputs, and parameters are
|
|
550
|
+
only as reliable as the information entered during annotation.
|
|
548
551
|
|
|
549
552
|
## CLI Workflow
|
|
550
553
|
|
|
@@ -597,25 +600,31 @@ summary: Participant-level cohort assignments.
|
|
|
597
600
|
kind: dataset
|
|
598
601
|
|
|
599
602
|
inputs:
|
|
600
|
-
|
|
603
|
+
- ${DATA_ROOT}/raw/participants.csv
|
|
601
604
|
|
|
602
605
|
params:
|
|
603
|
-
|
|
606
|
+
split: validation
|
|
604
607
|
|
|
605
608
|
provenance:
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
+
command: bash scripts/build_participants.sh
|
|
610
|
+
script: scripts/build_participants.sh
|
|
611
|
+
git_sha: deadbeef
|
|
612
|
+
source_code:
|
|
613
|
+
kind: archive
|
|
614
|
+
uri: https://doi.org/10.5281/zenodo.12345
|
|
615
|
+
download_uri: https://zenodo.org/records/12345/files/source.zip
|
|
616
|
+
path: scripts/build_participants.sh
|
|
617
|
+
sha256: 0000000000000000000000000000000000000000000000000000000000000000
|
|
609
618
|
|
|
610
619
|
fields:
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
620
|
+
- name: participant_id
|
|
621
|
+
summary: Stable participant identifier.
|
|
622
|
+
data_type: string
|
|
623
|
+
required: true
|
|
624
|
+
nullable: false
|
|
616
625
|
|
|
617
626
|
primary_key:
|
|
618
|
-
|
|
627
|
+
- participant_id
|
|
619
628
|
```
|
|
620
629
|
|
|
621
630
|
Directory answers use an explicit inventory. Paths in `artifacts`,
|
|
@@ -628,26 +637,26 @@ title: Processing outputs
|
|
|
628
637
|
summary: Files produced by the shell processing workflow.
|
|
629
638
|
|
|
630
639
|
provenance:
|
|
631
|
-
|
|
632
|
-
|
|
640
|
+
command: bash process_from_instrument.sh
|
|
641
|
+
script: process_from_instrument.sh
|
|
633
642
|
|
|
634
643
|
artifacts:
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
644
|
+
- path: processed.csv
|
|
645
|
+
kind: dataset
|
|
646
|
+
title: Processed instrument output
|
|
647
|
+
summary: Normalized output from the processing script.
|
|
639
648
|
|
|
640
649
|
artifact_groups:
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
650
|
+
- title: Diagnostic plots
|
|
651
|
+
kind: plot
|
|
652
|
+
selector: plots/*.png
|
|
653
|
+
paths:
|
|
654
|
+
- plots/qc-1.png
|
|
655
|
+
- plots/qc-2.png
|
|
647
656
|
|
|
648
657
|
child_bundles:
|
|
649
|
-
|
|
650
|
-
|
|
658
|
+
- path: model
|
|
659
|
+
annotation_path: model/data-annotations.json
|
|
651
660
|
```
|
|
652
661
|
|
|
653
662
|
Answers files may also use schema-style aliases such as `subject.path`,
|
|
@@ -655,14 +664,24 @@ Answers files may also use schema-style aliases such as `subject.path`,
|
|
|
655
664
|
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
656
665
|
and `provenance.params`.
|
|
657
666
|
|
|
667
|
+
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
668
|
+
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
669
|
+
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
670
|
+
the generating script inside the recovered source. DOI or landing-page-only
|
|
671
|
+
references can be recorded with `kind: uri`, but they are not directly
|
|
672
|
+
recoverable unless a direct archive or file `download_uri` is also recorded.
|
|
673
|
+
|
|
658
674
|
When group selectors are provided, the CLI expands them to concrete member paths
|
|
659
675
|
at annotation time. Grouped files are tracked in `subject.produced_files[]` but
|
|
660
676
|
are skipped by the per-file prompt flow, so you do not have to answer the same
|
|
661
677
|
questions for every matching file.
|
|
662
678
|
|
|
663
|
-
For post-hoc provenance, use
|
|
664
|
-
`--
|
|
665
|
-
|
|
679
|
+
For post-hoc provenance, use `--source-kind`, `--source-uri`,
|
|
680
|
+
`--source-download-uri`, `--source-path`, `--source-revision`, and
|
|
681
|
+
`--source-sha256` when the generating code is recoverable from a Git remote,
|
|
682
|
+
source archive, source file, or reference URI. Use repeatable `--git-tag` and
|
|
683
|
+
optional `--git-describe` when you know the original Git state; these values are
|
|
684
|
+
stored as human-readable hints.
|
|
666
685
|
|
|
667
686
|
For provenance inspection and source recovery:
|
|
668
687
|
|
|
@@ -673,8 +692,12 @@ data-annotations provenance chain path/to/artifact --full-paths
|
|
|
673
692
|
data-annotations provenance checkout path/to/artifact
|
|
674
693
|
```
|
|
675
694
|
|
|
676
|
-
Command `checkout`
|
|
677
|
-
|
|
695
|
+
Command `checkout` recovers the recorded source code. For Git sources, it clones
|
|
696
|
+
the recorded remote and checks out the recorded revision. For archive and file
|
|
697
|
+
sources, it downloads or copies the recorded object, verifies `sha256` when
|
|
698
|
+
present, and resolves the generating script path when recorded. Reference-only
|
|
699
|
+
URI sources are preserved in the annotation but are not directly recoverable.
|
|
700
|
+
The command prompts before downloading source code and defaults to No; use
|
|
678
701
|
`--force` when running trusted provenance checkout non-interactively.
|
|
679
702
|
|
|
680
703
|
Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
|
|
@@ -790,6 +813,8 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
|
790
813
|
- `ProducedFile`
|
|
791
814
|
- `ChildBundle`
|
|
792
815
|
- `InputArtifact`
|
|
816
|
+
- `SourceCodeKind`
|
|
817
|
+
- `SourceCodeReference`
|
|
793
818
|
- `BaseProvenance`
|
|
794
819
|
- `FileManifest`
|
|
795
820
|
- `DirectoryManifest`
|
|
@@ -807,6 +832,7 @@ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
|
807
832
|
- `analyze_provenance_chain(...)`
|
|
808
833
|
- `provenance_chain_is_fresh(...)`
|
|
809
834
|
- `artifact_matches_manifest(...)`
|
|
835
|
+
- `recover_manifest_source(...)`
|
|
810
836
|
- `checkout_manifest_source(...)`
|
|
811
837
|
|
|
812
838
|
### Publish Functions
|
|
@@ -839,6 +865,7 @@ uv run python examples/provenance_chain.py
|
|
|
839
865
|
uv run python examples/provenance_chain_cli.py
|
|
840
866
|
uv run python examples/recover_provenance.py
|
|
841
867
|
uv run python examples/recover_provenance_cli.py
|
|
868
|
+
uv run python examples/recover_archive_source.py
|
|
842
869
|
uv run python examples/publish_cli.py
|
|
843
870
|
```
|
|
844
871
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data-annotations"
|
|
3
|
-
version = "2.4.0"
|
|
4
|
-
description = "Annotate
|
|
3
|
+
version = "2.5.0"
|
|
4
|
+
description = "Annotate data artifacts with provenance and descriptions"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
7
7
|
{ name = "Rodrigo C. G. Pena", email = "rodrigo.cerqueiragonzalezpena@unibas.ch" },
|
{data_annotations-2.4.0 → data_annotations-2.5.0}/src/data_annotations/annotations/models.py
RENAMED
|
@@ -22,14 +22,14 @@ class DirectoryArtifactSubject(BaseModel):
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class FileAnnotationDocument(BaseModel):
|
|
25
|
-
annotation_version: Literal["
|
|
25
|
+
annotation_version: Literal["6"] = "6"
|
|
26
26
|
subject: FileArtifactSubject
|
|
27
27
|
provenance: BaseProvenance
|
|
28
28
|
description: FileDescription
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class DirectoryAnnotationDocument(BaseModel):
|
|
32
|
-
annotation_version: Literal["
|
|
32
|
+
annotation_version: Literal["6"] = "6"
|
|
33
33
|
subject: DirectoryArtifactSubject
|
|
34
34
|
provenance: BaseProvenance
|
|
35
35
|
description: DirectoryDescription
|