data-annotations 2.3.0__tar.gz → 2.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_annotations-2.3.0 → data_annotations-2.5.0}/PKG-INFO +162 -14
- {data_annotations-2.3.0 → data_annotations-2.5.0}/README.md +159 -12
- {data_annotations-2.3.0 → data_annotations-2.5.0}/pyproject.toml +3 -3
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/annotations/models.py +2 -2
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/cli.py +3 -1
- data_annotations-2.5.0/src/data_annotations/cli_app/annotate/__init__.py +492 -0
- data_annotations-2.5.0/src/data_annotations/cli_app/annotate/helpers.py +605 -0
- data_annotations-2.5.0/src/data_annotations/cli_app/answers.py +405 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/common.py +50 -4
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/prompts.py +21 -6
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/provenance_commands.py +48 -11
- data_annotations-2.5.0/src/data_annotations/cli_app/publish.py +98 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/__init__.py +6 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/models.py +19 -2
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/__init__.py +102 -0
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/chain.py +312 -0
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/manifest.py +179 -0
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/matching.py +263 -0
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/sources.py +507 -0
- data_annotations-2.5.0/src/data_annotations/provenance/recovery/types.py +32 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/writers.py +23 -0
- data_annotations-2.5.0/src/data_annotations/publish.py +546 -0
- data_annotations-2.3.0/src/data_annotations/cli_app/annotate.py +0 -483
- data_annotations-2.3.0/src/data_annotations/provenance/recovery.py +0 -926
- {data_annotations-2.3.0 → data_annotations-2.5.0}/LICENSE +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/__init__.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/_decorators.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/annotations/__init__.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/annotations/decorators.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/annotations/writers.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/cli_app/__init__.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/description/__init__.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/description/decorators.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/description/models.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/description/writers.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/decorators.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/git.py +0 -0
- {data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/provenance/runtime.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-annotations
|
|
3
|
-
Version: 2.3.0
|
|
4
|
-
Summary: Annotate
|
|
3
|
+
Version: 2.5.0
|
|
4
|
+
Summary: Annotate data artifacts with provenance and descriptions
|
|
5
5
|
Keywords: annotations,data,metadata,provenance,reproducibility
|
|
6
6
|
Author: Rodrigo C. G. Pena
|
|
7
7
|
Author-email: Rodrigo C. G. Pena <rodrigo.cerqueiragonzalezpena@unibas.ch>
|
|
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
18
18
|
Classifier: Topic :: Scientific/Engineering
|
|
19
19
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
20
|
Requires-Dist: pydantic>=2.13.1
|
|
21
|
+
Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
|
|
21
22
|
Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
|
|
22
23
|
Requires-Dist: typer>=0.16.0 ; extra == 'cli'
|
|
23
24
|
Requires-Python: >=3.12
|
|
@@ -101,6 +102,8 @@ Every annotation document includes provenance with:
|
|
|
101
102
|
- The script path relative to the Git repo root when it can be determined
|
|
102
103
|
- Git commit, branch, dirty state, canonical repository remote, exact tags, and
|
|
103
104
|
`git describe` output when available
|
|
105
|
+
- A source-code reference for recovery, derived from Git metadata when possible
|
|
106
|
+
or supplied explicitly for archives, individual files, and DOI/URI records
|
|
104
107
|
- The current `SLURM_JOB_ID` when available
|
|
105
108
|
- Structured snapshots for recorded local inputs, including file checksums,
|
|
106
109
|
directory content digests, and upstream annotation sidecar references when
|
|
@@ -109,8 +112,8 @@ Every annotation document includes provenance with:
|
|
|
109
112
|
You can also attach your own parameters, input file paths, and function names.
|
|
110
113
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
111
114
|
such as `s3://...` or `https://...` are preserved as provided.
|
|
112
|
-
Git tags and `git_describe` are human-friendly hints only
|
|
113
|
-
|
|
115
|
+
Git tags and `git_describe` are human-friendly hints only. For Git sources,
|
|
116
|
+
`git_sha` and `source_code.revision` identify the recoverable code state.
|
|
114
117
|
|
|
115
118
|
## Quick Start
|
|
116
119
|
|
|
@@ -536,8 +539,9 @@ per call.
|
|
|
536
539
|
Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
|
|
537
540
|
matches an annotation document. Use `analyze_provenance_chain(...)` when you also
|
|
538
541
|
want to verify recorded inputs and recursively follow upstream annotation
|
|
539
|
-
sidecars. Use `
|
|
540
|
-
from Git metadata.
|
|
542
|
+
sidecars. Use `recover_manifest_source(...)` to recover the recorded source code
|
|
543
|
+
from Git metadata, a recorded source archive, or a recorded source file.
|
|
544
|
+
`checkout_manifest_source(...)` remains available as a compatibility alias.
|
|
541
545
|
|
|
542
546
|
```python
|
|
543
547
|
from pathlib import Path
|
|
@@ -545,7 +549,7 @@ from pathlib import Path
|
|
|
545
549
|
from data_annotations.provenance import (
|
|
546
550
|
analyze_provenance_chain,
|
|
547
551
|
artifact_matches_manifest,
|
|
548
|
-
|
|
552
|
+
recover_manifest_source,
|
|
549
553
|
)
|
|
550
554
|
|
|
551
555
|
annotation_path = Path("outputs/participants.csv.annotation.json")
|
|
@@ -554,7 +558,7 @@ artifact_path = Path("downloads/participants.csv")
|
|
|
554
558
|
if artifact_matches_manifest(artifact_path, annotation_path):
|
|
555
559
|
chain = analyze_provenance_chain(artifact_path)
|
|
556
560
|
print(chain.status)
|
|
557
|
-
recovered = checkout_manifest_source(annotation_path)
|
|
561
|
+
recovered = recover_manifest_source(annotation_path)
|
|
558
562
|
print(recovered.checkout_path)
|
|
559
563
|
print(recovered.script_path)
|
|
560
564
|
```
|
|
@@ -571,9 +575,9 @@ still attach provenance and description after the fact.
|
|
|
571
575
|
|
|
572
576
|
Post-hoc descriptions can still be very useful, but the quality of post-hoc
|
|
573
577
|
provenance depends on how exact the supplied answers are. In particular, fields
|
|
574
|
-
such as the generating script, command, function,
|
|
575
|
-
Git tags, `git describe` output, inputs, and parameters are
|
|
576
|
-
the information entered during annotation.
|
|
578
|
+
such as the generating script, command, function, source-code URI, Git commit,
|
|
579
|
+
repository path, Git tags, `git describe` output, inputs, and parameters are
|
|
580
|
+
only as reliable as the information entered during annotation.
|
|
577
581
|
|
|
578
582
|
## CLI Workflow
|
|
579
583
|
|
|
@@ -599,14 +603,115 @@ These commands prompt for missing details, write `*.annotation.json` or `data-an
|
|
|
599
603
|
and optionally derive README sidecars. Post-hoc records are marked with
|
|
600
604
|
`capture_mode="post_hoc"`.
|
|
601
605
|
|
|
606
|
+
For shell workflows, you can move the prompt answers into a YAML file and run
|
|
607
|
+
the command non-interactively:
|
|
608
|
+
|
|
609
|
+
```bash
|
|
610
|
+
data-annotations annotate file path/to/participants.csv --answers participants.yaml
|
|
611
|
+
data-annotations annotate directory path/to/run-001 --answers run-001.yaml
|
|
612
|
+
data-annotations annotate answers check participants.yaml
|
|
613
|
+
```
|
|
614
|
+
|
|
615
|
+
When `--answers` is provided, `--no-interactive` is the default. Use
|
|
616
|
+
`--interactive` if you want the YAML file to provide defaults and still prompt
|
|
617
|
+
for missing required values. If the YAML file includes `target`, the positional
|
|
618
|
+
target may be omitted; when both are provided, they must resolve to the same
|
|
619
|
+
path. Environment variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded
|
|
620
|
+
inside string values, and validation fails if a referenced variable is not set.
|
|
621
|
+
The `answers check` helper requires `target` so it can infer whether the answers
|
|
622
|
+
describe a file or a directory.
|
|
623
|
+
|
|
624
|
+
File answers can use top-level prompt-style keys:
|
|
625
|
+
|
|
626
|
+
```yaml
|
|
627
|
+
target: path/to/participants.csv
|
|
628
|
+
title: Participant Cohort
|
|
629
|
+
summary: Participant-level cohort assignments.
|
|
630
|
+
kind: dataset
|
|
631
|
+
|
|
632
|
+
inputs:
|
|
633
|
+
- ${DATA_ROOT}/raw/participants.csv
|
|
634
|
+
|
|
635
|
+
params:
|
|
636
|
+
split: validation
|
|
637
|
+
|
|
638
|
+
provenance:
|
|
639
|
+
command: bash scripts/build_participants.sh
|
|
640
|
+
script: scripts/build_participants.sh
|
|
641
|
+
git_sha: deadbeef
|
|
642
|
+
source_code:
|
|
643
|
+
kind: archive
|
|
644
|
+
uri: https://doi.org/10.5281/zenodo.12345
|
|
645
|
+
download_uri: https://zenodo.org/records/12345/files/source.zip
|
|
646
|
+
path: scripts/build_participants.sh
|
|
647
|
+
sha256: 0000000000000000000000000000000000000000000000000000000000000000
|
|
648
|
+
|
|
649
|
+
fields:
|
|
650
|
+
- name: participant_id
|
|
651
|
+
summary: Stable participant identifier.
|
|
652
|
+
data_type: string
|
|
653
|
+
required: true
|
|
654
|
+
nullable: false
|
|
655
|
+
|
|
656
|
+
primary_key:
|
|
657
|
+
- participant_id
|
|
658
|
+
```
|
|
659
|
+
|
|
660
|
+
Directory answers use an explicit inventory. Paths in `artifacts`,
|
|
661
|
+
`artifact_groups.paths`, and `child_bundles` are relative to the annotated
|
|
662
|
+
directory unless absolute:
|
|
663
|
+
|
|
664
|
+
```yaml
|
|
665
|
+
target: path/to/run-001
|
|
666
|
+
title: Processing outputs
|
|
667
|
+
summary: Files produced by the shell processing workflow.
|
|
668
|
+
|
|
669
|
+
provenance:
|
|
670
|
+
command: bash process_from_instrument.sh
|
|
671
|
+
script: process_from_instrument.sh
|
|
672
|
+
|
|
673
|
+
artifacts:
|
|
674
|
+
- path: processed.csv
|
|
675
|
+
kind: dataset
|
|
676
|
+
title: Processed instrument output
|
|
677
|
+
summary: Normalized output from the processing script.
|
|
678
|
+
|
|
679
|
+
artifact_groups:
|
|
680
|
+
- title: Diagnostic plots
|
|
681
|
+
kind: plot
|
|
682
|
+
selector: plots/*.png
|
|
683
|
+
paths:
|
|
684
|
+
- plots/qc-1.png
|
|
685
|
+
- plots/qc-2.png
|
|
686
|
+
|
|
687
|
+
child_bundles:
|
|
688
|
+
- path: model
|
|
689
|
+
annotation_path: model/data-annotations.json
|
|
690
|
+
```
|
|
691
|
+
|
|
692
|
+
Answers files may also use schema-style aliases such as `subject.path`,
|
|
693
|
+
`subject.kind`, `description.title`, `description.summary`,
|
|
694
|
+
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
695
|
+
and `provenance.params`.
|
|
696
|
+
|
|
697
|
+
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
698
|
+
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
699
|
+
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
700
|
+
the generating script inside the recovered source. DOI or landing-page-only
|
|
701
|
+
references can be recorded with `kind: uri`, but they are not directly
|
|
702
|
+
recoverable unless a direct archive or file `download_uri` is also recorded.
|
|
703
|
+
|
|
602
704
|
When group selectors are provided, the CLI expands them to concrete member paths
|
|
603
705
|
at annotation time. Grouped files are tracked in `subject.produced_files[]` but
|
|
604
706
|
are skipped by the per-file prompt flow, so you do not have to answer the same
|
|
605
707
|
questions for every matching file.
|
|
606
708
|
|
|
607
|
-
For post-hoc provenance, use
|
|
608
|
-
`--
|
|
609
|
-
|
|
709
|
+
For post-hoc provenance, use `--source-kind`, `--source-uri`,
|
|
710
|
+
`--source-download-uri`, `--source-path`, `--source-revision`, and
|
|
711
|
+
`--source-sha256` when the generating code is recoverable from a Git remote,
|
|
712
|
+
source archive, source file, or reference URI. Use repeatable `--git-tag` and
|
|
713
|
+
optional `--git-describe` when you know the original Git state; these values are
|
|
714
|
+
stored as human-readable hints.
|
|
610
715
|
|
|
611
716
|
For provenance inspection and source recovery:
|
|
612
717
|
|
|
@@ -617,6 +722,14 @@ data-annotations provenance chain path/to/artifact --full-paths
|
|
|
617
722
|
data-annotations provenance checkout path/to/artifact
|
|
618
723
|
```
|
|
619
724
|
|
|
725
|
+
Command `checkout` recovers the recorded source code. For Git sources, it clones
|
|
726
|
+
the recorded remote and checks out the recorded revision. For archive and file
|
|
727
|
+
sources, it downloads or copies the recorded object, verifies `sha256` when
|
|
728
|
+
present, and resolves the generating script path when recorded. Reference-only
|
|
729
|
+
URI sources are preserved in the annotation but are not directly recoverable.
|
|
730
|
+
The command prompts before downloading source code and defaults to No; use
|
|
731
|
+
`--force` when running trusted provenance checkout non-interactively.
|
|
732
|
+
|
|
620
733
|
Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
|
|
621
734
|
directories, prints a verification summary, and suggests the exact `checkout`
|
|
622
735
|
command to run next when Git recovery metadata is available.
|
|
@@ -626,6 +739,27 @@ those inputs. Its default output shows a compact relative-path tree and lists
|
|
|
626
739
|
stale, missing, or unverifiable nodes first; use `--full-paths` when you need
|
|
627
740
|
absolute paths.
|
|
628
741
|
|
|
742
|
+
For publication workflows, create a sanitized copy of an annotated artifact tree:
|
|
743
|
+
|
|
744
|
+
```bash
|
|
745
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
746
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle \
|
|
747
|
+
--prefix /private/raw/study-a='$INPUT_ROOT'
|
|
748
|
+
data-annotations publish path/to/run-001 path/to/publish-metadata \
|
|
749
|
+
--annotations-only
|
|
750
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle --dry-run
|
|
751
|
+
```
|
|
752
|
+
|
|
753
|
+
Command `publish` recursively discovers file annotations (`*.annotation.json`) and
|
|
754
|
+
directory annotations (`data-annotations.json`), writes a mirrored publish bundle,
|
|
755
|
+
and regenerates README sidecars from sanitized annotation JSON. Paths under the
|
|
756
|
+
source directory are rewritten to `$ARTIFACT_ROOT/...`; additional `--prefix`
|
|
757
|
+
mappings rewrite other private path roots. Hostname, username, and SLURM job ID
|
|
758
|
+
are redacted by default. Git remote URLs are preserved unless
|
|
759
|
+
`--redact-git-remote` is provided. Strict mode is enabled by default and fails if
|
|
760
|
+
any local absolute path remains after sanitization; use `--no-strict` only after
|
|
761
|
+
reviewing `--dry-run` output.
|
|
762
|
+
|
|
629
763
|
If `data-annotations provenance --help` does not list `chain`, your shell is
|
|
630
764
|
resolving an older installed command. From a source checkout, use
|
|
631
765
|
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
@@ -656,6 +790,7 @@ uv run data-annotations annotate directory path/to/run-001
|
|
|
656
790
|
uv run data-annotations provenance match path/to/participants.csv
|
|
657
791
|
uv run data-annotations provenance chain path/to/participants.csv
|
|
658
792
|
uv run data-annotations provenance checkout path/to/participants.csv
|
|
793
|
+
uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
659
794
|
```
|
|
660
795
|
|
|
661
796
|
## API Overview
|
|
@@ -708,6 +843,8 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
708
843
|
- `ProducedFile`
|
|
709
844
|
- `ChildBundle`
|
|
710
845
|
- `InputArtifact`
|
|
846
|
+
- `SourceCodeKind`
|
|
847
|
+
- `SourceCodeReference`
|
|
711
848
|
- `BaseProvenance`
|
|
712
849
|
- `FileManifest`
|
|
713
850
|
- `DirectoryManifest`
|
|
@@ -725,8 +862,16 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
725
862
|
- `analyze_provenance_chain(...)`
|
|
726
863
|
- `provenance_chain_is_fresh(...)`
|
|
727
864
|
- `artifact_matches_manifest(...)`
|
|
865
|
+
- `recover_manifest_source(...)`
|
|
728
866
|
- `checkout_manifest_source(...)`
|
|
729
867
|
|
|
868
|
+
### Publish Functions
|
|
869
|
+
|
|
870
|
+
- `discover_annotation_paths(...)`
|
|
871
|
+
- `sanitize_annotation_document(...)`
|
|
872
|
+
- `sanitize_annotation_path(...)`
|
|
873
|
+
- `publish_directory(...)`
|
|
874
|
+
|
|
730
875
|
## Examples
|
|
731
876
|
|
|
732
877
|
Runnable examples live in `examples/` and mirror the README workflows.
|
|
@@ -741,6 +886,7 @@ uv run python examples/record_file_description.py
|
|
|
741
886
|
uv run python examples/record_directory_description.py
|
|
742
887
|
uv run python examples/annotate_file.py
|
|
743
888
|
uv run python examples/annotate_directory.py
|
|
889
|
+
uv run python examples/annotate_file_answers_cli.py
|
|
744
890
|
uv run python examples/write_file_manifest.py
|
|
745
891
|
uv run python examples/write_directory_manifest.py
|
|
746
892
|
uv run python examples/write_file_description.py
|
|
@@ -749,6 +895,8 @@ uv run python examples/provenance_chain.py
|
|
|
749
895
|
uv run python examples/provenance_chain_cli.py
|
|
750
896
|
uv run python examples/recover_provenance.py
|
|
751
897
|
uv run python examples/recover_provenance_cli.py
|
|
898
|
+
uv run python examples/recover_archive_source.py
|
|
899
|
+
uv run python examples/publish_cli.py
|
|
752
900
|
```
|
|
753
901
|
|
|
754
902
|
Each example writes its outputs to a fresh temporary directory and prints the
|
|
@@ -72,6 +72,8 @@ Every annotation document includes provenance with:
|
|
|
72
72
|
- The script path relative to the Git repo root when it can be determined
|
|
73
73
|
- Git commit, branch, dirty state, canonical repository remote, exact tags, and
|
|
74
74
|
`git describe` output when available
|
|
75
|
+
- A source-code reference for recovery, derived from Git metadata when possible
|
|
76
|
+
or supplied explicitly for archives, individual files, and DOI/URI records
|
|
75
77
|
- The current `SLURM_JOB_ID` when available
|
|
76
78
|
- Structured snapshots for recorded local inputs, including file checksums,
|
|
77
79
|
directory content digests, and upstream annotation sidecar references when
|
|
@@ -80,8 +82,8 @@ Every annotation document includes provenance with:
|
|
|
80
82
|
You can also attach your own parameters, input file paths, and function names.
|
|
81
83
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
82
84
|
such as `s3://...` or `https://...` are preserved as provided.
|
|
83
|
-
Git tags and `git_describe` are human-friendly hints only
|
|
84
|
-
|
|
85
|
+
Git tags and `git_describe` are human-friendly hints only. For Git sources,
|
|
86
|
+
`git_sha` and `source_code.revision` identify the recoverable code state.
|
|
85
87
|
|
|
86
88
|
## Quick Start
|
|
87
89
|
|
|
@@ -507,8 +509,9 @@ per call.
|
|
|
507
509
|
Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
|
|
508
510
|
matches an annotation document. Use `analyze_provenance_chain(...)` when you also
|
|
509
511
|
want to verify recorded inputs and recursively follow upstream annotation
|
|
510
|
-
sidecars. Use `
|
|
511
|
-
from Git metadata.
|
|
512
|
+
sidecars. Use `recover_manifest_source(...)` to recover the recorded source code
|
|
513
|
+
from Git metadata, a recorded source archive, or a recorded source file.
|
|
514
|
+
`checkout_manifest_source(...)` remains available as a compatibility alias.
|
|
512
515
|
|
|
513
516
|
```python
|
|
514
517
|
from pathlib import Path
|
|
@@ -516,7 +519,7 @@ from pathlib import Path
|
|
|
516
519
|
from data_annotations.provenance import (
|
|
517
520
|
analyze_provenance_chain,
|
|
518
521
|
artifact_matches_manifest,
|
|
519
|
-
|
|
522
|
+
recover_manifest_source,
|
|
520
523
|
)
|
|
521
524
|
|
|
522
525
|
annotation_path = Path("outputs/participants.csv.annotation.json")
|
|
@@ -525,7 +528,7 @@ artifact_path = Path("downloads/participants.csv")
|
|
|
525
528
|
if artifact_matches_manifest(artifact_path, annotation_path):
|
|
526
529
|
chain = analyze_provenance_chain(artifact_path)
|
|
527
530
|
print(chain.status)
|
|
528
|
-
recovered = checkout_manifest_source(annotation_path)
|
|
531
|
+
recovered = recover_manifest_source(annotation_path)
|
|
529
532
|
print(recovered.checkout_path)
|
|
530
533
|
print(recovered.script_path)
|
|
531
534
|
```
|
|
@@ -542,9 +545,9 @@ still attach provenance and description after the fact.
|
|
|
542
545
|
|
|
543
546
|
Post-hoc descriptions can still be very useful, but the quality of post-hoc
|
|
544
547
|
provenance depends on how exact the supplied answers are. In particular, fields
|
|
545
|
-
such as the generating script, command, function,
|
|
546
|
-
Git tags, `git describe` output, inputs, and parameters are
|
|
547
|
-
the information entered during annotation.
|
|
548
|
+
such as the generating script, command, function, source-code URI, Git commit,
|
|
549
|
+
repository path, Git tags, `git describe` output, inputs, and parameters are
|
|
550
|
+
only as reliable as the information entered during annotation.
|
|
548
551
|
|
|
549
552
|
## CLI Workflow
|
|
550
553
|
|
|
@@ -570,14 +573,115 @@ These commands prompt for missing details, write `*.annotation.json` or `data-an
|
|
|
570
573
|
and optionally derive README sidecars. Post-hoc records are marked with
|
|
571
574
|
`capture_mode="post_hoc"`.
|
|
572
575
|
|
|
576
|
+
For shell workflows, you can move the prompt answers into a YAML file and run
|
|
577
|
+
the command non-interactively:
|
|
578
|
+
|
|
579
|
+
```bash
|
|
580
|
+
data-annotations annotate file path/to/participants.csv --answers participants.yaml
|
|
581
|
+
data-annotations annotate directory path/to/run-001 --answers run-001.yaml
|
|
582
|
+
data-annotations annotate answers check participants.yaml
|
|
583
|
+
```
|
|
584
|
+
|
|
585
|
+
When `--answers` is provided, `--no-interactive` is the default. Use
|
|
586
|
+
`--interactive` if you want the YAML file to provide defaults and still prompt
|
|
587
|
+
for missing required values. If the YAML file includes `target`, the positional
|
|
588
|
+
target may be omitted; when both are provided, they must resolve to the same
|
|
589
|
+
path. Environment variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded
|
|
590
|
+
inside string values, and validation fails if a referenced variable is not set.
|
|
591
|
+
The `answers check` helper requires `target` so it can infer whether the answers
|
|
592
|
+
describe a file or a directory.
|
|
593
|
+
|
|
594
|
+
File answers can use top-level prompt-style keys:
|
|
595
|
+
|
|
596
|
+
```yaml
|
|
597
|
+
target: path/to/participants.csv
|
|
598
|
+
title: Participant Cohort
|
|
599
|
+
summary: Participant-level cohort assignments.
|
|
600
|
+
kind: dataset
|
|
601
|
+
|
|
602
|
+
inputs:
|
|
603
|
+
- ${DATA_ROOT}/raw/participants.csv
|
|
604
|
+
|
|
605
|
+
params:
|
|
606
|
+
split: validation
|
|
607
|
+
|
|
608
|
+
provenance:
|
|
609
|
+
command: bash scripts/build_participants.sh
|
|
610
|
+
script: scripts/build_participants.sh
|
|
611
|
+
git_sha: deadbeef
|
|
612
|
+
source_code:
|
|
613
|
+
kind: archive
|
|
614
|
+
uri: https://doi.org/10.5281/zenodo.12345
|
|
615
|
+
download_uri: https://zenodo.org/records/12345/files/source.zip
|
|
616
|
+
path: scripts/build_participants.sh
|
|
617
|
+
sha256: 0000000000000000000000000000000000000000000000000000000000000000
|
|
618
|
+
|
|
619
|
+
fields:
|
|
620
|
+
- name: participant_id
|
|
621
|
+
summary: Stable participant identifier.
|
|
622
|
+
data_type: string
|
|
623
|
+
required: true
|
|
624
|
+
nullable: false
|
|
625
|
+
|
|
626
|
+
primary_key:
|
|
627
|
+
- participant_id
|
|
628
|
+
```
|
|
629
|
+
|
|
630
|
+
Directory answers use an explicit inventory. Paths in `artifacts`,
|
|
631
|
+
`artifact_groups.paths`, and `child_bundles` are relative to the annotated
|
|
632
|
+
directory unless absolute:
|
|
633
|
+
|
|
634
|
+
```yaml
|
|
635
|
+
target: path/to/run-001
|
|
636
|
+
title: Processing outputs
|
|
637
|
+
summary: Files produced by the shell processing workflow.
|
|
638
|
+
|
|
639
|
+
provenance:
|
|
640
|
+
command: bash process_from_instrument.sh
|
|
641
|
+
script: process_from_instrument.sh
|
|
642
|
+
|
|
643
|
+
artifacts:
|
|
644
|
+
- path: processed.csv
|
|
645
|
+
kind: dataset
|
|
646
|
+
title: Processed instrument output
|
|
647
|
+
summary: Normalized output from the processing script.
|
|
648
|
+
|
|
649
|
+
artifact_groups:
|
|
650
|
+
- title: Diagnostic plots
|
|
651
|
+
kind: plot
|
|
652
|
+
selector: plots/*.png
|
|
653
|
+
paths:
|
|
654
|
+
- plots/qc-1.png
|
|
655
|
+
- plots/qc-2.png
|
|
656
|
+
|
|
657
|
+
child_bundles:
|
|
658
|
+
- path: model
|
|
659
|
+
annotation_path: model/data-annotations.json
|
|
660
|
+
```
|
|
661
|
+
|
|
662
|
+
Answers files may also use schema-style aliases such as `subject.path`,
|
|
663
|
+
`subject.kind`, `description.title`, `description.summary`,
|
|
664
|
+
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
665
|
+
and `provenance.params`.
|
|
666
|
+
|
|
667
|
+
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
668
|
+
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
669
|
+
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
670
|
+
the generating script inside the recovered source. DOI or landing-page-only
|
|
671
|
+
references can be recorded with `kind: uri`, but they are not directly
|
|
672
|
+
recoverable unless a direct archive or file `download_uri` is also recorded.
|
|
673
|
+
|
|
573
674
|
When group selectors are provided, the CLI expands them to concrete member paths
|
|
574
675
|
at annotation time. Grouped files are tracked in `subject.produced_files[]` but
|
|
575
676
|
are skipped by the per-file prompt flow, so you do not have to answer the same
|
|
576
677
|
questions for every matching file.
|
|
577
678
|
|
|
578
|
-
For post-hoc provenance, use
|
|
579
|
-
`--
|
|
580
|
-
|
|
679
|
+
For post-hoc provenance, use `--source-kind`, `--source-uri`,
|
|
680
|
+
`--source-download-uri`, `--source-path`, `--source-revision`, and
|
|
681
|
+
`--source-sha256` when the generating code is recoverable from a Git remote,
|
|
682
|
+
source archive, source file, or reference URI. Use repeatable `--git-tag` and
|
|
683
|
+
optional `--git-describe` when you know the original Git state; these values are
|
|
684
|
+
stored as human-readable hints.
|
|
581
685
|
|
|
582
686
|
For provenance inspection and source recovery:
|
|
583
687
|
|
|
@@ -588,6 +692,14 @@ data-annotations provenance chain path/to/artifact --full-paths
|
|
|
588
692
|
data-annotations provenance checkout path/to/artifact
|
|
589
693
|
```
|
|
590
694
|
|
|
695
|
+
Command `checkout` recovers the recorded source code. For Git sources, it clones
|
|
696
|
+
the recorded remote and checks out the recorded revision. For archive and file
|
|
697
|
+
sources, it downloads or copies the recorded object, verifies `sha256` when
|
|
698
|
+
present, and resolves the generating script path when recorded. Reference-only
|
|
699
|
+
URI sources are preserved in the annotation but are not directly recoverable.
|
|
700
|
+
The command prompts before downloading source code and defaults to No; use
|
|
701
|
+
`--force` when running trusted provenance checkout non-interactively.
|
|
702
|
+
|
|
591
703
|
Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
|
|
592
704
|
directories, prints a verification summary, and suggests the exact `checkout`
|
|
593
705
|
command to run next when Git recovery metadata is available.
|
|
@@ -597,6 +709,27 @@ those inputs. Its default output shows a compact relative-path tree and lists
|
|
|
597
709
|
stale, missing, or unverifiable nodes first; use `--full-paths` when you need
|
|
598
710
|
absolute paths.
|
|
599
711
|
|
|
712
|
+
For publication workflows, create a sanitized copy of an annotated artifact tree:
|
|
713
|
+
|
|
714
|
+
```bash
|
|
715
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
716
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle \
|
|
717
|
+
--prefix /private/raw/study-a='$INPUT_ROOT'
|
|
718
|
+
data-annotations publish path/to/run-001 path/to/publish-metadata \
|
|
719
|
+
--annotations-only
|
|
720
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle --dry-run
|
|
721
|
+
```
|
|
722
|
+
|
|
723
|
+
Command `publish` recursively discovers file annotations (`*.annotation.json`) and
|
|
724
|
+
directory annotations (`data-annotations.json`), writes a mirrored publish bundle,
|
|
725
|
+
and regenerates README sidecars from sanitized annotation JSON. Paths under the
|
|
726
|
+
source directory are rewritten to `$ARTIFACT_ROOT/...`; additional `--prefix`
|
|
727
|
+
mappings rewrite other private path roots. Hostname, username, and SLURM job ID
|
|
728
|
+
are redacted by default. Git remote URLs are preserved unless
|
|
729
|
+
`--redact-git-remote` is provided. Strict mode is enabled by default and fails if
|
|
730
|
+
any local absolute path remains after sanitization; use `--no-strict` only after
|
|
731
|
+
reviewing `--dry-run` output.
|
|
732
|
+
|
|
600
733
|
If `data-annotations provenance --help` does not list `chain`, your shell is
|
|
601
734
|
resolving an older installed command. From a source checkout, use
|
|
602
735
|
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
@@ -627,6 +760,7 @@ uv run data-annotations annotate directory path/to/run-001
|
|
|
627
760
|
uv run data-annotations provenance match path/to/participants.csv
|
|
628
761
|
uv run data-annotations provenance chain path/to/participants.csv
|
|
629
762
|
uv run data-annotations provenance checkout path/to/participants.csv
|
|
763
|
+
uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
630
764
|
```
|
|
631
765
|
|
|
632
766
|
## API Overview
|
|
@@ -679,6 +813,8 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
679
813
|
- `ProducedFile`
|
|
680
814
|
- `ChildBundle`
|
|
681
815
|
- `InputArtifact`
|
|
816
|
+
- `SourceCodeKind`
|
|
817
|
+
- `SourceCodeReference`
|
|
682
818
|
- `BaseProvenance`
|
|
683
819
|
- `FileManifest`
|
|
684
820
|
- `DirectoryManifest`
|
|
@@ -696,8 +832,16 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
696
832
|
- `analyze_provenance_chain(...)`
|
|
697
833
|
- `provenance_chain_is_fresh(...)`
|
|
698
834
|
- `artifact_matches_manifest(...)`
|
|
835
|
+
- `recover_manifest_source(...)`
|
|
699
836
|
- `checkout_manifest_source(...)`
|
|
700
837
|
|
|
838
|
+
### Publish Functions
|
|
839
|
+
|
|
840
|
+
- `discover_annotation_paths(...)`
|
|
841
|
+
- `sanitize_annotation_document(...)`
|
|
842
|
+
- `sanitize_annotation_path(...)`
|
|
843
|
+
- `publish_directory(...)`
|
|
844
|
+
|
|
701
845
|
## Examples
|
|
702
846
|
|
|
703
847
|
Runnable examples live in `examples/` and mirror the README workflows.
|
|
@@ -712,6 +856,7 @@ uv run python examples/record_file_description.py
|
|
|
712
856
|
uv run python examples/record_directory_description.py
|
|
713
857
|
uv run python examples/annotate_file.py
|
|
714
858
|
uv run python examples/annotate_directory.py
|
|
859
|
+
uv run python examples/annotate_file_answers_cli.py
|
|
715
860
|
uv run python examples/write_file_manifest.py
|
|
716
861
|
uv run python examples/write_directory_manifest.py
|
|
717
862
|
uv run python examples/write_file_description.py
|
|
@@ -720,6 +865,8 @@ uv run python examples/provenance_chain.py
|
|
|
720
865
|
uv run python examples/provenance_chain_cli.py
|
|
721
866
|
uv run python examples/recover_provenance.py
|
|
722
867
|
uv run python examples/recover_provenance_cli.py
|
|
868
|
+
uv run python examples/recover_archive_source.py
|
|
869
|
+
uv run python examples/publish_cli.py
|
|
723
870
|
```
|
|
724
871
|
|
|
725
872
|
Each example writes its outputs to a fresh temporary directory and prints the
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data-annotations"
|
|
3
|
-
version = "2.3.0"
|
|
4
|
-
description = "Annotate
|
|
3
|
+
version = "2.5.0"
|
|
4
|
+
description = "Annotate data artifacts with provenance and descriptions"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
7
7
|
{ name = "Rodrigo C. G. Pena", email = "rodrigo.cerqueiragonzalezpena@unibas.ch" },
|
|
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
|
|
|
30
30
|
Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
|
|
31
31
|
|
|
32
32
|
[project.optional-dependencies]
|
|
33
|
-
cli = ["questionary>=2.1.1", "typer>=0.16.0"]
|
|
33
|
+
cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
|
|
34
34
|
|
|
35
35
|
[project.scripts]
|
|
36
36
|
data-annotations = "data_annotations.cli:main"
|
{data_annotations-2.3.0 → data_annotations-2.5.0}/src/data_annotations/annotations/models.py
RENAMED
|
@@ -22,14 +22,14 @@ class DirectoryArtifactSubject(BaseModel):
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class FileAnnotationDocument(BaseModel):
|
|
25
|
-
annotation_version: Literal["
|
|
25
|
+
annotation_version: Literal["6"] = "6"
|
|
26
26
|
subject: FileArtifactSubject
|
|
27
27
|
provenance: BaseProvenance
|
|
28
28
|
description: FileDescription
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class DirectoryAnnotationDocument(BaseModel):
|
|
32
|
-
annotation_version: Literal["
|
|
32
|
+
annotation_version: Literal["6"] = "6"
|
|
33
33
|
subject: DirectoryArtifactSubject
|
|
34
34
|
provenance: BaseProvenance
|
|
35
35
|
description: DirectoryDescription
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Any
|
|
2
2
|
|
|
3
3
|
_CLI_IMPORT_ERROR: ModuleNotFoundError | None = None
|
|
4
|
-
_CLI_OPTIONAL_DEPENDENCIES = {"questionary", "typer"}
|
|
4
|
+
_CLI_OPTIONAL_DEPENDENCIES = {"questionary", "typer", "yaml"}
|
|
5
5
|
app: Any = None
|
|
6
6
|
|
|
7
7
|
try:
|
|
@@ -9,6 +9,7 @@ try:
|
|
|
9
9
|
|
|
10
10
|
from data_annotations.cli_app.annotate import annotate_app
|
|
11
11
|
from data_annotations.cli_app.provenance_commands import provenance_app
|
|
12
|
+
from data_annotations.cli_app.publish import publish_command
|
|
12
13
|
except ModuleNotFoundError as exc:
|
|
13
14
|
if exc.name not in _CLI_OPTIONAL_DEPENDENCIES:
|
|
14
15
|
raise
|
|
@@ -17,6 +18,7 @@ else:
|
|
|
17
18
|
app = typer.Typer(no_args_is_help=True)
|
|
18
19
|
app.add_typer(annotate_app, name="annotate")
|
|
19
20
|
app.add_typer(provenance_app, name="provenance")
|
|
21
|
+
app.command("publish")(publish_command)
|
|
20
22
|
|
|
21
23
|
|
|
22
24
|
def main() -> None:
|