data-annotations 2.2.0__tar.gz → 2.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_annotations-2.2.0 → data_annotations-2.4.0}/PKG-INFO +154 -3
- {data_annotations-2.2.0 → data_annotations-2.4.0}/README.md +152 -2
- {data_annotations-2.2.0 → data_annotations-2.4.0}/pyproject.toml +2 -2
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/annotations/models.py +2 -2
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/annotations/writers.py +7 -6
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/cli.py +3 -1
- data_annotations-2.4.0/src/data_annotations/cli_app/annotate.py +939 -0
- data_annotations-2.4.0/src/data_annotations/cli_app/answers.py +403 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/cli_app/prompts.py +21 -6
- data_annotations-2.4.0/src/data_annotations/cli_app/provenance_commands.py +346 -0
- data_annotations-2.4.0/src/data_annotations/cli_app/publish.py +92 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/__init__.py +14 -1
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/models.py +10 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/recovery.py +326 -4
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/writers.py +119 -10
- data_annotations-2.4.0/src/data_annotations/publish.py +532 -0
- data_annotations-2.2.0/src/data_annotations/cli_app/annotate.py +0 -483
- data_annotations-2.2.0/src/data_annotations/cli_app/provenance_commands.py +0 -107
- {data_annotations-2.2.0 → data_annotations-2.4.0}/LICENSE +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/__init__.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/_decorators.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/annotations/__init__.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/annotations/decorators.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/cli_app/__init__.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/cli_app/common.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/description/__init__.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/description/decorators.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/description/models.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/description/writers.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/decorators.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/git.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/runtime.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-annotations
|
|
3
|
-
Version: 2.2.0
|
|
3
|
+
Version: 2.4.0
|
|
4
4
|
Summary: Annotate generated data artifacts
|
|
5
5
|
Keywords: annotations,data,metadata,provenance,reproducibility
|
|
6
6
|
Author: Rodrigo C. G. Pena
|
|
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
18
18
|
Classifier: Topic :: Scientific/Engineering
|
|
19
19
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
20
|
Requires-Dist: pydantic>=2.13.1
|
|
21
|
+
Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
|
|
21
22
|
Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
|
|
22
23
|
Requires-Dist: typer>=0.16.0 ; extra == 'cli'
|
|
23
24
|
Requires-Python: >=3.12
|
|
@@ -102,6 +103,9 @@ Every annotation document includes provenance with:
|
|
|
102
103
|
- Git commit, branch, dirty state, canonical repository remote, exact tags, and
|
|
103
104
|
`git describe` output when available
|
|
104
105
|
- The current `SLURM_JOB_ID` when available
|
|
106
|
+
- Structured snapshots for recorded local inputs, including file checksums,
|
|
107
|
+
directory content digests, and upstream annotation sidecar references when
|
|
108
|
+
present
|
|
105
109
|
|
|
106
110
|
You can also attach your own parameters, input file paths, and function names.
|
|
107
111
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
@@ -380,6 +384,7 @@ File annotations store:
|
|
|
380
384
|
- `subject.kind`
|
|
381
385
|
- `subject.sha256`
|
|
382
386
|
- `provenance.*`
|
|
387
|
+
- `provenance.input_artifacts[]`
|
|
383
388
|
- `description.title`
|
|
384
389
|
- `description.summary`
|
|
385
390
|
- `description.fields`
|
|
@@ -396,6 +401,7 @@ Directory annotations store:
|
|
|
396
401
|
- `subject.child_bundles[]`
|
|
397
402
|
- `subject.content_digest`
|
|
398
403
|
- `provenance.*`
|
|
404
|
+
- `provenance.input_artifacts[]`
|
|
399
405
|
- `description.title`
|
|
400
406
|
- `description.summary`
|
|
401
407
|
- `description.artifact_groups[]`
|
|
@@ -529,13 +535,16 @@ per call.
|
|
|
529
535
|
## Recovery Helpers
|
|
530
536
|
|
|
531
537
|
Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
|
|
532
|
-
matches an annotation document
|
|
533
|
-
recorded
|
|
538
|
+
matches an annotation document. Use `analyze_provenance_chain(...)` when you also
|
|
539
|
+
want to verify recorded inputs and recursively follow upstream annotation
|
|
540
|
+
sidecars. Use `checkout_manifest_source(...)` to recover the recorded code state
|
|
541
|
+
from Git metadata.
|
|
534
542
|
|
|
535
543
|
```python
|
|
536
544
|
from pathlib import Path
|
|
537
545
|
|
|
538
546
|
from data_annotations.provenance import (
|
|
547
|
+
analyze_provenance_chain,
|
|
539
548
|
artifact_matches_manifest,
|
|
540
549
|
checkout_manifest_source,
|
|
541
550
|
)
|
|
@@ -544,6 +553,8 @@ annotation_path = Path("outputs/participants.csv.annotation.json")
|
|
|
544
553
|
artifact_path = Path("downloads/participants.csv")
|
|
545
554
|
|
|
546
555
|
if artifact_matches_manifest(artifact_path, annotation_path):
|
|
556
|
+
chain = analyze_provenance_chain(artifact_path)
|
|
557
|
+
print(chain.status)
|
|
547
558
|
recovered = checkout_manifest_source(annotation_path)
|
|
548
559
|
print(recovered.checkout_path)
|
|
549
560
|
print(recovered.script_path)
|
|
@@ -589,6 +600,91 @@ These commands prompt for missing details, write `*.annotation.json` or `data-an
|
|
|
589
600
|
and optionally derive README sidecars. Post-hoc records are marked with
|
|
590
601
|
`capture_mode="post_hoc"`.
|
|
591
602
|
|
|
603
|
+
For shell workflows, you can move the prompt answers into a YAML file and run
|
|
604
|
+
the command non-interactively:
|
|
605
|
+
|
|
606
|
+
```bash
|
|
607
|
+
data-annotations annotate file path/to/participants.csv --answers participants.yaml
|
|
608
|
+
data-annotations annotate directory path/to/run-001 --answers run-001.yaml
|
|
609
|
+
data-annotations annotate answers check participants.yaml
|
|
610
|
+
```
|
|
611
|
+
|
|
612
|
+
When `--answers` is provided, `--no-interactive` is the default. Use
|
|
613
|
+
`--interactive` if you want the YAML file to provide defaults and still prompt
|
|
614
|
+
for missing required values. If the YAML file includes `target`, the positional
|
|
615
|
+
target may be omitted; when both are provided, they must resolve to the same
|
|
616
|
+
path. Environment variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded
|
|
617
|
+
inside string values, and validation fails if a referenced variable is not set.
|
|
618
|
+
The `answers check` helper requires `target` so it can infer whether the answers
|
|
619
|
+
describe a file or a directory.
|
|
620
|
+
|
|
621
|
+
File answers can use top-level prompt-style keys:
|
|
622
|
+
|
|
623
|
+
```yaml
|
|
624
|
+
target: path/to/participants.csv
|
|
625
|
+
title: Participant Cohort
|
|
626
|
+
summary: Participant-level cohort assignments.
|
|
627
|
+
kind: dataset
|
|
628
|
+
|
|
629
|
+
inputs:
|
|
630
|
+
- ${DATA_ROOT}/raw/participants.csv
|
|
631
|
+
|
|
632
|
+
params:
|
|
633
|
+
split: validation
|
|
634
|
+
|
|
635
|
+
provenance:
|
|
636
|
+
command: bash scripts/build_participants.sh
|
|
637
|
+
script: scripts/build_participants.sh
|
|
638
|
+
git_sha: deadbeef
|
|
639
|
+
|
|
640
|
+
fields:
|
|
641
|
+
- name: participant_id
|
|
642
|
+
summary: Stable participant identifier.
|
|
643
|
+
data_type: string
|
|
644
|
+
required: true
|
|
645
|
+
nullable: false
|
|
646
|
+
|
|
647
|
+
primary_key:
|
|
648
|
+
- participant_id
|
|
649
|
+
```
|
|
650
|
+
|
|
651
|
+
Directory answers use an explicit inventory. Paths in `artifacts`,
|
|
652
|
+
`artifact_groups.paths`, and `child_bundles` are relative to the annotated
|
|
653
|
+
directory unless absolute:
|
|
654
|
+
|
|
655
|
+
```yaml
|
|
656
|
+
target: path/to/run-001
|
|
657
|
+
title: Processing outputs
|
|
658
|
+
summary: Files produced by the shell processing workflow.
|
|
659
|
+
|
|
660
|
+
provenance:
|
|
661
|
+
command: bash process_from_instrument.sh
|
|
662
|
+
script: process_from_instrument.sh
|
|
663
|
+
|
|
664
|
+
artifacts:
|
|
665
|
+
- path: processed.csv
|
|
666
|
+
kind: dataset
|
|
667
|
+
title: Processed instrument output
|
|
668
|
+
summary: Normalized output from the processing script.
|
|
669
|
+
|
|
670
|
+
artifact_groups:
|
|
671
|
+
- title: Diagnostic plots
|
|
672
|
+
kind: plot
|
|
673
|
+
selector: plots/*.png
|
|
674
|
+
paths:
|
|
675
|
+
- plots/qc-1.png
|
|
676
|
+
- plots/qc-2.png
|
|
677
|
+
|
|
678
|
+
child_bundles:
|
|
679
|
+
- path: model
|
|
680
|
+
annotation_path: model/data-annotations.json
|
|
681
|
+
```
|
|
682
|
+
|
|
683
|
+
Answers files may also use schema-style aliases such as `subject.path`,
|
|
684
|
+
`subject.kind`, `description.title`, `description.summary`,
|
|
685
|
+
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
686
|
+
and `provenance.params`.
|
|
687
|
+
|
|
592
688
|
When group selectors are provided, the CLI expands them to concrete member paths
|
|
593
689
|
at annotation time. Grouped files are tracked in `subject.produced_files[]` but
|
|
594
690
|
are skipped by the per-file prompt flow, so you do not have to answer the same
|
|
@@ -602,12 +698,49 @@ For provenance inspection and source recovery:
|
|
|
602
698
|
|
|
603
699
|
```bash
|
|
604
700
|
data-annotations provenance match path/to/artifact
|
|
701
|
+
data-annotations provenance chain path/to/artifact
|
|
702
|
+
data-annotations provenance chain path/to/artifact --full-paths
|
|
605
703
|
data-annotations provenance checkout path/to/artifact
|
|
606
704
|
```
|
|
607
705
|
|
|
706
|
+
Command `checkout` downloads the recorded Git remote and checks out the recorded
|
|
707
|
+
commit. It prompts before downloading source code and defaults to No; use
|
|
708
|
+
`--force` when running trusted provenance checkout non-interactively.
|
|
709
|
+
|
|
608
710
|
Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
|
|
609
711
|
directories, prints a verification summary, and suggests the exact `checkout`
|
|
610
712
|
command to run next when Git recovery metadata is available.
|
|
713
|
+
Command `chain` uses the same sidecar discovery, then verifies the artifact,
|
|
714
|
+
recorded input snapshots, and any upstream annotation sidecars reachable from
|
|
715
|
+
those inputs. Its default output shows a compact relative-path tree and lists
|
|
716
|
+
stale, missing, or unverifiable nodes first; use `--full-paths` when you need
|
|
717
|
+
absolute paths.
|
|
718
|
+
|
|
719
|
+
For publication workflows, create a sanitized copy of an annotated artifact tree:
|
|
720
|
+
|
|
721
|
+
```bash
|
|
722
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
723
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle \
|
|
724
|
+
--prefix /private/raw/study-a='$INPUT_ROOT'
|
|
725
|
+
data-annotations publish path/to/run-001 path/to/publish-metadata \
|
|
726
|
+
--annotations-only
|
|
727
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle --dry-run
|
|
728
|
+
```
|
|
729
|
+
|
|
730
|
+
Command `publish` recursively discovers file annotations (`*.annotation.json`) and
|
|
731
|
+
directory annotations (`data-annotations.json`), writes a mirrored publish bundle,
|
|
732
|
+
and regenerates README sidecars from sanitized annotation JSON. Paths under the
|
|
733
|
+
source directory are rewritten to `$ARTIFACT_ROOT/...`; additional `--prefix`
|
|
734
|
+
mappings rewrite other private path roots. Hostname, username, and SLURM job ID
|
|
735
|
+
are redacted by default. Git remote URLs are preserved unless
|
|
736
|
+
`--redact-git-remote` is provided. Strict mode is enabled by default and fails if
|
|
737
|
+
any local absolute path remains after sanitization; use `--no-strict` only after
|
|
738
|
+
reviewing `--dry-run` output.
|
|
739
|
+
|
|
740
|
+
If `data-annotations provenance --help` does not list `chain`, your shell is
|
|
741
|
+
resolving an older installed command. From a source checkout, use
|
|
742
|
+
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
743
|
+
updated source before using the bare `data-annotations` command.
|
|
611
744
|
|
|
612
745
|
### Run With `uvx`
|
|
613
746
|
|
|
@@ -632,7 +765,9 @@ the project environment. You can then run:
|
|
|
632
765
|
uv run data-annotations annotate file path/to/participants.csv
|
|
633
766
|
uv run data-annotations annotate directory path/to/run-001
|
|
634
767
|
uv run data-annotations provenance match path/to/participants.csv
|
|
768
|
+
uv run data-annotations provenance chain path/to/participants.csv
|
|
635
769
|
uv run data-annotations provenance checkout path/to/participants.csv
|
|
770
|
+
uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
636
771
|
```
|
|
637
772
|
|
|
638
773
|
## API Overview
|
|
@@ -684,9 +819,12 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
684
819
|
|
|
685
820
|
- `ProducedFile`
|
|
686
821
|
- `ChildBundle`
|
|
822
|
+
- `InputArtifact`
|
|
687
823
|
- `BaseProvenance`
|
|
688
824
|
- `FileManifest`
|
|
689
825
|
- `DirectoryManifest`
|
|
826
|
+
- `ProvenanceChainNode`
|
|
827
|
+
- `ProvenanceChainReport`
|
|
690
828
|
- `RecoveredSource`
|
|
691
829
|
|
|
692
830
|
### Provenance Functions
|
|
@@ -696,9 +834,18 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
696
834
|
- `write_file_manifest(...)`
|
|
697
835
|
- `write_directory_manifest(...)`
|
|
698
836
|
- `directory_content_digest(...)`
|
|
837
|
+
- `analyze_provenance_chain(...)`
|
|
838
|
+
- `provenance_chain_is_fresh(...)`
|
|
699
839
|
- `artifact_matches_manifest(...)`
|
|
700
840
|
- `checkout_manifest_source(...)`
|
|
701
841
|
|
|
842
|
+
### Publish Functions
|
|
843
|
+
|
|
844
|
+
- `discover_annotation_paths(...)`
|
|
845
|
+
- `sanitize_annotation_document(...)`
|
|
846
|
+
- `sanitize_annotation_path(...)`
|
|
847
|
+
- `publish_directory(...)`
|
|
848
|
+
|
|
702
849
|
## Examples
|
|
703
850
|
|
|
704
851
|
Runnable examples live in `examples/` and mirror the README workflows.
|
|
@@ -713,12 +860,16 @@ uv run python examples/record_file_description.py
|
|
|
713
860
|
uv run python examples/record_directory_description.py
|
|
714
861
|
uv run python examples/annotate_file.py
|
|
715
862
|
uv run python examples/annotate_directory.py
|
|
863
|
+
uv run python examples/annotate_file_answers_cli.py
|
|
716
864
|
uv run python examples/write_file_manifest.py
|
|
717
865
|
uv run python examples/write_directory_manifest.py
|
|
718
866
|
uv run python examples/write_file_description.py
|
|
719
867
|
uv run python examples/write_directory_description.py
|
|
868
|
+
uv run python examples/provenance_chain.py
|
|
869
|
+
uv run python examples/provenance_chain_cli.py
|
|
720
870
|
uv run python examples/recover_provenance.py
|
|
721
871
|
uv run python examples/recover_provenance_cli.py
|
|
872
|
+
uv run python examples/publish_cli.py
|
|
722
873
|
```
|
|
723
874
|
|
|
724
875
|
Each example writes its outputs to a fresh temporary directory and prints the
|
|
@@ -73,6 +73,9 @@ Every annotation document includes provenance with:
|
|
|
73
73
|
- Git commit, branch, dirty state, canonical repository remote, exact tags, and
|
|
74
74
|
`git describe` output when available
|
|
75
75
|
- The current `SLURM_JOB_ID` when available
|
|
76
|
+
- Structured snapshots for recorded local inputs, including file checksums,
|
|
77
|
+
directory content digests, and upstream annotation sidecar references when
|
|
78
|
+
present
|
|
76
79
|
|
|
77
80
|
You can also attach your own parameters, input file paths, and function names.
|
|
78
81
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
@@ -351,6 +354,7 @@ File annotations store:
|
|
|
351
354
|
- `subject.kind`
|
|
352
355
|
- `subject.sha256`
|
|
353
356
|
- `provenance.*`
|
|
357
|
+
- `provenance.input_artifacts[]`
|
|
354
358
|
- `description.title`
|
|
355
359
|
- `description.summary`
|
|
356
360
|
- `description.fields`
|
|
@@ -367,6 +371,7 @@ Directory annotations store:
|
|
|
367
371
|
- `subject.child_bundles[]`
|
|
368
372
|
- `subject.content_digest`
|
|
369
373
|
- `provenance.*`
|
|
374
|
+
- `provenance.input_artifacts[]`
|
|
370
375
|
- `description.title`
|
|
371
376
|
- `description.summary`
|
|
372
377
|
- `description.artifact_groups[]`
|
|
@@ -500,13 +505,16 @@ per call.
|
|
|
500
505
|
## Recovery Helpers
|
|
501
506
|
|
|
502
507
|
Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
|
|
503
|
-
matches an annotation document
|
|
504
|
-
recorded
|
|
508
|
+
matches an annotation document. Use `analyze_provenance_chain(...)` when you also
|
|
509
|
+
want to verify recorded inputs and recursively follow upstream annotation
|
|
510
|
+
sidecars. Use `checkout_manifest_source(...)` to recover the recorded code state
|
|
511
|
+
from Git metadata.
|
|
505
512
|
|
|
506
513
|
```python
|
|
507
514
|
from pathlib import Path
|
|
508
515
|
|
|
509
516
|
from data_annotations.provenance import (
|
|
517
|
+
analyze_provenance_chain,
|
|
510
518
|
artifact_matches_manifest,
|
|
511
519
|
checkout_manifest_source,
|
|
512
520
|
)
|
|
@@ -515,6 +523,8 @@ annotation_path = Path("outputs/participants.csv.annotation.json")
|
|
|
515
523
|
artifact_path = Path("downloads/participants.csv")
|
|
516
524
|
|
|
517
525
|
if artifact_matches_manifest(artifact_path, annotation_path):
|
|
526
|
+
chain = analyze_provenance_chain(artifact_path)
|
|
527
|
+
print(chain.status)
|
|
518
528
|
recovered = checkout_manifest_source(annotation_path)
|
|
519
529
|
print(recovered.checkout_path)
|
|
520
530
|
print(recovered.script_path)
|
|
@@ -560,6 +570,91 @@ These commands prompt for missing details, write `*.annotation.json` or `data-an
|
|
|
560
570
|
and optionally derive README sidecars. Post-hoc records are marked with
|
|
561
571
|
`capture_mode="post_hoc"`.
|
|
562
572
|
|
|
573
|
+
For shell workflows, you can move the prompt answers into a YAML file and run
|
|
574
|
+
the command non-interactively:
|
|
575
|
+
|
|
576
|
+
```bash
|
|
577
|
+
data-annotations annotate file path/to/participants.csv --answers participants.yaml
|
|
578
|
+
data-annotations annotate directory path/to/run-001 --answers run-001.yaml
|
|
579
|
+
data-annotations annotate answers check participants.yaml
|
|
580
|
+
```
|
|
581
|
+
|
|
582
|
+
When `--answers` is provided, `--no-interactive` is the default. Use
|
|
583
|
+
`--interactive` if you want the YAML file to provide defaults and still prompt
|
|
584
|
+
for missing required values. If the YAML file includes `target`, the positional
|
|
585
|
+
target may be omitted; when both are provided, they must resolve to the same
|
|
586
|
+
path. Environment variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded
|
|
587
|
+
inside string values, and validation fails if a referenced variable is not set.
|
|
588
|
+
The `answers check` helper requires `target` so it can infer whether the answers
|
|
589
|
+
describe a file or a directory.
|
|
590
|
+
|
|
591
|
+
File answers can use top-level prompt-style keys:
|
|
592
|
+
|
|
593
|
+
```yaml
|
|
594
|
+
target: path/to/participants.csv
|
|
595
|
+
title: Participant Cohort
|
|
596
|
+
summary: Participant-level cohort assignments.
|
|
597
|
+
kind: dataset
|
|
598
|
+
|
|
599
|
+
inputs:
|
|
600
|
+
- ${DATA_ROOT}/raw/participants.csv
|
|
601
|
+
|
|
602
|
+
params:
|
|
603
|
+
split: validation
|
|
604
|
+
|
|
605
|
+
provenance:
|
|
606
|
+
command: bash scripts/build_participants.sh
|
|
607
|
+
script: scripts/build_participants.sh
|
|
608
|
+
git_sha: deadbeef
|
|
609
|
+
|
|
610
|
+
fields:
|
|
611
|
+
- name: participant_id
|
|
612
|
+
summary: Stable participant identifier.
|
|
613
|
+
data_type: string
|
|
614
|
+
required: true
|
|
615
|
+
nullable: false
|
|
616
|
+
|
|
617
|
+
primary_key:
|
|
618
|
+
- participant_id
|
|
619
|
+
```
|
|
620
|
+
|
|
621
|
+
Directory answers use an explicit inventory. Paths in `artifacts`,
|
|
622
|
+
`artifact_groups.paths`, and `child_bundles` are relative to the annotated
|
|
623
|
+
directory unless absolute:
|
|
624
|
+
|
|
625
|
+
```yaml
|
|
626
|
+
target: path/to/run-001
|
|
627
|
+
title: Processing outputs
|
|
628
|
+
summary: Files produced by the shell processing workflow.
|
|
629
|
+
|
|
630
|
+
provenance:
|
|
631
|
+
command: bash process_from_instrument.sh
|
|
632
|
+
script: process_from_instrument.sh
|
|
633
|
+
|
|
634
|
+
artifacts:
|
|
635
|
+
- path: processed.csv
|
|
636
|
+
kind: dataset
|
|
637
|
+
title: Processed instrument output
|
|
638
|
+
summary: Normalized output from the processing script.
|
|
639
|
+
|
|
640
|
+
artifact_groups:
|
|
641
|
+
- title: Diagnostic plots
|
|
642
|
+
kind: plot
|
|
643
|
+
selector: plots/*.png
|
|
644
|
+
paths:
|
|
645
|
+
- plots/qc-1.png
|
|
646
|
+
- plots/qc-2.png
|
|
647
|
+
|
|
648
|
+
child_bundles:
|
|
649
|
+
- path: model
|
|
650
|
+
annotation_path: model/data-annotations.json
|
|
651
|
+
```
|
|
652
|
+
|
|
653
|
+
Answers files may also use schema-style aliases such as `subject.path`,
|
|
654
|
+
`subject.kind`, `description.title`, `description.summary`,
|
|
655
|
+
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
656
|
+
and `provenance.params`.
|
|
657
|
+
|
|
563
658
|
When group selectors are provided, the CLI expands them to concrete member paths
|
|
564
659
|
at annotation time. Grouped files are tracked in `subject.produced_files[]` but
|
|
565
660
|
are skipped by the per-file prompt flow, so you do not have to answer the same
|
|
@@ -573,12 +668,49 @@ For provenance inspection and source recovery:
|
|
|
573
668
|
|
|
574
669
|
```bash
|
|
575
670
|
data-annotations provenance match path/to/artifact
|
|
671
|
+
data-annotations provenance chain path/to/artifact
|
|
672
|
+
data-annotations provenance chain path/to/artifact --full-paths
|
|
576
673
|
data-annotations provenance checkout path/to/artifact
|
|
577
674
|
```
|
|
578
675
|
|
|
676
|
+
Command `checkout` downloads the recorded Git remote and checks out the recorded
|
|
677
|
+
commit. It prompts before downloading source code and defaults to No; use
|
|
678
|
+
`--force` when running trusted provenance checkout non-interactively.
|
|
679
|
+
|
|
579
680
|
Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
|
|
580
681
|
directories, prints a verification summary, and suggests the exact `checkout`
|
|
581
682
|
command to run next when Git recovery metadata is available.
|
|
683
|
+
Command `chain` uses the same sidecar discovery, then verifies the artifact,
|
|
684
|
+
recorded input snapshots, and any upstream annotation sidecars reachable from
|
|
685
|
+
those inputs. Its default output shows a compact relative-path tree and lists
|
|
686
|
+
stale, missing, or unverifiable nodes first; use `--full-paths` when you need
|
|
687
|
+
absolute paths.
|
|
688
|
+
|
|
689
|
+
For publication workflows, create a sanitized copy of an annotated artifact tree:
|
|
690
|
+
|
|
691
|
+
```bash
|
|
692
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
693
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle \
|
|
694
|
+
--prefix /private/raw/study-a='$INPUT_ROOT'
|
|
695
|
+
data-annotations publish path/to/run-001 path/to/publish-metadata \
|
|
696
|
+
--annotations-only
|
|
697
|
+
data-annotations publish path/to/run-001 path/to/publish-bundle --dry-run
|
|
698
|
+
```
|
|
699
|
+
|
|
700
|
+
Command `publish` recursively discovers file annotations (`*.annotation.json`) and
|
|
701
|
+
directory annotations (`data-annotations.json`), writes a mirrored publish bundle,
|
|
702
|
+
and regenerates README sidecars from sanitized annotation JSON. Paths under the
|
|
703
|
+
source directory are rewritten to `$ARTIFACT_ROOT/...`; additional `--prefix`
|
|
704
|
+
mappings rewrite other private path roots. Hostname, username, and SLURM job ID
|
|
705
|
+
are redacted by default. Git remote URLs are preserved unless
|
|
706
|
+
`--redact-git-remote` is provided. Strict mode is enabled by default and fails if
|
|
707
|
+
any local absolute path remains after sanitization; use `--no-strict` only after
|
|
708
|
+
reviewing `--dry-run` output.
|
|
709
|
+
|
|
710
|
+
If `data-annotations provenance --help` does not list `chain`, your shell is
|
|
711
|
+
resolving an older installed command. From a source checkout, use
|
|
712
|
+
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
713
|
+
updated source before using the bare `data-annotations` command.
|
|
582
714
|
|
|
583
715
|
### Run With `uvx`
|
|
584
716
|
|
|
@@ -603,7 +735,9 @@ the project environment. You can then run:
|
|
|
603
735
|
uv run data-annotations annotate file path/to/participants.csv
|
|
604
736
|
uv run data-annotations annotate directory path/to/run-001
|
|
605
737
|
uv run data-annotations provenance match path/to/participants.csv
|
|
738
|
+
uv run data-annotations provenance chain path/to/participants.csv
|
|
606
739
|
uv run data-annotations provenance checkout path/to/participants.csv
|
|
740
|
+
uv run data-annotations publish path/to/run-001 path/to/publish-bundle
|
|
607
741
|
```
|
|
608
742
|
|
|
609
743
|
## API Overview
|
|
@@ -655,9 +789,12 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
655
789
|
|
|
656
790
|
- `ProducedFile`
|
|
657
791
|
- `ChildBundle`
|
|
792
|
+
- `InputArtifact`
|
|
658
793
|
- `BaseProvenance`
|
|
659
794
|
- `FileManifest`
|
|
660
795
|
- `DirectoryManifest`
|
|
796
|
+
- `ProvenanceChainNode`
|
|
797
|
+
- `ProvenanceChainReport`
|
|
661
798
|
- `RecoveredSource`
|
|
662
799
|
|
|
663
800
|
### Provenance Functions
|
|
@@ -667,9 +804,18 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
667
804
|
- `write_file_manifest(...)`
|
|
668
805
|
- `write_directory_manifest(...)`
|
|
669
806
|
- `directory_content_digest(...)`
|
|
807
|
+
- `analyze_provenance_chain(...)`
|
|
808
|
+
- `provenance_chain_is_fresh(...)`
|
|
670
809
|
- `artifact_matches_manifest(...)`
|
|
671
810
|
- `checkout_manifest_source(...)`
|
|
672
811
|
|
|
812
|
+
### Publish Functions
|
|
813
|
+
|
|
814
|
+
- `discover_annotation_paths(...)`
|
|
815
|
+
- `sanitize_annotation_document(...)`
|
|
816
|
+
- `sanitize_annotation_path(...)`
|
|
817
|
+
- `publish_directory(...)`
|
|
818
|
+
|
|
673
819
|
## Examples
|
|
674
820
|
|
|
675
821
|
Runnable examples live in `examples/` and mirror the README workflows.
|
|
@@ -684,12 +830,16 @@ uv run python examples/record_file_description.py
|
|
|
684
830
|
uv run python examples/record_directory_description.py
|
|
685
831
|
uv run python examples/annotate_file.py
|
|
686
832
|
uv run python examples/annotate_directory.py
|
|
833
|
+
uv run python examples/annotate_file_answers_cli.py
|
|
687
834
|
uv run python examples/write_file_manifest.py
|
|
688
835
|
uv run python examples/write_directory_manifest.py
|
|
689
836
|
uv run python examples/write_file_description.py
|
|
690
837
|
uv run python examples/write_directory_description.py
|
|
838
|
+
uv run python examples/provenance_chain.py
|
|
839
|
+
uv run python examples/provenance_chain_cli.py
|
|
691
840
|
uv run python examples/recover_provenance.py
|
|
692
841
|
uv run python examples/recover_provenance_cli.py
|
|
842
|
+
uv run python examples/publish_cli.py
|
|
693
843
|
```
|
|
694
844
|
|
|
695
845
|
Each example writes its outputs to a fresh temporary directory and prints the
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data-annotations"
|
|
3
|
-
version = "2.2.0"
|
|
3
|
+
version = "2.4.0"
|
|
4
4
|
description = "Annotate generated data artifacts"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
|
|
|
30
30
|
Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
|
|
31
31
|
|
|
32
32
|
[project.optional-dependencies]
|
|
33
|
-
cli = ["questionary>=2.1.1", "typer>=0.16.0"]
|
|
33
|
+
cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
|
|
34
34
|
|
|
35
35
|
[project.scripts]
|
|
36
36
|
data-annotations = "data_annotations.cli:main"
|
{data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/annotations/models.py
RENAMED
|
@@ -22,14 +22,14 @@ class DirectoryArtifactSubject(BaseModel):
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class FileAnnotationDocument(BaseModel):
|
|
25
|
-
annotation_version: Literal["
|
|
25
|
+
annotation_version: Literal["5"] = "5"
|
|
26
26
|
subject: FileArtifactSubject
|
|
27
27
|
provenance: BaseProvenance
|
|
28
28
|
description: FileDescription
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class DirectoryAnnotationDocument(BaseModel):
|
|
32
|
-
annotation_version: Literal["
|
|
32
|
+
annotation_version: Literal["5"] = "5"
|
|
33
33
|
subject: DirectoryArtifactSubject
|
|
34
34
|
provenance: BaseProvenance
|
|
35
35
|
description: DirectoryDescription
|
{data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/annotations/writers.py
RENAMED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
1
2
|
from pathlib import Path
|
|
2
3
|
from typing import Any, Callable
|
|
3
4
|
|
|
@@ -154,7 +155,7 @@ def _build_file_annotation_document(
|
|
|
154
155
|
generation_context: dict[str, Any] | None = None,
|
|
155
156
|
artifact_kind: ArtifactKind = "other",
|
|
156
157
|
params: dict[str, Any] | None = None,
|
|
157
|
-
inputs:
|
|
158
|
+
inputs: Sequence[str | Path] | None = None,
|
|
158
159
|
function: Callable[..., Any] | None = None,
|
|
159
160
|
capture_mode: str = "runtime",
|
|
160
161
|
provenance_overrides: dict[str, Any] | None = None,
|
|
@@ -205,7 +206,7 @@ def _build_directory_annotation_document(
|
|
|
205
206
|
acquisition_context: dict[str, Any] | None = None,
|
|
206
207
|
generation_context: dict[str, Any] | None = None,
|
|
207
208
|
params: dict[str, Any] | None = None,
|
|
208
|
-
inputs:
|
|
209
|
+
inputs: Sequence[str | Path] | None = None,
|
|
209
210
|
function: Callable[..., Any] | None = None,
|
|
210
211
|
capture_mode: str = "runtime",
|
|
211
212
|
provenance_overrides: dict[str, Any] | None = None,
|
|
@@ -285,7 +286,7 @@ def write_file_annotation(
|
|
|
285
286
|
generation_context: dict[str, Any] | None = None,
|
|
286
287
|
artifact_kind: ArtifactKind = "other",
|
|
287
288
|
params: dict[str, Any] | None = None,
|
|
288
|
-
inputs:
|
|
289
|
+
inputs: Sequence[str | Path] | None = None,
|
|
289
290
|
function: Callable[..., Any] | None = None,
|
|
290
291
|
capture_mode: str = "runtime",
|
|
291
292
|
provenance_overrides: dict[str, Any] | None = None,
|
|
@@ -324,7 +325,7 @@ def write_directory_annotation(
|
|
|
324
325
|
acquisition_context: dict[str, Any] | None = None,
|
|
325
326
|
generation_context: dict[str, Any] | None = None,
|
|
326
327
|
params: dict[str, Any] | None = None,
|
|
327
|
-
inputs:
|
|
328
|
+
inputs: Sequence[str | Path] | None = None,
|
|
328
329
|
function: Callable[..., Any] | None = None,
|
|
329
330
|
capture_mode: str = "runtime",
|
|
330
331
|
provenance_overrides: dict[str, Any] | None = None,
|
|
@@ -363,7 +364,7 @@ def annotate_file(
|
|
|
363
364
|
generation_context: dict[str, Any] | None = None,
|
|
364
365
|
artifact_kind: ArtifactKind = "other",
|
|
365
366
|
params: dict[str, Any] | None = None,
|
|
366
|
-
inputs:
|
|
367
|
+
inputs: Sequence[str | Path] | None = None,
|
|
367
368
|
function: Callable[..., Any] | None = None,
|
|
368
369
|
write_readme: bool = True,
|
|
369
370
|
write_schema: bool | None = None,
|
|
@@ -418,7 +419,7 @@ def annotate_directory(
|
|
|
418
419
|
acquisition_context: dict[str, Any] | None = None,
|
|
419
420
|
generation_context: dict[str, Any] | None = None,
|
|
420
421
|
params: dict[str, Any] | None = None,
|
|
421
|
-
inputs:
|
|
422
|
+
inputs: Sequence[str | Path] | None = None,
|
|
422
423
|
function: Callable[..., Any] | None = None,
|
|
423
424
|
write_readme: bool = True,
|
|
424
425
|
write_schema: bool | None = None,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Any
|
|
2
2
|
|
|
3
3
|
_CLI_IMPORT_ERROR: ModuleNotFoundError | None = None
|
|
4
|
-
_CLI_OPTIONAL_DEPENDENCIES = {"questionary", "typer"}
|
|
4
|
+
_CLI_OPTIONAL_DEPENDENCIES = {"questionary", "typer", "yaml"}
|
|
5
5
|
app: Any = None
|
|
6
6
|
|
|
7
7
|
try:
|
|
@@ -9,6 +9,7 @@ try:
|
|
|
9
9
|
|
|
10
10
|
from data_annotations.cli_app.annotate import annotate_app
|
|
11
11
|
from data_annotations.cli_app.provenance_commands import provenance_app
|
|
12
|
+
from data_annotations.cli_app.publish import publish_command
|
|
12
13
|
except ModuleNotFoundError as exc:
|
|
13
14
|
if exc.name not in _CLI_OPTIONAL_DEPENDENCIES:
|
|
14
15
|
raise
|
|
@@ -17,6 +18,7 @@ else:
|
|
|
17
18
|
app = typer.Typer(no_args_is_help=True)
|
|
18
19
|
app.add_typer(annotate_app, name="annotate")
|
|
19
20
|
app.add_typer(provenance_app, name="provenance")
|
|
21
|
+
app.command("publish")(publish_command)
|
|
20
22
|
|
|
21
23
|
|
|
22
24
|
def main() -> None:
|