data-annotations 2.2.0__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {data_annotations-2.2.0 → data_annotations-2.4.0}/PKG-INFO +154 -3
  2. {data_annotations-2.2.0 → data_annotations-2.4.0}/README.md +152 -2
  3. {data_annotations-2.2.0 → data_annotations-2.4.0}/pyproject.toml +2 -2
  4. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/annotations/models.py +2 -2
  5. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/annotations/writers.py +7 -6
  6. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/cli.py +3 -1
  7. data_annotations-2.4.0/src/data_annotations/cli_app/annotate.py +939 -0
  8. data_annotations-2.4.0/src/data_annotations/cli_app/answers.py +403 -0
  9. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/cli_app/prompts.py +21 -6
  10. data_annotations-2.4.0/src/data_annotations/cli_app/provenance_commands.py +346 -0
  11. data_annotations-2.4.0/src/data_annotations/cli_app/publish.py +92 -0
  12. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/__init__.py +14 -1
  13. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/models.py +10 -0
  14. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/recovery.py +326 -4
  15. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/writers.py +119 -10
  16. data_annotations-2.4.0/src/data_annotations/publish.py +532 -0
  17. data_annotations-2.2.0/src/data_annotations/cli_app/annotate.py +0 -483
  18. data_annotations-2.2.0/src/data_annotations/cli_app/provenance_commands.py +0 -107
  19. {data_annotations-2.2.0 → data_annotations-2.4.0}/LICENSE +0 -0
  20. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/__init__.py +0 -0
  21. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/_decorators.py +0 -0
  22. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/annotations/__init__.py +0 -0
  23. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/annotations/decorators.py +0 -0
  24. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/cli_app/__init__.py +0 -0
  25. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/cli_app/common.py +0 -0
  26. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/description/__init__.py +0 -0
  27. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/description/decorators.py +0 -0
  28. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/description/models.py +0 -0
  29. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/description/writers.py +0 -0
  30. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/decorators.py +0 -0
  31. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/git.py +0 -0
  32. {data_annotations-2.2.0 → data_annotations-2.4.0}/src/data_annotations/provenance/runtime.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-annotations
3
- Version: 2.2.0
3
+ Version: 2.4.0
4
4
  Summary: Annotate generated data artifacts
5
5
  Keywords: annotations,data,metadata,provenance,reproducibility
6
6
  Author: Rodrigo C. G. Pena
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
18
18
  Classifier: Topic :: Scientific/Engineering
19
19
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
20
  Requires-Dist: pydantic>=2.13.1
21
+ Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
21
22
  Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
22
23
  Requires-Dist: typer>=0.16.0 ; extra == 'cli'
23
24
  Requires-Python: >=3.12
@@ -102,6 +103,9 @@ Every annotation document includes provenance with:
102
103
  - Git commit, branch, dirty state, canonical repository remote, exact tags, and
103
104
  `git describe` output when available
104
105
  - The current `SLURM_JOB_ID` when available
106
+ - Structured snapshots for recorded local inputs, including file checksums,
107
+ directory content digests, and upstream annotation sidecar references when
108
+ present
105
109
 
106
110
  You can also attach your own parameters, input file paths, and function names.
107
111
  Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
@@ -380,6 +384,7 @@ File annotations store:
380
384
  - `subject.kind`
381
385
  - `subject.sha256`
382
386
  - `provenance.*`
387
+ - `provenance.input_artifacts[]`
383
388
  - `description.title`
384
389
  - `description.summary`
385
390
  - `description.fields`
@@ -396,6 +401,7 @@ Directory annotations store:
396
401
  - `subject.child_bundles[]`
397
402
  - `subject.content_digest`
398
403
  - `provenance.*`
404
+ - `provenance.input_artifacts[]`
399
405
  - `description.title`
400
406
  - `description.summary`
401
407
  - `description.artifact_groups[]`
@@ -529,13 +535,16 @@ per call.
529
535
  ## Recovery Helpers
530
536
 
531
537
  Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
532
- matches an annotation document, and `checkout_manifest_source(...)` to recover the
533
- recorded code state from Git metadata.
538
+ matches an annotation document. Use `analyze_provenance_chain(...)` when you also
539
+ want to verify recorded inputs and recursively follow upstream annotation
540
+ sidecars. Use `checkout_manifest_source(...)` to recover the recorded code state
541
+ from Git metadata.
534
542
 
535
543
  ```python
536
544
  from pathlib import Path
537
545
 
538
546
  from data_annotations.provenance import (
547
+ analyze_provenance_chain,
539
548
  artifact_matches_manifest,
540
549
  checkout_manifest_source,
541
550
  )
@@ -544,6 +553,8 @@ annotation_path = Path("outputs/participants.csv.annotation.json")
544
553
  artifact_path = Path("downloads/participants.csv")
545
554
 
546
555
  if artifact_matches_manifest(artifact_path, annotation_path):
556
+ chain = analyze_provenance_chain(artifact_path)
557
+ print(chain.status)
547
558
  recovered = checkout_manifest_source(annotation_path)
548
559
  print(recovered.checkout_path)
549
560
  print(recovered.script_path)
@@ -589,6 +600,91 @@ These commands prompt for missing details, write `*.annotation.json` or `data-an
589
600
  and optionally derive README sidecars. Post-hoc records are marked with
590
601
  `capture_mode="post_hoc"`.
591
602
 
603
+ For shell workflows, you can move the prompt answers into a YAML file and run
604
+ the command non-interactively:
605
+
606
+ ```bash
607
+ data-annotations annotate file path/to/participants.csv --answers participants.yaml
608
+ data-annotations annotate directory path/to/run-001 --answers run-001.yaml
609
+ data-annotations annotate answers check participants.yaml
610
+ ```
611
+
612
+ When `--answers` is provided, `--no-interactive` is the default. Use
613
+ `--interactive` if you want the YAML file to provide defaults and still prompt
614
+ for missing required values. If the YAML file includes `target`, the positional
615
+ target may be omitted; when both are provided, they must resolve to the same
616
+ path. Environment variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded
617
+ inside string values, and validation fails if a referenced variable is not set.
618
+ The `answers check` helper requires `target` so it can infer whether the answers
619
+ describe a file or a directory.
620
+
621
+ File answers can use top-level prompt-style keys:
622
+
623
+ ```yaml
624
+ target: path/to/participants.csv
625
+ title: Participant Cohort
626
+ summary: Participant-level cohort assignments.
627
+ kind: dataset
628
+
629
+ inputs:
630
+ - ${DATA_ROOT}/raw/participants.csv
631
+
632
+ params:
633
+ split: validation
634
+
635
+ provenance:
636
+ command: bash scripts/build_participants.sh
637
+ script: scripts/build_participants.sh
638
+ git_sha: deadbeef
639
+
640
+ fields:
641
+ - name: participant_id
642
+ summary: Stable participant identifier.
643
+ data_type: string
644
+ required: true
645
+ nullable: false
646
+
647
+ primary_key:
648
+ - participant_id
649
+ ```
650
+
651
+ Directory answers use an explicit inventory. Paths in `artifacts`,
652
+ `artifact_groups.paths`, and `child_bundles` are relative to the annotated
653
+ directory unless absolute:
654
+
655
+ ```yaml
656
+ target: path/to/run-001
657
+ title: Processing outputs
658
+ summary: Files produced by the shell processing workflow.
659
+
660
+ provenance:
661
+ command: bash process_from_instrument.sh
662
+ script: process_from_instrument.sh
663
+
664
+ artifacts:
665
+ - path: processed.csv
666
+ kind: dataset
667
+ title: Processed instrument output
668
+ summary: Normalized output from the processing script.
669
+
670
+ artifact_groups:
671
+ - title: Diagnostic plots
672
+ kind: plot
673
+ selector: plots/*.png
674
+ paths:
675
+ - plots/qc-1.png
676
+ - plots/qc-2.png
677
+
678
+ child_bundles:
679
+ - path: model
680
+ annotation_path: model/data-annotations.json
681
+ ```
682
+
683
+ Answers files may also use schema-style aliases such as `subject.path`,
684
+ `subject.kind`, `description.title`, `description.summary`,
685
+ `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
686
+ and `provenance.params`.
687
+
592
688
  When group selectors are provided, the CLI expands them to concrete member paths
593
689
  at annotation time. Grouped files are tracked in `subject.produced_files[]` but
594
690
  are skipped by the per-file prompt flow, so you do not have to answer the same
@@ -602,12 +698,49 @@ For provenance inspection and source recovery:
602
698
 
603
699
  ```bash
604
700
  data-annotations provenance match path/to/artifact
701
+ data-annotations provenance chain path/to/artifact
702
+ data-annotations provenance chain path/to/artifact --full-paths
605
703
  data-annotations provenance checkout path/to/artifact
606
704
  ```
607
705
 
706
+ Command `checkout` clones the recorded Git remote and checks out the recorded
707
+ commit. It prompts before downloading source code and defaults to No; use
708
+ `--force` when running trusted provenance checkout non-interactively.
709
+
608
710
  Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
609
711
  directories, prints a verification summary, and suggests the exact `checkout`
610
712
  command to run next when Git recovery metadata is available.
713
+ Command `chain` uses the same sidecar discovery, then verifies the artifact,
714
+ recorded input snapshots, and any upstream annotation sidecars reachable from
715
+ those inputs. Its default output shows a compact relative-path tree and lists
716
+ stale, missing, or unverifiable nodes first; use `--full-paths` when you need
717
+ absolute paths.
718
+
719
+ For publication workflows, create a sanitized copy of an annotated artifact tree:
720
+
721
+ ```bash
722
+ data-annotations publish path/to/run-001 path/to/publish-bundle
723
+ data-annotations publish path/to/run-001 path/to/publish-bundle \
724
+ --prefix /private/raw/study-a='$INPUT_ROOT'
725
+ data-annotations publish path/to/run-001 path/to/publish-metadata \
726
+ --annotations-only
727
+ data-annotations publish path/to/run-001 path/to/publish-bundle --dry-run
728
+ ```
729
+
730
+ Command `publish` recursively discovers file annotations (`*.annotation.json`) and
731
+ directory annotations (`data-annotations.json`), writes a mirrored publish bundle,
732
+ and regenerates README sidecars from sanitized annotation JSON. Paths under the
733
+ source directory are rewritten to `$ARTIFACT_ROOT/...`; additional `--prefix`
734
+ mappings rewrite other private path roots. Hostname, username, and SLURM job ID
735
+ are redacted by default. Git remote URLs are preserved unless
736
+ `--redact-git-remote` is provided. Strict mode is enabled by default and fails if
737
+ any local absolute path remains after sanitization; use `--no-strict` only after
738
+ reviewing `--dry-run` output.
739
+
740
+ If `data-annotations provenance --help` does not list `chain`, your shell is
741
+ resolving an older installed command. From a source checkout, use
742
+ `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
743
+ updated source before using the bare `data-annotations` command.
611
744
 
612
745
  ### Run With `uvx`
613
746
 
@@ -632,7 +765,9 @@ the project environment. You can then run:
632
765
  uv run data-annotations annotate file path/to/participants.csv
633
766
  uv run data-annotations annotate directory path/to/run-001
634
767
  uv run data-annotations provenance match path/to/participants.csv
768
+ uv run data-annotations provenance chain path/to/participants.csv
635
769
  uv run data-annotations provenance checkout path/to/participants.csv
770
+ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
636
771
  ```
637
772
 
638
773
  ## API Overview
@@ -684,9 +819,12 @@ uv run data-annotations provenance checkout path/to/participants.csv
684
819
 
685
820
  - `ProducedFile`
686
821
  - `ChildBundle`
822
+ - `InputArtifact`
687
823
  - `BaseProvenance`
688
824
  - `FileManifest`
689
825
  - `DirectoryManifest`
826
+ - `ProvenanceChainNode`
827
+ - `ProvenanceChainReport`
690
828
  - `RecoveredSource`
691
829
 
692
830
  ### Provenance Functions
@@ -696,9 +834,18 @@ uv run data-annotations provenance checkout path/to/participants.csv
696
834
  - `write_file_manifest(...)`
697
835
  - `write_directory_manifest(...)`
698
836
  - `directory_content_digest(...)`
837
+ - `analyze_provenance_chain(...)`
838
+ - `provenance_chain_is_fresh(...)`
699
839
  - `artifact_matches_manifest(...)`
700
840
  - `checkout_manifest_source(...)`
701
841
 
842
+ ### Publish Functions
843
+
844
+ - `discover_annotation_paths(...)`
845
+ - `sanitize_annotation_document(...)`
846
+ - `sanitize_annotation_path(...)`
847
+ - `publish_directory(...)`
848
+
702
849
  ## Examples
703
850
 
704
851
  Runnable examples live in `examples/` and mirror the README workflows.
@@ -713,12 +860,16 @@ uv run python examples/record_file_description.py
713
860
  uv run python examples/record_directory_description.py
714
861
  uv run python examples/annotate_file.py
715
862
  uv run python examples/annotate_directory.py
863
+ uv run python examples/annotate_file_answers_cli.py
716
864
  uv run python examples/write_file_manifest.py
717
865
  uv run python examples/write_directory_manifest.py
718
866
  uv run python examples/write_file_description.py
719
867
  uv run python examples/write_directory_description.py
868
+ uv run python examples/provenance_chain.py
869
+ uv run python examples/provenance_chain_cli.py
720
870
  uv run python examples/recover_provenance.py
721
871
  uv run python examples/recover_provenance_cli.py
872
+ uv run python examples/publish_cli.py
722
873
  ```
723
874
 
724
875
  Each example writes its outputs to a fresh temporary directory and prints the
@@ -73,6 +73,9 @@ Every annotation document includes provenance with:
73
73
  - Git commit, branch, dirty state, canonical repository remote, exact tags, and
74
74
  `git describe` output when available
75
75
  - The current `SLURM_JOB_ID` when available
76
+ - Structured snapshots for recorded local inputs, including file checksums,
77
+ directory content digests, and upstream annotation sidecar references when
78
+ present
76
79
 
77
80
  You can also attach your own parameters, input file paths, and function names.
78
81
  Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
@@ -351,6 +354,7 @@ File annotations store:
351
354
  - `subject.kind`
352
355
  - `subject.sha256`
353
356
  - `provenance.*`
357
+ - `provenance.input_artifacts[]`
354
358
  - `description.title`
355
359
  - `description.summary`
356
360
  - `description.fields`
@@ -367,6 +371,7 @@ Directory annotations store:
367
371
  - `subject.child_bundles[]`
368
372
  - `subject.content_digest`
369
373
  - `provenance.*`
374
+ - `provenance.input_artifacts[]`
370
375
  - `description.title`
371
376
  - `description.summary`
372
377
  - `description.artifact_groups[]`
@@ -500,13 +505,16 @@ per call.
500
505
  ## Recovery Helpers
501
506
 
502
507
  Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
503
- matches an annotation document, and `checkout_manifest_source(...)` to recover the
504
- recorded code state from Git metadata.
508
+ matches an annotation document. Use `analyze_provenance_chain(...)` when you also
509
+ want to verify recorded inputs and recursively follow upstream annotation
510
+ sidecars. Use `checkout_manifest_source(...)` to recover the recorded code state
511
+ from Git metadata.
505
512
 
506
513
  ```python
507
514
  from pathlib import Path
508
515
 
509
516
  from data_annotations.provenance import (
517
+ analyze_provenance_chain,
510
518
  artifact_matches_manifest,
511
519
  checkout_manifest_source,
512
520
  )
@@ -515,6 +523,8 @@ annotation_path = Path("outputs/participants.csv.annotation.json")
515
523
  artifact_path = Path("downloads/participants.csv")
516
524
 
517
525
  if artifact_matches_manifest(artifact_path, annotation_path):
526
+ chain = analyze_provenance_chain(artifact_path)
527
+ print(chain.status)
518
528
  recovered = checkout_manifest_source(annotation_path)
519
529
  print(recovered.checkout_path)
520
530
  print(recovered.script_path)
@@ -560,6 +570,91 @@ These commands prompt for missing details, write `*.annotation.json` or `data-an
560
570
  and optionally derive README sidecars. Post-hoc records are marked with
561
571
  `capture_mode="post_hoc"`.
562
572
 
573
+ For shell workflows, you can move the prompt answers into a YAML file and run
574
+ the command non-interactively:
575
+
576
+ ```bash
577
+ data-annotations annotate file path/to/participants.csv --answers participants.yaml
578
+ data-annotations annotate directory path/to/run-001 --answers run-001.yaml
579
+ data-annotations annotate answers check participants.yaml
580
+ ```
581
+
582
+ When `--answers` is provided, `--no-interactive` is the default. Use
583
+ `--interactive` if you want the YAML file to provide defaults and still prompt
584
+ for missing required values. If the YAML file includes `target`, the positional
585
+ target may be omitted; when both are provided, they must resolve to the same
586
+ path. Environment variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded
587
+ inside string values, and validation fails if a referenced variable is not set.
588
+ The `answers check` helper requires `target` so it can infer whether the answers
589
+ describe a file or a directory.
590
+
591
+ File answers can use top-level prompt-style keys:
592
+
593
+ ```yaml
594
+ target: path/to/participants.csv
595
+ title: Participant Cohort
596
+ summary: Participant-level cohort assignments.
597
+ kind: dataset
598
+
599
+ inputs:
600
+ - ${DATA_ROOT}/raw/participants.csv
601
+
602
+ params:
603
+ split: validation
604
+
605
+ provenance:
606
+ command: bash scripts/build_participants.sh
607
+ script: scripts/build_participants.sh
608
+ git_sha: deadbeef
609
+
610
+ fields:
611
+ - name: participant_id
612
+ summary: Stable participant identifier.
613
+ data_type: string
614
+ required: true
615
+ nullable: false
616
+
617
+ primary_key:
618
+ - participant_id
619
+ ```
620
+
621
+ Directory answers use an explicit inventory. Paths in `artifacts`,
622
+ `artifact_groups.paths`, and `child_bundles` are relative to the annotated
623
+ directory unless absolute:
624
+
625
+ ```yaml
626
+ target: path/to/run-001
627
+ title: Processing outputs
628
+ summary: Files produced by the shell processing workflow.
629
+
630
+ provenance:
631
+ command: bash process_from_instrument.sh
632
+ script: process_from_instrument.sh
633
+
634
+ artifacts:
635
+ - path: processed.csv
636
+ kind: dataset
637
+ title: Processed instrument output
638
+ summary: Normalized output from the processing script.
639
+
640
+ artifact_groups:
641
+ - title: Diagnostic plots
642
+ kind: plot
643
+ selector: plots/*.png
644
+ paths:
645
+ - plots/qc-1.png
646
+ - plots/qc-2.png
647
+
648
+ child_bundles:
649
+ - path: model
650
+ annotation_path: model/data-annotations.json
651
+ ```
652
+
653
+ Answers files may also use schema-style aliases such as `subject.path`,
654
+ `subject.kind`, `description.title`, `description.summary`,
655
+ `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
656
+ and `provenance.params`.
657
+
563
658
  When group selectors are provided, the CLI expands them to concrete member paths
564
659
  at annotation time. Grouped files are tracked in `subject.produced_files[]` but
565
660
  are skipped by the per-file prompt flow, so you do not have to answer the same
@@ -573,12 +668,49 @@ For provenance inspection and source recovery:
573
668
 
574
669
  ```bash
575
670
  data-annotations provenance match path/to/artifact
671
+ data-annotations provenance chain path/to/artifact
672
+ data-annotations provenance chain path/to/artifact --full-paths
576
673
  data-annotations provenance checkout path/to/artifact
577
674
  ```
578
675
 
676
+ Command `checkout` clones the recorded Git remote and checks out the recorded
677
+ commit. It prompts before downloading source code and defaults to No; use
678
+ `--force` when running trusted provenance checkout non-interactively.
679
+
579
680
  Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
580
681
  directories, prints a verification summary, and suggests the exact `checkout`
581
682
  command to run next when Git recovery metadata is available.
683
+ Command `chain` uses the same sidecar discovery, then verifies the artifact,
684
+ recorded input snapshots, and any upstream annotation sidecars reachable from
685
+ those inputs. Its default output shows a compact relative-path tree and lists
686
+ stale, missing, or unverifiable nodes first; use `--full-paths` when you need
687
+ absolute paths.
688
+
689
+ For publication workflows, create a sanitized copy of an annotated artifact tree:
690
+
691
+ ```bash
692
+ data-annotations publish path/to/run-001 path/to/publish-bundle
693
+ data-annotations publish path/to/run-001 path/to/publish-bundle \
694
+ --prefix /private/raw/study-a='$INPUT_ROOT'
695
+ data-annotations publish path/to/run-001 path/to/publish-metadata \
696
+ --annotations-only
697
+ data-annotations publish path/to/run-001 path/to/publish-bundle --dry-run
698
+ ```
699
+
700
+ Command `publish` recursively discovers file annotations (`*.annotation.json`) and
701
+ directory annotations (`data-annotations.json`), writes a mirrored publish bundle,
702
+ and regenerates README sidecars from sanitized annotation JSON. Paths under the
703
+ source directory are rewritten to `$ARTIFACT_ROOT/...`; additional `--prefix`
704
+ mappings rewrite other private path roots. Hostname, username, and SLURM job ID
705
+ are redacted by default. Git remote URLs are preserved unless
706
+ `--redact-git-remote` is provided. Strict mode is enabled by default and fails if
707
+ any local absolute path remains after sanitization; use `--no-strict` only after
708
+ reviewing `--dry-run` output.
709
+
710
+ If `data-annotations provenance --help` does not list `chain`, your shell is
711
+ resolving an older installed command. From a source checkout, use
712
+ `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
713
+ updated source before using the bare `data-annotations` command.
582
714
 
583
715
  ### Run With `uvx`
584
716
 
@@ -603,7 +735,9 @@ the project environment. You can then run:
603
735
  uv run data-annotations annotate file path/to/participants.csv
604
736
  uv run data-annotations annotate directory path/to/run-001
605
737
  uv run data-annotations provenance match path/to/participants.csv
738
+ uv run data-annotations provenance chain path/to/participants.csv
606
739
  uv run data-annotations provenance checkout path/to/participants.csv
740
+ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
607
741
  ```
608
742
 
609
743
  ## API Overview
@@ -655,9 +789,12 @@ uv run data-annotations provenance checkout path/to/participants.csv
655
789
 
656
790
  - `ProducedFile`
657
791
  - `ChildBundle`
792
+ - `InputArtifact`
658
793
  - `BaseProvenance`
659
794
  - `FileManifest`
660
795
  - `DirectoryManifest`
796
+ - `ProvenanceChainNode`
797
+ - `ProvenanceChainReport`
661
798
  - `RecoveredSource`
662
799
 
663
800
  ### Provenance Functions
@@ -667,9 +804,18 @@ uv run data-annotations provenance checkout path/to/participants.csv
667
804
  - `write_file_manifest(...)`
668
805
  - `write_directory_manifest(...)`
669
806
  - `directory_content_digest(...)`
807
+ - `analyze_provenance_chain(...)`
808
+ - `provenance_chain_is_fresh(...)`
670
809
  - `artifact_matches_manifest(...)`
671
810
  - `checkout_manifest_source(...)`
672
811
 
812
+ ### Publish Functions
813
+
814
+ - `discover_annotation_paths(...)`
815
+ - `sanitize_annotation_document(...)`
816
+ - `sanitize_annotation_path(...)`
817
+ - `publish_directory(...)`
818
+
673
819
  ## Examples
674
820
 
675
821
  Runnable examples live in `examples/` and mirror the README workflows.
@@ -684,12 +830,16 @@ uv run python examples/record_file_description.py
684
830
  uv run python examples/record_directory_description.py
685
831
  uv run python examples/annotate_file.py
686
832
  uv run python examples/annotate_directory.py
833
+ uv run python examples/annotate_file_answers_cli.py
687
834
  uv run python examples/write_file_manifest.py
688
835
  uv run python examples/write_directory_manifest.py
689
836
  uv run python examples/write_file_description.py
690
837
  uv run python examples/write_directory_description.py
838
+ uv run python examples/provenance_chain.py
839
+ uv run python examples/provenance_chain_cli.py
691
840
  uv run python examples/recover_provenance.py
692
841
  uv run python examples/recover_provenance_cli.py
842
+ uv run python examples/publish_cli.py
693
843
  ```
694
844
 
695
845
  Each example writes its outputs to a fresh temporary directory and prints the
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data-annotations"
3
- version = "2.2.0"
3
+ version = "2.4.0"
4
4
  description = "Annotate generated data artifacts"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
30
30
  Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
31
31
 
32
32
  [project.optional-dependencies]
33
- cli = ["questionary>=2.1.1", "typer>=0.16.0"]
33
+ cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
34
34
 
35
35
  [project.scripts]
36
36
  data-annotations = "data_annotations.cli:main"
@@ -22,14 +22,14 @@ class DirectoryArtifactSubject(BaseModel):
22
22
 
23
23
 
24
24
  class FileAnnotationDocument(BaseModel):
25
- annotation_version: Literal["4"] = "4"
25
+ annotation_version: Literal["5"] = "5"
26
26
  subject: FileArtifactSubject
27
27
  provenance: BaseProvenance
28
28
  description: FileDescription
29
29
 
30
30
 
31
31
  class DirectoryAnnotationDocument(BaseModel):
32
- annotation_version: Literal["4"] = "4"
32
+ annotation_version: Literal["5"] = "5"
33
33
  subject: DirectoryArtifactSubject
34
34
  provenance: BaseProvenance
35
35
  description: DirectoryDescription
@@ -1,3 +1,4 @@
1
+ from collections.abc import Sequence
1
2
  from pathlib import Path
2
3
  from typing import Any, Callable
3
4
 
@@ -154,7 +155,7 @@ def _build_file_annotation_document(
154
155
  generation_context: dict[str, Any] | None = None,
155
156
  artifact_kind: ArtifactKind = "other",
156
157
  params: dict[str, Any] | None = None,
157
- inputs: list[str] | None = None,
158
+ inputs: Sequence[str | Path] | None = None,
158
159
  function: Callable[..., Any] | None = None,
159
160
  capture_mode: str = "runtime",
160
161
  provenance_overrides: dict[str, Any] | None = None,
@@ -205,7 +206,7 @@ def _build_directory_annotation_document(
205
206
  acquisition_context: dict[str, Any] | None = None,
206
207
  generation_context: dict[str, Any] | None = None,
207
208
  params: dict[str, Any] | None = None,
208
- inputs: list[str] | None = None,
209
+ inputs: Sequence[str | Path] | None = None,
209
210
  function: Callable[..., Any] | None = None,
210
211
  capture_mode: str = "runtime",
211
212
  provenance_overrides: dict[str, Any] | None = None,
@@ -285,7 +286,7 @@ def write_file_annotation(
285
286
  generation_context: dict[str, Any] | None = None,
286
287
  artifact_kind: ArtifactKind = "other",
287
288
  params: dict[str, Any] | None = None,
288
- inputs: list[str] | None = None,
289
+ inputs: Sequence[str | Path] | None = None,
289
290
  function: Callable[..., Any] | None = None,
290
291
  capture_mode: str = "runtime",
291
292
  provenance_overrides: dict[str, Any] | None = None,
@@ -324,7 +325,7 @@ def write_directory_annotation(
324
325
  acquisition_context: dict[str, Any] | None = None,
325
326
  generation_context: dict[str, Any] | None = None,
326
327
  params: dict[str, Any] | None = None,
327
- inputs: list[str] | None = None,
328
+ inputs: Sequence[str | Path] | None = None,
328
329
  function: Callable[..., Any] | None = None,
329
330
  capture_mode: str = "runtime",
330
331
  provenance_overrides: dict[str, Any] | None = None,
@@ -363,7 +364,7 @@ def annotate_file(
363
364
  generation_context: dict[str, Any] | None = None,
364
365
  artifact_kind: ArtifactKind = "other",
365
366
  params: dict[str, Any] | None = None,
366
- inputs: list[str] | None = None,
367
+ inputs: Sequence[str | Path] | None = None,
367
368
  function: Callable[..., Any] | None = None,
368
369
  write_readme: bool = True,
369
370
  write_schema: bool | None = None,
@@ -418,7 +419,7 @@ def annotate_directory(
418
419
  acquisition_context: dict[str, Any] | None = None,
419
420
  generation_context: dict[str, Any] | None = None,
420
421
  params: dict[str, Any] | None = None,
421
- inputs: list[str] | None = None,
422
+ inputs: Sequence[str | Path] | None = None,
422
423
  function: Callable[..., Any] | None = None,
423
424
  write_readme: bool = True,
424
425
  write_schema: bool | None = None,
@@ -1,7 +1,7 @@
1
1
  from typing import Any
2
2
 
3
3
  _CLI_IMPORT_ERROR: ModuleNotFoundError | None = None
4
- _CLI_OPTIONAL_DEPENDENCIES = {"questionary", "typer"}
4
+ _CLI_OPTIONAL_DEPENDENCIES = {"questionary", "typer", "yaml"}
5
5
  app: Any = None
6
6
 
7
7
  try:
@@ -9,6 +9,7 @@ try:
9
9
 
10
10
  from data_annotations.cli_app.annotate import annotate_app
11
11
  from data_annotations.cli_app.provenance_commands import provenance_app
12
+ from data_annotations.cli_app.publish import publish_command
12
13
  except ModuleNotFoundError as exc:
13
14
  if exc.name not in _CLI_OPTIONAL_DEPENDENCIES:
14
15
  raise
@@ -17,6 +18,7 @@ else:
17
18
  app = typer.Typer(no_args_is_help=True)
18
19
  app.add_typer(annotate_app, name="annotate")
19
20
  app.add_typer(provenance_app, name="provenance")
21
+ app.command("publish")(publish_command)
20
22
 
21
23
 
22
24
  def main() -> None: