data-annotations 2.3.0__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {data_annotations-2.3.0 → data_annotations-2.4.0}/PKG-INFO +122 -1
  2. {data_annotations-2.3.0 → data_annotations-2.4.0}/README.md +120 -0
  3. {data_annotations-2.3.0 → data_annotations-2.4.0}/pyproject.toml +2 -2
  4. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/cli.py +3 -1
  5. data_annotations-2.4.0/src/data_annotations/cli_app/annotate.py +939 -0
  6. data_annotations-2.4.0/src/data_annotations/cli_app/answers.py +403 -0
  7. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/cli_app/prompts.py +21 -6
  8. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/cli_app/provenance_commands.py +26 -1
  9. data_annotations-2.4.0/src/data_annotations/cli_app/publish.py +92 -0
  10. data_annotations-2.4.0/src/data_annotations/publish.py +532 -0
  11. data_annotations-2.3.0/src/data_annotations/cli_app/annotate.py +0 -483
  12. {data_annotations-2.3.0 → data_annotations-2.4.0}/LICENSE +0 -0
  13. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/__init__.py +0 -0
  14. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/_decorators.py +0 -0
  15. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/annotations/__init__.py +0 -0
  16. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/annotations/decorators.py +0 -0
  17. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/annotations/models.py +0 -0
  18. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/annotations/writers.py +0 -0
  19. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/cli_app/__init__.py +0 -0
  20. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/cli_app/common.py +0 -0
  21. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/description/__init__.py +0 -0
  22. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/description/decorators.py +0 -0
  23. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/description/models.py +0 -0
  24. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/description/writers.py +0 -0
  25. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/provenance/__init__.py +0 -0
  26. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/provenance/decorators.py +0 -0
  27. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/provenance/git.py +0 -0
  28. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/provenance/models.py +0 -0
  29. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/provenance/recovery.py +0 -0
  30. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/provenance/runtime.py +0 -0
  31. {data_annotations-2.3.0 → data_annotations-2.4.0}/src/data_annotations/provenance/writers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-annotations
3
- Version: 2.3.0
3
+ Version: 2.4.0
4
4
  Summary: Annotate generated data artifacts
5
5
  Keywords: annotations,data,metadata,provenance,reproducibility
6
6
  Author: Rodrigo C. G. Pena
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
18
18
  Classifier: Topic :: Scientific/Engineering
19
19
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
20
  Requires-Dist: pydantic>=2.13.1
21
+ Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
21
22
  Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
22
23
  Requires-Dist: typer>=0.16.0 ; extra == 'cli'
23
24
  Requires-Python: >=3.12
@@ -599,6 +600,91 @@ These commands prompt for missing details, write `*.annotation.json` or `data-an
599
600
  and optionally derive README sidecars. Post-hoc records are marked with
600
601
  `capture_mode="post_hoc"`.
601
602
 
603
+ For shell workflows, you can move the prompt answers into a YAML file and run
604
+ the command non-interactively:
605
+
606
+ ```bash
607
+ data-annotations annotate file path/to/participants.csv --answers participants.yaml
608
+ data-annotations annotate directory path/to/run-001 --answers run-001.yaml
609
+ data-annotations annotate answers check participants.yaml
610
+ ```
611
+
612
+ When `--answers` is provided, `--no-interactive` is the default. Use
613
+ `--interactive` if you want the YAML file to provide defaults and still prompt
614
+ for missing required values. If the YAML file includes `target`, the positional
615
+ target may be omitted; when both are provided, they must resolve to the same
616
+ path. Environment variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded
617
+ inside string values, and validation fails if a referenced variable is not set.
618
+ The `answers check` helper requires `target` so it can infer whether the answers
619
+ describe a file or a directory.
620
+
621
+ File answers can use top-level prompt-style keys:
622
+
623
+ ```yaml
624
+ target: path/to/participants.csv
625
+ title: Participant Cohort
626
+ summary: Participant-level cohort assignments.
627
+ kind: dataset
628
+
629
+ inputs:
630
+ - ${DATA_ROOT}/raw/participants.csv
631
+
632
+ params:
633
+   split: validation
634
+
635
+ provenance:
636
+   command: bash scripts/build_participants.sh
636
+   script: scripts/build_participants.sh
637
+   git_sha: deadbeef
639
+
640
+ fields:
641
+ - name: participant_id
642
+   summary: Stable participant identifier.
642
+   data_type: string
643
+   required: true
644
+   nullable: false
646
+
647
+ primary_key:
648
+ - participant_id
649
+ ```
650
+
651
+ Directory answers use an explicit inventory. Paths in `artifacts`,
652
+ `artifact_groups.paths`, and `child_bundles` are relative to the annotated
653
+ directory unless absolute:
654
+
655
+ ```yaml
656
+ target: path/to/run-001
657
+ title: Processing outputs
658
+ summary: Files produced by the shell processing workflow.
659
+
660
+ provenance:
661
+   command: bash process_from_instrument.sh
661
+   script: process_from_instrument.sh
663
+
664
+ artifacts:
665
+ - path: processed.csv
666
+   kind: dataset
666
+   title: Processed instrument output
667
+   summary: Normalized output from the processing script.
669
+
670
+ artifact_groups:
671
+ - title: Diagnostic plots
672
+   kind: plot
672
+   selector: plots/*.png
673
+   paths:
674
+     - plots/qc-1.png
675
+     - plots/qc-2.png
677
+
678
+ child_bundles:
679
+ - path: model
680
+   annotation_path: model/data-annotations.json
681
+ ```
682
+
683
+ Answers files may also use schema-style aliases such as `subject.path`,
684
+ `subject.kind`, `description.title`, `description.summary`,
685
+ `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
686
+ and `provenance.params`.
687
+
602
688
  When group selectors are provided, the CLI expands them to concrete member paths
603
689
  at annotation time. Grouped files are tracked in `subject.produced_files[]` but
604
690
  are skipped by the per-file prompt flow, so you do not have to answer the same
@@ -617,6 +703,10 @@ data-annotations provenance chain path/to/artifact --full-paths
617
703
  data-annotations provenance checkout path/to/artifact
618
704
  ```
619
705
 
706
+ Command `checkout` downloads the repository from the recorded Git remote and checks out the recorded
707
+ commit. It prompts before downloading source code and defaults to No; use
708
+ `--force` to skip the prompt when running a trusted provenance checkout non-interactively.
709
+
620
710
  Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
621
711
  directories, prints a verification summary, and suggests the exact `checkout`
622
712
  command to run next when Git recovery metadata is available.
@@ -626,6 +716,27 @@ those inputs. Its default output shows a compact relative-path tree and lists
626
716
  stale, missing, or unverifiable nodes first; use `--full-paths` when you need
627
717
  absolute paths.
628
718
 
719
+ For publication workflows, create a sanitized copy of an annotated artifact tree:
720
+
721
+ ```bash
722
+ data-annotations publish path/to/run-001 path/to/publish-bundle
723
+ data-annotations publish path/to/run-001 path/to/publish-bundle \
724
+ --prefix /private/raw/study-a='$INPUT_ROOT'
725
+ data-annotations publish path/to/run-001 path/to/publish-metadata \
726
+ --annotations-only
727
+ data-annotations publish path/to/run-001 path/to/publish-bundle --dry-run
728
+ ```
729
+
730
+ Command `publish` recursively discovers file annotations (`*.annotation.json`) and
731
+ directory annotations (`data-annotations.json`), writes a mirrored publish bundle,
732
+ and regenerates README sidecars from sanitized annotation JSON. Paths under the
733
+ source directory are rewritten to `$ARTIFACT_ROOT/...`; additional `--prefix`
734
+ mappings rewrite other private path roots. Hostname, username, and SLURM job ID
735
+ are redacted by default. Git remote URLs are preserved unless
736
+ `--redact-git-remote` is provided. Strict mode is enabled by default and fails if
737
+ any local absolute path remains after sanitization; use `--no-strict` only after
738
+ reviewing `--dry-run` output.
739
+
629
740
  If `data-annotations provenance --help` does not list `chain`, your shell is
630
741
  resolving an older installed command. From a source checkout, use
631
742
  `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
@@ -656,6 +767,7 @@ uv run data-annotations annotate directory path/to/run-001
656
767
  uv run data-annotations provenance match path/to/participants.csv
657
768
  uv run data-annotations provenance chain path/to/participants.csv
658
769
  uv run data-annotations provenance checkout path/to/participants.csv
770
+ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
659
771
  ```
660
772
 
661
773
  ## API Overview
@@ -727,6 +839,13 @@ uv run data-annotations provenance checkout path/to/participants.csv
727
839
  - `artifact_matches_manifest(...)`
728
840
  - `checkout_manifest_source(...)`
729
841
 
842
+ ### Publish Functions
843
+
844
+ - `discover_annotation_paths(...)`
845
+ - `sanitize_annotation_document(...)`
846
+ - `sanitize_annotation_path(...)`
847
+ - `publish_directory(...)`
848
+
730
849
  ## Examples
731
850
 
732
851
  Runnable examples live in `examples/` and mirror the README workflows.
@@ -741,6 +860,7 @@ uv run python examples/record_file_description.py
741
860
  uv run python examples/record_directory_description.py
742
861
  uv run python examples/annotate_file.py
743
862
  uv run python examples/annotate_directory.py
863
+ uv run python examples/annotate_file_answers_cli.py
744
864
  uv run python examples/write_file_manifest.py
745
865
  uv run python examples/write_directory_manifest.py
746
866
  uv run python examples/write_file_description.py
@@ -749,6 +869,7 @@ uv run python examples/provenance_chain.py
749
869
  uv run python examples/provenance_chain_cli.py
750
870
  uv run python examples/recover_provenance.py
751
871
  uv run python examples/recover_provenance_cli.py
872
+ uv run python examples/publish_cli.py
752
873
  ```
753
874
 
754
875
  Each example writes its outputs to a fresh temporary directory and prints the
@@ -570,6 +570,91 @@ These commands prompt for missing details, write `*.annotation.json` or `data-an
570
570
  and optionally derive README sidecars. Post-hoc records are marked with
571
571
  `capture_mode="post_hoc"`.
572
572
 
573
+ For shell workflows, you can move the prompt answers into a YAML file and run
574
+ the command non-interactively:
575
+
576
+ ```bash
577
+ data-annotations annotate file path/to/participants.csv --answers participants.yaml
578
+ data-annotations annotate directory path/to/run-001 --answers run-001.yaml
579
+ data-annotations annotate answers check participants.yaml
580
+ ```
581
+
582
+ When `--answers` is provided, `--no-interactive` is the default. Use
583
+ `--interactive` if you want the YAML file to provide defaults and still prompt
584
+ for missing required values. If the YAML file includes `target`, the positional
585
+ target may be omitted; when both are provided, they must resolve to the same
586
+ path. Environment variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded
587
+ inside string values, and validation fails if a referenced variable is not set.
588
+ The `answers check` helper requires `target` so it can infer whether the answers
589
+ describe a file or a directory.
590
+
591
+ File answers can use top-level prompt-style keys:
592
+
593
+ ```yaml
594
+ target: path/to/participants.csv
595
+ title: Participant Cohort
596
+ summary: Participant-level cohort assignments.
597
+ kind: dataset
598
+
599
+ inputs:
600
+ - ${DATA_ROOT}/raw/participants.csv
601
+
602
+ params:
603
+   split: validation
604
+
605
+ provenance:
606
+   command: bash scripts/build_participants.sh
606
+   script: scripts/build_participants.sh
607
+   git_sha: deadbeef
609
+
610
+ fields:
611
+ - name: participant_id
612
+   summary: Stable participant identifier.
612
+   data_type: string
613
+   required: true
614
+   nullable: false
616
+
617
+ primary_key:
618
+ - participant_id
619
+ ```
620
+
621
+ Directory answers use an explicit inventory. Paths in `artifacts`,
622
+ `artifact_groups.paths`, and `child_bundles` are relative to the annotated
623
+ directory unless absolute:
624
+
625
+ ```yaml
626
+ target: path/to/run-001
627
+ title: Processing outputs
628
+ summary: Files produced by the shell processing workflow.
629
+
630
+ provenance:
631
+   command: bash process_from_instrument.sh
631
+   script: process_from_instrument.sh
633
+
634
+ artifacts:
635
+ - path: processed.csv
636
+   kind: dataset
636
+   title: Processed instrument output
637
+   summary: Normalized output from the processing script.
639
+
640
+ artifact_groups:
641
+ - title: Diagnostic plots
642
+   kind: plot
642
+   selector: plots/*.png
643
+   paths:
644
+     - plots/qc-1.png
645
+     - plots/qc-2.png
647
+
648
+ child_bundles:
649
+ - path: model
650
+   annotation_path: model/data-annotations.json
651
+ ```
652
+
653
+ Answers files may also use schema-style aliases such as `subject.path`,
654
+ `subject.kind`, `description.title`, `description.summary`,
655
+ `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
656
+ and `provenance.params`.
657
+
573
658
  When group selectors are provided, the CLI expands them to concrete member paths
574
659
  at annotation time. Grouped files are tracked in `subject.produced_files[]` but
575
660
  are skipped by the per-file prompt flow, so you do not have to answer the same
@@ -588,6 +673,10 @@ data-annotations provenance chain path/to/artifact --full-paths
588
673
  data-annotations provenance checkout path/to/artifact
589
674
  ```
590
675
 
676
+ Command `checkout` downloads the repository from the recorded Git remote and checks out the recorded
677
+ commit. It prompts before downloading source code and defaults to No; use
678
+ `--force` to skip the prompt when running a trusted provenance checkout non-interactively.
679
+
591
680
  Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
592
681
  directories, prints a verification summary, and suggests the exact `checkout`
593
682
  command to run next when Git recovery metadata is available.
@@ -597,6 +686,27 @@ those inputs. Its default output shows a compact relative-path tree and lists
597
686
  stale, missing, or unverifiable nodes first; use `--full-paths` when you need
598
687
  absolute paths.
599
688
 
689
+ For publication workflows, create a sanitized copy of an annotated artifact tree:
690
+
691
+ ```bash
692
+ data-annotations publish path/to/run-001 path/to/publish-bundle
693
+ data-annotations publish path/to/run-001 path/to/publish-bundle \
694
+ --prefix /private/raw/study-a='$INPUT_ROOT'
695
+ data-annotations publish path/to/run-001 path/to/publish-metadata \
696
+ --annotations-only
697
+ data-annotations publish path/to/run-001 path/to/publish-bundle --dry-run
698
+ ```
699
+
700
+ Command `publish` recursively discovers file annotations (`*.annotation.json`) and
701
+ directory annotations (`data-annotations.json`), writes a mirrored publish bundle,
702
+ and regenerates README sidecars from sanitized annotation JSON. Paths under the
703
+ source directory are rewritten to `$ARTIFACT_ROOT/...`; additional `--prefix`
704
+ mappings rewrite other private path roots. Hostname, username, and SLURM job ID
705
+ are redacted by default. Git remote URLs are preserved unless
706
+ `--redact-git-remote` is provided. Strict mode is enabled by default and fails if
707
+ any local absolute path remains after sanitization; use `--no-strict` only after
708
+ reviewing `--dry-run` output.
709
+
600
710
  If `data-annotations provenance --help` does not list `chain`, your shell is
601
711
  resolving an older installed command. From a source checkout, use
602
712
  `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
@@ -627,6 +737,7 @@ uv run data-annotations annotate directory path/to/run-001
627
737
  uv run data-annotations provenance match path/to/participants.csv
628
738
  uv run data-annotations provenance chain path/to/participants.csv
629
739
  uv run data-annotations provenance checkout path/to/participants.csv
740
+ uv run data-annotations publish path/to/run-001 path/to/publish-bundle
630
741
  ```
631
742
 
632
743
  ## API Overview
@@ -698,6 +809,13 @@ uv run data-annotations provenance checkout path/to/participants.csv
698
809
  - `artifact_matches_manifest(...)`
699
810
  - `checkout_manifest_source(...)`
700
811
 
812
+ ### Publish Functions
813
+
814
+ - `discover_annotation_paths(...)`
815
+ - `sanitize_annotation_document(...)`
816
+ - `sanitize_annotation_path(...)`
817
+ - `publish_directory(...)`
818
+
701
819
  ## Examples
702
820
 
703
821
  Runnable examples live in `examples/` and mirror the README workflows.
@@ -712,6 +830,7 @@ uv run python examples/record_file_description.py
712
830
  uv run python examples/record_directory_description.py
713
831
  uv run python examples/annotate_file.py
714
832
  uv run python examples/annotate_directory.py
833
+ uv run python examples/annotate_file_answers_cli.py
715
834
  uv run python examples/write_file_manifest.py
716
835
  uv run python examples/write_directory_manifest.py
717
836
  uv run python examples/write_file_description.py
@@ -720,6 +839,7 @@ uv run python examples/provenance_chain.py
720
839
  uv run python examples/provenance_chain_cli.py
721
840
  uv run python examples/recover_provenance.py
722
841
  uv run python examples/recover_provenance_cli.py
842
+ uv run python examples/publish_cli.py
723
843
  ```
724
844
 
725
845
  Each example writes its outputs to a fresh temporary directory and prints the
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data-annotations"
3
- version = "2.3.0"
3
+ version = "2.4.0"
4
4
  description = "Annotate generated data artifacts"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
30
30
  Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
31
31
 
32
32
  [project.optional-dependencies]
33
- cli = ["questionary>=2.1.1", "typer>=0.16.0"]
33
+ cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
34
34
 
35
35
  [project.scripts]
36
36
  data-annotations = "data_annotations.cli:main"
@@ -1,7 +1,7 @@
1
1
  from typing import Any
2
2
 
3
3
  _CLI_IMPORT_ERROR: ModuleNotFoundError | None = None
4
- _CLI_OPTIONAL_DEPENDENCIES = {"questionary", "typer"}
4
+ _CLI_OPTIONAL_DEPENDENCIES = {"questionary", "typer", "yaml"}
5
5
  app: Any = None
6
6
 
7
7
  try:
@@ -9,6 +9,7 @@ try:
9
9
 
10
10
  from data_annotations.cli_app.annotate import annotate_app
11
11
  from data_annotations.cli_app.provenance_commands import provenance_app
12
+ from data_annotations.cli_app.publish import publish_command
12
13
  except ModuleNotFoundError as exc:
13
14
  if exc.name not in _CLI_OPTIONAL_DEPENDENCIES:
14
15
  raise
@@ -17,6 +18,7 @@ else:
17
18
  app = typer.Typer(no_args_is_help=True)
18
19
  app.add_typer(annotate_app, name="annotate")
19
20
  app.add_typer(provenance_app, name="provenance")
21
+ app.command("publish")(publish_command)
20
22
 
21
23
 
22
24
  def main() -> None: