data-annotations 2.5.0__tar.gz → 2.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {data_annotations-2.5.0 → data_annotations-2.7.0}/PKG-INFO +148 -4
  2. {data_annotations-2.5.0 → data_annotations-2.7.0}/README.md +146 -2
  3. {data_annotations-2.5.0 → data_annotations-2.7.0}/pyproject.toml +3 -3
  4. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/_decorators.py +39 -4
  5. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/annotations/__init__.py +12 -0
  6. {data_annotations-2.5.0/src/data_annotations/cli_app → data_annotations-2.7.0/src/data_annotations/annotations}/answers.py +151 -11
  7. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/annotations/decorators.py +104 -8
  8. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/annotations/writers.py +326 -27
  9. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/annotate/__init__.py +32 -0
  10. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/annotate/helpers.py +94 -3
  11. data_annotations-2.7.0/src/data_annotations/cli_app/answers.py +35 -0
  12. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/common.py +57 -1
  13. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/provenance_commands.py +15 -1
  14. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/__init__.py +4 -0
  15. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/decorators.py +17 -1
  16. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/models.py +1 -0
  17. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/chain.py +60 -11
  18. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/matching.py +76 -15
  19. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/writers.py +200 -18
  20. {data_annotations-2.5.0 → data_annotations-2.7.0}/LICENSE +0 -0
  21. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/__init__.py +0 -0
  22. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/annotations/models.py +0 -0
  23. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli.py +0 -0
  24. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/__init__.py +0 -0
  25. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/prompts.py +0 -0
  26. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/publish.py +0 -0
  27. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/description/__init__.py +0 -0
  28. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/description/decorators.py +0 -0
  29. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/description/models.py +0 -0
  30. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/description/writers.py +0 -0
  31. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/git.py +0 -0
  32. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/__init__.py +0 -0
  33. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/manifest.py +0 -0
  34. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/sources.py +0 -0
  35. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/types.py +0 -0
  36. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/runtime.py +0 -0
  37. {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/publish.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-annotations
3
- Version: 2.5.0
3
+ Version: 2.7.0
4
4
  Summary: Annotate data artifacts with provenance and descriptions
5
5
  Keywords: annotations,data,metadata,provenance,reproducibility
6
6
  Author: Rodrigo C. G. Pena
@@ -18,7 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
18
18
  Classifier: Topic :: Scientific/Engineering
19
19
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
20
  Requires-Dist: pydantic>=2.13.1
21
- Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
21
+ Requires-Dist: pyyaml>=6.0.2
22
22
  Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
23
23
  Requires-Dist: typer>=0.16.0 ; extra == 'cli'
24
24
  Requires-Python: >=3.12
@@ -71,8 +71,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
71
71
  uv add data-annotations
72
72
  ```
73
73
 
74
- The command-line interface uses optional dependencies. Install the package with
75
- CLI support when you want to run `data-annotations` commands:
74
+ The command-line interface uses optional dependencies for prompting and command
75
+ parsing. Install the package with CLI support when you want to run
76
+ `data-annotations` commands:
76
77
 
77
78
  ```bash
78
79
  pip install "data-annotations[cli]"
@@ -109,6 +110,11 @@ Every annotation document includes provenance with:
109
110
  directory content digests, and upstream annotation sidecar references when
110
111
  present
111
112
 
113
+ Local file hashing defaults to checksum policy `auto`: existing files are hashed
114
+ only when they are no larger than `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
115
+ their `sha256` or directory `content_digest` is left unset unless you provide a
116
+ precomputed checksum yourself.
117
+
112
118
  You can also attach your own parameters, input file paths, and function names.
113
119
  Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
114
120
  such as `s3://...` or `https://...` are preserved as provided.
@@ -373,6 +379,39 @@ metadata to vary per call instead of staying fixed at decoration time, use
373
379
  `write_directory_annotation(...)` directly instead. See the example gallery in
374
380
  `examples/` for runnable examples of all approaches.
375
381
 
382
+ The Python API can also load the same YAML answers payloads used by the
383
+ CLI:
384
+
385
+ ```python
386
+ from data_annotations.annotations import (
387
+ annotate_directory,
388
+ annotate_file,
389
+ record_file_annotation,
390
+ )
391
+
392
+ annotate_file(answers="participants.yaml")
393
+ annotate_directory(answers="run-001.yaml")
394
+
395
+ annotate_file(
396
+ "outputs/summary.txt",
397
+ answers={"title": "Run Summary", "summary": "Validation run summary."},
398
+ )
399
+
400
+ @record_file_annotation(answers="participants.yaml")
401
+ def write_participants(artifact_path, input_path):
402
+ ...
403
+ ```
404
+
405
+ If an answers payload includes `target`, the positional artifact path or directory
406
+ may be omitted. When both are provided, they must resolve to the same path.
407
+ Explicit Python keyword arguments override values from `answers`. Environment
408
+ variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
409
+ values in both YAML files and mapping payloads.
410
+
411
+ For directory decorators, the wrapped function still provides the produced output
412
+ inventory. Matching `answers.artifacts` entries can supply titles, summaries,
413
+ kinds, fields, primary keys, and missing-value codes for those returned paths.
414
+
376
415
  ### When To Use Decorators Vs Direct Functions
377
416
 
378
417
  If a function is only a final serializer for already-prepared data, prefer the
@@ -502,6 +541,75 @@ README.
502
541
  If you want the direct writer approach instead, use `write_file_manifest(...)` and
503
542
  `write_directory_manifest(...)` (see `examples/`).
504
543
 
544
+ ## Checksum Policy
545
+
546
+ All provenance and annotation entry points that hash local files support the same
547
+ policy controls:
548
+
549
+ - `checksum_policy="auto"`: hash existing local files only when they are at or
550
+ below `max_checksum_bytes`. This is the default, and
551
+ `max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
552
+ - `checksum_policy="always"`: hash existing local files regardless of size.
553
+ - `checksum_policy="never"`: never hash local files automatically. Checksums are
554
+ recorded only when you supply them explicitly.
555
+
556
+ When a checksum is skipped, JSON sidecars keep the same schema and simply store
557
+ `sha256: null`. Directory `content_digest` is also left unset when any tracked
558
+ member file lacks a checksum.
559
+
560
+ You can change the policy from Python:
561
+
562
+ ```python
563
+ from data_annotations.annotations import annotate_file
564
+ from data_annotations.provenance import write_file_manifest
565
+
566
+ write_file_manifest(
567
+ "outputs/summary.txt",
568
+ checksum_policy="always",
569
+ )
570
+
571
+ annotate_file(
572
+ "outputs/summary.txt",
573
+ title="Run Summary",
574
+ summary="Post-hoc summary.",
575
+ artifact_sha256="precomputed-sha256",
576
+ checksum_policy="never",
577
+ )
578
+ ```
579
+
580
+ You can also inject precomputed checksums directly:
581
+
582
+ - File APIs: pass `artifact_sha256=...`.
583
+ - File or directory APIs: pass `checksum_overrides={path: sha256}`. For
584
+ directory outputs, keys can be relative to the output directory or absolute
585
+ paths.
586
+ - Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
587
+ `record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
588
+ same checksum-policy arguments.
589
+
590
+ From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
591
+ repeatable `--checksum PATH=SHA256`:
592
+
593
+ ```bash
594
+ data-annotations annotate file path/to/summary.txt \
595
+ --title "Run Summary" \
596
+ --summary "Post-hoc summary." \
597
+ --kind report \
598
+ --checksum-policy never \
599
+ --sha256 0123456789abcdef...
600
+
601
+ data-annotations annotate directory path/to/run-001 \
602
+ --title "Processing outputs" \
603
+ --summary "Directory-level outputs." \
604
+ --checksum-policy never \
605
+ --checksum processed.csv=0123456789abcdef...
606
+
607
+ data-annotations provenance chain path/to/run-001 \
608
+ --checksum-policy always
609
+ ```
610
+
611
+ For a complete runnable workflow, see `examples/checksum_policy.py`.
612
+
505
613
  ## Description Layer
506
614
 
507
615
  The `data_annotations.description` sub-package provides the structured description
@@ -628,6 +736,7 @@ target: path/to/participants.csv
628
736
  title: Participant Cohort
629
737
  summary: Participant-level cohort assignments.
630
738
  kind: dataset
739
+ sha256: 0123456789abcdef...
631
740
 
632
741
  inputs:
633
742
  - ${DATA_ROOT}/raw/participants.csv
@@ -670,6 +779,9 @@ provenance:
670
779
  command: bash process_from_instrument.sh
671
780
  script: process_from_instrument.sh
672
781
 
782
+ checksums:
783
+ processed.csv: 0123456789abcdef...
784
+
673
785
  artifacts:
674
786
  - path: processed.csv
675
787
  kind: dataset
@@ -694,6 +806,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
694
806
  `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
695
807
  and `provenance.params`.
696
808
 
809
+ Answers can request selected provenance fields from the current runtime instead
810
+ of taking them from the payload:
811
+
812
+ ```yaml
813
+ target: path/to/run-001
814
+ title: Processing outputs
815
+ summary: Files produced by the shell processing workflow.
816
+
817
+ provenance:
818
+ command: bash generate_some_data_artifact.sh
819
+ script: generate_some_data_artifact.sh
820
+ infer_from_runtime:
821
+ - runtime
822
+ - git
823
+ - source_code
824
+ ```
825
+
826
+ `runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
827
+ covers Git commit, branch, dirty state, remote, tags, and `git describe`.
828
+ `source_code` derives the source-code reference from runtime Git metadata instead of the payload.
829
+ This is especially useful for timestamps, host/user and SLURM context, Git state,
830
+ and derived `source_code`. Provide generation `command` and `script` explicitly
831
+ in CLI answers files, because the runtime command and script would describe the
832
+ `data-annotations annotate ...` invocation rather than the script that generated
833
+ the artifact.
834
+
697
835
  For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
698
836
  `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
699
837
  sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
@@ -765,6 +903,11 @@ resolving an older installed command. From a source checkout, use
765
903
  `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
766
904
  updated source before using the bare `data-annotations` command.
767
905
 
906
+ Both `match` and `chain` also accept `--checksum-policy` and
907
+ `--max-checksum-bytes`. Use `--checksum-policy always` when you want full
908
+ verification of large local files, and leave the default `auto` when you prefer
909
+ to avoid long checksum passes on very large artifacts.
910
+
768
911
  ### Run With `uvx`
769
912
 
770
913
  ```bash
@@ -886,6 +1029,7 @@ uv run python examples/record_file_description.py
886
1029
  uv run python examples/record_directory_description.py
887
1030
  uv run python examples/annotate_file.py
888
1031
  uv run python examples/annotate_directory.py
1032
+ uv run python examples/checksum_policy.py
889
1033
  uv run python examples/annotate_file_answers_cli.py
890
1034
  uv run python examples/write_file_manifest.py
891
1035
  uv run python examples/write_directory_manifest.py
@@ -41,8 +41,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
41
41
  uv add data-annotations
42
42
  ```
43
43
 
44
- The command-line interface uses optional dependencies. Install the package with
45
- CLI support when you want to run `data-annotations` commands:
44
+ The command-line interface uses optional dependencies for prompting and command
45
+ parsing. Install the package with CLI support when you want to run
46
+ `data-annotations` commands:
46
47
 
47
48
  ```bash
48
49
  pip install "data-annotations[cli]"
@@ -79,6 +80,11 @@ Every annotation document includes provenance with:
79
80
  directory content digests, and upstream annotation sidecar references when
80
81
  present
81
82
 
83
+ Local file hashing defaults to checksum policy `auto`: existing files are hashed
84
+ only when they are no larger than `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
85
+ their `sha256` or directory `content_digest` is left unset unless you provide a
86
+ precomputed checksum yourself.
87
+
82
88
  You can also attach your own parameters, input file paths, and function names.
83
89
  Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
84
90
  such as `s3://...` or `https://...` are preserved as provided.
@@ -343,6 +349,39 @@ metadata to vary per call instead of staying fixed at decoration time, use
343
349
  `write_directory_annotation(...)` directly instead. See the example gallery in
344
350
  `examples/` for runnable examples of all approaches.
345
351
 
352
+ The Python API can also load the same YAML answers payloads used by the
353
+ CLI:
354
+
355
+ ```python
356
+ from data_annotations.annotations import (
357
+ annotate_directory,
358
+ annotate_file,
359
+ record_file_annotation,
360
+ )
361
+
362
+ annotate_file(answers="participants.yaml")
363
+ annotate_directory(answers="run-001.yaml")
364
+
365
+ annotate_file(
366
+ "outputs/summary.txt",
367
+ answers={"title": "Run Summary", "summary": "Validation run summary."},
368
+ )
369
+
370
+ @record_file_annotation(answers="participants.yaml")
371
+ def write_participants(artifact_path, input_path):
372
+ ...
373
+ ```
374
+
375
+ If an answers payload includes `target`, the positional artifact path or directory
376
+ may be omitted. When both are provided, they must resolve to the same path.
377
+ Explicit Python keyword arguments override values from `answers`. Environment
378
+ variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
379
+ values in both YAML files and mapping payloads.
380
+
381
+ For directory decorators, the wrapped function still provides the produced output
382
+ inventory. Matching `answers.artifacts` entries can supply titles, summaries,
383
+ kinds, fields, primary keys, and missing-value codes for those returned paths.
384
+
346
385
  ### When To Use Decorators Vs Direct Functions
347
386
 
348
387
  If a function is only a final serializer for already-prepared data, prefer the
@@ -472,6 +511,75 @@ README.
472
511
  If you want the direct writer approach instead, use `write_file_manifest(...)` and
473
512
  `write_directory_manifest(...)` (see `examples/`).
474
513
 
514
+ ## Checksum Policy
515
+
516
+ All provenance and annotation entry points that hash local files support the same
517
+ policy controls:
518
+
519
+ - `checksum_policy="auto"`: hash existing local files only when they are at or
520
+ below `max_checksum_bytes`. This is the default, and
521
+ `max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
522
+ - `checksum_policy="always"`: hash existing local files regardless of size.
523
+ - `checksum_policy="never"`: never hash local files automatically. Checksums are
524
+ recorded only when you supply them explicitly.
525
+
526
+ When a checksum is skipped, JSON sidecars keep the same schema and simply store
527
+ `sha256: null`. Directory `content_digest` is also left unset when any tracked
528
+ member file lacks a checksum.
529
+
530
+ You can change the policy from Python:
531
+
532
+ ```python
533
+ from data_annotations.annotations import annotate_file
534
+ from data_annotations.provenance import write_file_manifest
535
+
536
+ write_file_manifest(
537
+ "outputs/summary.txt",
538
+ checksum_policy="always",
539
+ )
540
+
541
+ annotate_file(
542
+ "outputs/summary.txt",
543
+ title="Run Summary",
544
+ summary="Post-hoc summary.",
545
+ artifact_sha256="precomputed-sha256",
546
+ checksum_policy="never",
547
+ )
548
+ ```
549
+
550
+ You can also inject precomputed checksums directly:
551
+
552
+ - File APIs: pass `artifact_sha256=...`.
553
+ - File or directory APIs: pass `checksum_overrides={path: sha256}`. For
554
+ directory outputs, keys can be relative to the output directory or absolute
555
+ paths.
556
+ - Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
557
+ `record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
558
+ same checksum-policy arguments.
559
+
560
+ From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
561
+ repeatable `--checksum PATH=SHA256`:
562
+
563
+ ```bash
564
+ data-annotations annotate file path/to/summary.txt \
565
+ --title "Run Summary" \
566
+ --summary "Post-hoc summary." \
567
+ --kind report \
568
+ --checksum-policy never \
569
+ --sha256 0123456789abcdef...
570
+
571
+ data-annotations annotate directory path/to/run-001 \
572
+ --title "Processing outputs" \
573
+ --summary "Directory-level outputs." \
574
+ --checksum-policy never \
575
+ --checksum processed.csv=0123456789abcdef...
576
+
577
+ data-annotations provenance chain path/to/run-001 \
578
+ --checksum-policy always
579
+ ```
580
+
581
+ For a complete runnable workflow, see `examples/checksum_policy.py`.
582
+
475
583
  ## Description Layer
476
584
 
477
585
  The `data_annotations.description` sub-package provides the structured description
@@ -598,6 +706,7 @@ target: path/to/participants.csv
598
706
  title: Participant Cohort
599
707
  summary: Participant-level cohort assignments.
600
708
  kind: dataset
709
+ sha256: 0123456789abcdef...
601
710
 
602
711
  inputs:
603
712
  - ${DATA_ROOT}/raw/participants.csv
@@ -640,6 +749,9 @@ provenance:
640
749
  command: bash process_from_instrument.sh
641
750
  script: process_from_instrument.sh
642
751
 
752
+ checksums:
753
+ processed.csv: 0123456789abcdef...
754
+
643
755
  artifacts:
644
756
  - path: processed.csv
645
757
  kind: dataset
@@ -664,6 +776,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
664
776
  `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
665
777
  and `provenance.params`.
666
778
 
779
+ Answers can request selected provenance fields from the current runtime instead
780
+ of taking them from the payload:
781
+
782
+ ```yaml
783
+ target: path/to/run-001
784
+ title: Processing outputs
785
+ summary: Files produced by the shell processing workflow.
786
+
787
+ provenance:
788
+ command: bash generate_some_data_artifact.sh
789
+ script: generate_some_data_artifact.sh
790
+ infer_from_runtime:
791
+ - runtime
792
+ - git
793
+ - source_code
794
+ ```
795
+
796
+ `runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
797
+ covers Git commit, branch, dirty state, remote, tags, and `git describe`.
798
+ `source_code` derives the source-code reference from runtime Git metadata instead of the payload.
799
+ This is especially useful for timestamps, host/user and SLURM context, Git state,
800
+ and derived `source_code`. Provide generation `command` and `script` explicitly
801
+ in CLI answers files, because the runtime command and script would describe the
802
+ `data-annotations annotate ...` invocation rather than the script that generated
803
+ the artifact.
804
+
667
805
  For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
668
806
  `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
669
807
  sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
@@ -735,6 +873,11 @@ resolving an older installed command. From a source checkout, use
735
873
  `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
736
874
  updated source before using the bare `data-annotations` command.
737
875
 
876
+ Both `match` and `chain` also accept `--checksum-policy` and
877
+ `--max-checksum-bytes`. Use `--checksum-policy always` when you want full
878
+ verification of large local files, and leave the default `auto` when you prefer
879
+ to avoid long checksum passes on very large artifacts.
880
+
738
881
  ### Run With `uvx`
739
882
 
740
883
  ```bash
@@ -856,6 +999,7 @@ uv run python examples/record_file_description.py
856
999
  uv run python examples/record_directory_description.py
857
1000
  uv run python examples/annotate_file.py
858
1001
  uv run python examples/annotate_directory.py
1002
+ uv run python examples/checksum_policy.py
859
1003
  uv run python examples/annotate_file_answers_cli.py
860
1004
  uv run python examples/write_file_manifest.py
861
1005
  uv run python examples/write_directory_manifest.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data-annotations"
3
- version = "2.5.0"
3
+ version = "2.7.0"
4
4
  description = "Annotate data artifacts with provenance and descriptions"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -9,7 +9,7 @@ authors = [
9
9
  license = "BSD-3-Clause"
10
10
  license-files = ["LICENSE"]
11
11
  requires-python = ">=3.12"
12
- dependencies = ["pydantic>=2.13.1"]
12
+ dependencies = ["pydantic>=2.13.1", "PyYAML>=6.0.2"]
13
13
  keywords = ["annotations", "data", "metadata", "provenance", "reproducibility"]
14
14
  classifiers = [
15
15
  "Development Status :: 4 - Beta",
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
30
30
  Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
31
31
 
32
32
  [project.optional-dependencies]
33
- cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
33
+ cli = ["questionary>=2.1.1", "typer>=0.16.0"]
34
34
 
35
35
  [project.scripts]
36
36
  data-annotations = "data_annotations.cli:main"
@@ -9,6 +9,7 @@ if TYPE_CHECKING:
9
9
  DocumentedArtifactGroup,
10
10
  )
11
11
  from data_annotations.provenance.models import ChildBundle, ProducedFile
12
+ from data_annotations.provenance.models import ChecksumPolicy
12
13
 
13
14
  DEFAULT_INPUT_ARGS = ("input_path", "input_paths")
14
15
 
@@ -78,6 +79,8 @@ def coerce_produced_file(
78
79
  item: Any,
79
80
  *,
80
81
  normalize_paths: bool = True,
82
+ checksum_policy: "ChecksumPolicy" = "auto",
83
+ max_checksum_bytes: int | None = None,
81
84
  ) -> "ProducedFile":
82
85
  from data_annotations.description.models import DocumentedArtifact
83
86
  from data_annotations.provenance import writers as provenance_writers
@@ -89,7 +92,15 @@ def coerce_produced_file(
89
92
  path=str(path),
90
93
  kind=item.kind,
91
94
  sha256=(
92
- provenance_writers.sha256_file(path)
95
+ provenance_writers._resolve_file_sha256(
96
+ path,
97
+ checksum_policy=checksum_policy,
98
+ max_checksum_bytes=(
99
+ max_checksum_bytes
100
+ if max_checksum_bytes is not None
101
+ else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
102
+ ),
103
+ )
93
104
  if normalize_paths and path.exists()
94
105
  else None
95
106
  ),
@@ -106,7 +117,15 @@ def coerce_produced_file(
106
117
  path=str(normalized),
107
118
  kind=kind,
108
119
  sha256=(
109
- provenance_writers.sha256_file(normalized)
120
+ provenance_writers._resolve_file_sha256(
121
+ normalized,
122
+ checksum_policy=checksum_policy,
123
+ max_checksum_bytes=(
124
+ max_checksum_bytes
125
+ if max_checksum_bytes is not None
126
+ else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
127
+ ),
128
+ )
110
129
  if normalize_paths and normalized.exists()
111
130
  else None
112
131
  ),
@@ -117,7 +136,15 @@ def coerce_produced_file(
117
136
  path=str(path),
118
137
  kind="other",
119
138
  sha256=(
120
- provenance_writers.sha256_file(path)
139
+ provenance_writers._resolve_file_sha256(
140
+ path,
141
+ checksum_policy=checksum_policy,
142
+ max_checksum_bytes=(
143
+ max_checksum_bytes
144
+ if max_checksum_bytes is not None
145
+ else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
146
+ ),
147
+ )
121
148
  if normalize_paths and path.exists()
122
149
  else None
123
150
  ),
@@ -128,9 +155,17 @@ def coerce_produced_files(
128
155
  items: Iterable[Any],
129
156
  *,
130
157
  normalize_paths: bool = True,
158
+ checksum_policy: "ChecksumPolicy" = "auto",
159
+ max_checksum_bytes: int | None = None,
131
160
  ) -> list["ProducedFile"]:
132
161
  return [
133
- coerce_produced_file(item, normalize_paths=normalize_paths) for item in items
162
+ coerce_produced_file(
163
+ item,
164
+ normalize_paths=normalize_paths,
165
+ checksum_policy=checksum_policy,
166
+ max_checksum_bytes=max_checksum_bytes,
167
+ )
168
+ for item in items
134
169
  ]
135
170
 
136
171
 
@@ -1,3 +1,10 @@
1
+ from .answers import (
2
+ AnswersError,
3
+ DirectoryAnswers,
4
+ FileAnswers,
5
+ load_directory_answers,
6
+ load_file_answers,
7
+ )
1
8
  from .models import (
2
9
  DirectoryAnnotationDocument,
3
10
  DirectoryAnnotationResult,
@@ -17,13 +24,18 @@ from .writers import (
17
24
  __all__ = [
18
25
  "annotate_directory",
19
26
  "annotate_file",
27
+ "load_directory_answers",
28
+ "load_file_answers",
20
29
  "record_directory_annotation",
21
30
  "record_file_annotation",
22
31
  "write_directory_annotation",
23
32
  "write_file_annotation",
33
+ "AnswersError",
34
+ "DirectoryAnswers",
24
35
  "DirectoryAnnotationDocument",
25
36
  "DirectoryAnnotationResult",
26
37
  "DirectoryArtifactSubject",
38
+ "FileAnswers",
27
39
  "FileAnnotationDocument",
28
40
  "FileAnnotationResult",
29
41
  "FileArtifactSubject",