data-annotations 2.5.0__tar.gz → 2.6.0__tar.gz

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {data_annotations-2.5.0 → data_annotations-2.6.0}/PKG-INFO +85 -1
  2. {data_annotations-2.5.0 → data_annotations-2.6.0}/README.md +84 -0
  3. {data_annotations-2.5.0 → data_annotations-2.6.0}/pyproject.toml +1 -1
  4. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/_decorators.py +39 -4
  5. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/annotations/decorators.py +18 -1
  6. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/annotations/writers.py +44 -1
  7. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/annotate/__init__.py +32 -0
  8. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/annotate/helpers.py +17 -1
  9. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/answers.py +14 -1
  10. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/common.py +57 -1
  11. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/provenance_commands.py +15 -1
  12. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/__init__.py +4 -0
  13. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/decorators.py +17 -1
  14. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/models.py +1 -0
  15. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/chain.py +60 -11
  16. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/matching.py +76 -15
  17. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/writers.py +200 -18
  18. {data_annotations-2.5.0 → data_annotations-2.6.0}/LICENSE +0 -0
  19. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/__init__.py +0 -0
  20. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/annotations/__init__.py +0 -0
  21. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/annotations/models.py +0 -0
  22. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli.py +0 -0
  23. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/__init__.py +0 -0
  24. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/prompts.py +0 -0
  25. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/publish.py +0 -0
  26. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/description/__init__.py +0 -0
  27. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/description/decorators.py +0 -0
  28. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/description/models.py +0 -0
  29. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/description/writers.py +0 -0
  30. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/git.py +0 -0
  31. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/__init__.py +0 -0
  32. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/manifest.py +0 -0
  33. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/sources.py +0 -0
  34. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/types.py +0 -0
  35. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/runtime.py +0 -0
  36. {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/publish.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-annotations
3
- Version: 2.5.0
3
+ Version: 2.6.0
4
4
  Summary: Annotate data artifacts with provenance and descriptions
5
5
  Keywords: annotations,data,metadata,provenance,reproducibility
6
6
  Author: Rodrigo C. G. Pena
@@ -109,6 +109,11 @@ Every annotation document includes provenance with:
109
109
  directory content digests, and upstream annotation sidecar references when
110
110
  present
111
111
 
112
+ Local file hashing defaults to checksum policy `auto`: existing files are hashed
113
+ only when they are at most `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
114
+ their `sha256` or directory `content_digest` is left unset unless you provide a
115
+ precomputed checksum yourself.
116
+
112
117
  You can also attach your own parameters, input file paths, and function names.
113
118
  Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
114
119
  such as `s3://...` or `https://...` are preserved as provided.
@@ -502,6 +507,75 @@ README.
502
507
  If you want the direct writer approach instead, use `write_file_manifest(...)` and
503
508
  `write_directory_manifest(...)` (see `examples/`).
504
509
 
510
+ ## Checksum Policy
511
+
512
+ All provenance and annotation entry points that hash local files support the same
513
+ policy controls:
514
+
515
+ - `checksum_policy="auto"`: hash existing local files only when they are at or
516
+ below `max_checksum_bytes`. This is the default, and
517
+ `max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
518
+ - `checksum_policy="always"`: hash existing local files regardless of size.
519
+ - `checksum_policy="never"`: never hash local files automatically. Checksums are
520
+ recorded only when you supply them explicitly.
521
+
522
+ When a checksum is skipped, JSON sidecars keep the same schema and simply store
523
+ `sha256: null`. Directory `content_digest` is also left unset when any tracked
524
+ member file lacks a checksum.
525
+
526
+ You can change the policy from Python:
527
+
528
+ ```python
529
+ from data_annotations.annotations import annotate_file
530
+ from data_annotations.provenance import write_file_manifest
531
+
532
+ write_file_manifest(
533
+ "outputs/summary.txt",
534
+ checksum_policy="always",
535
+ )
536
+
537
+ annotate_file(
538
+ "outputs/summary.txt",
539
+ title="Run Summary",
540
+ summary="Post-hoc summary.",
541
+ artifact_sha256="precomputed-sha256",
542
+ checksum_policy="never",
543
+ )
544
+ ```
545
+
546
+ You can also inject precomputed checksums directly:
547
+
548
+ - File APIs: pass `artifact_sha256=...`.
549
+ - File or directory APIs: pass `checksum_overrides={path: sha256}`. For
550
+ directory outputs, keys can be relative to the output directory or absolute
551
+ paths.
552
+ - Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
553
+ `record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
554
+ same checksum-policy arguments.
555
+
556
+ From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
557
+ repeatable `--checksum PATH=SHA256`:
558
+
559
+ ```bash
560
+ data-annotations annotate file path/to/summary.txt \
561
+ --title "Run Summary" \
562
+ --summary "Post-hoc summary." \
563
+ --kind report \
564
+ --checksum-policy never \
565
+ --sha256 0123456789abcdef...
566
+
567
+ data-annotations annotate directory path/to/run-001 \
568
+ --title "Processing outputs" \
569
+ --summary "Directory-level outputs." \
570
+ --checksum-policy never \
571
+ --checksum processed.csv=0123456789abcdef...
572
+
573
+ data-annotations provenance chain path/to/run-001 \
574
+ --checksum-policy always
575
+ ```
576
+
577
+ For a complete runnable workflow, see `examples/checksum_policy.py`.
578
+
505
579
  ## Description Layer
506
580
 
507
581
  The `data_annotations.description` sub-package provides the structured description
@@ -628,6 +702,7 @@ target: path/to/participants.csv
628
702
  title: Participant Cohort
629
703
  summary: Participant-level cohort assignments.
630
704
  kind: dataset
705
+ sha256: 0123456789abcdef...
631
706
 
632
707
  inputs:
633
708
  - ${DATA_ROOT}/raw/participants.csv
@@ -670,6 +745,9 @@ provenance:
670
745
  command: bash process_from_instrument.sh
671
746
  script: process_from_instrument.sh
672
747
 
748
+ checksums:
749
+ processed.csv: 0123456789abcdef...
750
+
673
751
  artifacts:
674
752
  - path: processed.csv
675
753
  kind: dataset
@@ -765,6 +843,11 @@ resolving an older installed command. From a source checkout, use
765
843
  `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
766
844
  updated source before using the bare `data-annotations` command.
767
845
 
846
+ Both `match` and `chain` also accept `--checksum-policy` and
847
+ `--max-checksum-bytes`. Use `--checksum-policy always` when you want full
848
+ verification of large local files, and leave the default `auto` when you prefer
849
+ to avoid long checksum passes on very large artifacts.
850
+
768
851
  ### Run With `uvx`
769
852
 
770
853
  ```bash
@@ -886,6 +969,7 @@ uv run python examples/record_file_description.py
886
969
  uv run python examples/record_directory_description.py
887
970
  uv run python examples/annotate_file.py
888
971
  uv run python examples/annotate_directory.py
972
+ uv run python examples/checksum_policy.py
889
973
  uv run python examples/annotate_file_answers_cli.py
890
974
  uv run python examples/write_file_manifest.py
891
975
  uv run python examples/write_directory_manifest.py
@@ -79,6 +79,11 @@ Every annotation document includes provenance with:
79
79
  directory content digests, and upstream annotation sidecar references when
80
80
  present
81
81
 
82
+ Local file hashing defaults to checksum policy `auto`: existing files are hashed
83
+ only when they are at most `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
84
+ their `sha256` or directory `content_digest` is left unset unless you provide a
85
+ precomputed checksum yourself.
86
+
82
87
  You can also attach your own parameters, input file paths, and function names.
83
88
  Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
84
89
  such as `s3://...` or `https://...` are preserved as provided.
@@ -472,6 +477,75 @@ README.
472
477
  If you want the direct writer approach instead, use `write_file_manifest(...)` and
473
478
  `write_directory_manifest(...)` (see `examples/`).
474
479
 
480
+ ## Checksum Policy
481
+
482
+ All provenance and annotation entry points that hash local files support the same
483
+ policy controls:
484
+
485
+ - `checksum_policy="auto"`: hash existing local files only when they are at or
486
+ below `max_checksum_bytes`. This is the default, and
487
+ `max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
488
+ - `checksum_policy="always"`: hash existing local files regardless of size.
489
+ - `checksum_policy="never"`: never hash local files automatically. Checksums are
490
+ recorded only when you supply them explicitly.
491
+
492
+ When a checksum is skipped, JSON sidecars keep the same schema and simply store
493
+ `sha256: null`. Directory `content_digest` is also left unset when any tracked
494
+ member file lacks a checksum.
495
+
496
+ You can change the policy from Python:
497
+
498
+ ```python
499
+ from data_annotations.annotations import annotate_file
500
+ from data_annotations.provenance import write_file_manifest
501
+
502
+ write_file_manifest(
503
+ "outputs/summary.txt",
504
+ checksum_policy="always",
505
+ )
506
+
507
+ annotate_file(
508
+ "outputs/summary.txt",
509
+ title="Run Summary",
510
+ summary="Post-hoc summary.",
511
+ artifact_sha256="precomputed-sha256",
512
+ checksum_policy="never",
513
+ )
514
+ ```
515
+
516
+ You can also inject precomputed checksums directly:
517
+
518
+ - File APIs: pass `artifact_sha256=...`.
519
+ - File or directory APIs: pass `checksum_overrides={path: sha256}`. For
520
+ directory outputs, keys can be relative to the output directory or absolute
521
+ paths.
522
+ - Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
523
+ `record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
524
+ same checksum-policy arguments.
525
+
526
+ From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
527
+ repeatable `--checksum PATH=SHA256`:
528
+
529
+ ```bash
530
+ data-annotations annotate file path/to/summary.txt \
531
+ --title "Run Summary" \
532
+ --summary "Post-hoc summary." \
533
+ --kind report \
534
+ --checksum-policy never \
535
+ --sha256 0123456789abcdef...
536
+
537
+ data-annotations annotate directory path/to/run-001 \
538
+ --title "Processing outputs" \
539
+ --summary "Directory-level outputs." \
540
+ --checksum-policy never \
541
+ --checksum processed.csv=0123456789abcdef...
542
+
543
+ data-annotations provenance chain path/to/run-001 \
544
+ --checksum-policy always
545
+ ```
546
+
547
+ For a complete runnable workflow, see `examples/checksum_policy.py`.
548
+
475
549
  ## Description Layer
476
550
 
477
551
  The `data_annotations.description` sub-package provides the structured description
@@ -598,6 +672,7 @@ target: path/to/participants.csv
598
672
  title: Participant Cohort
599
673
  summary: Participant-level cohort assignments.
600
674
  kind: dataset
675
+ sha256: 0123456789abcdef...
601
676
 
602
677
  inputs:
603
678
  - ${DATA_ROOT}/raw/participants.csv
@@ -640,6 +715,9 @@ provenance:
640
715
  command: bash process_from_instrument.sh
641
716
  script: process_from_instrument.sh
642
717
 
718
+ checksums:
719
+ processed.csv: 0123456789abcdef...
720
+
643
721
  artifacts:
644
722
  - path: processed.csv
645
723
  kind: dataset
@@ -735,6 +813,11 @@ resolving an older installed command. From a source checkout, use
735
813
  `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
736
814
  updated source before using the bare `data-annotations` command.
737
815
 
816
+ Both `match` and `chain` also accept `--checksum-policy` and
817
+ `--max-checksum-bytes`. Use `--checksum-policy always` when you want full
818
+ verification of large local files, and leave the default `auto` when you prefer
819
+ to avoid long checksum passes on very large artifacts.
820
+
738
821
  ### Run With `uvx`
739
822
 
740
823
  ```bash
@@ -856,6 +939,7 @@ uv run python examples/record_file_description.py
856
939
  uv run python examples/record_directory_description.py
857
940
  uv run python examples/annotate_file.py
858
941
  uv run python examples/annotate_directory.py
942
+ uv run python examples/checksum_policy.py
859
943
  uv run python examples/annotate_file_answers_cli.py
860
944
  uv run python examples/write_file_manifest.py
861
945
  uv run python examples/write_directory_manifest.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data-annotations"
3
- version = "2.5.0"
3
+ version = "2.6.0"
4
4
  description = "Annotate data artifacts with provenance and descriptions"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -9,6 +9,7 @@ if TYPE_CHECKING:
9
9
  DocumentedArtifactGroup,
10
10
  )
11
11
  from data_annotations.provenance.models import ChildBundle, ProducedFile
12
+ from data_annotations.provenance.models import ChecksumPolicy
12
13
 
13
14
  DEFAULT_INPUT_ARGS = ("input_path", "input_paths")
14
15
 
@@ -78,6 +79,8 @@ def coerce_produced_file(
78
79
  item: Any,
79
80
  *,
80
81
  normalize_paths: bool = True,
82
+ checksum_policy: "ChecksumPolicy" = "auto",
83
+ max_checksum_bytes: int | None = None,
81
84
  ) -> "ProducedFile":
82
85
  from data_annotations.description.models import DocumentedArtifact
83
86
  from data_annotations.provenance import writers as provenance_writers
@@ -89,7 +92,15 @@ def coerce_produced_file(
89
92
  path=str(path),
90
93
  kind=item.kind,
91
94
  sha256=(
92
- provenance_writers.sha256_file(path)
95
+ provenance_writers._resolve_file_sha256(
96
+ path,
97
+ checksum_policy=checksum_policy,
98
+ max_checksum_bytes=(
99
+ max_checksum_bytes
100
+ if max_checksum_bytes is not None
101
+ else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
102
+ ),
103
+ )
93
104
  if normalize_paths and path.exists()
94
105
  else None
95
106
  ),
@@ -106,7 +117,15 @@ def coerce_produced_file(
106
117
  path=str(normalized),
107
118
  kind=kind,
108
119
  sha256=(
109
- provenance_writers.sha256_file(normalized)
120
+ provenance_writers._resolve_file_sha256(
121
+ normalized,
122
+ checksum_policy=checksum_policy,
123
+ max_checksum_bytes=(
124
+ max_checksum_bytes
125
+ if max_checksum_bytes is not None
126
+ else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
127
+ ),
128
+ )
110
129
  if normalize_paths and normalized.exists()
111
130
  else None
112
131
  ),
@@ -117,7 +136,15 @@ def coerce_produced_file(
117
136
  path=str(path),
118
137
  kind="other",
119
138
  sha256=(
120
- provenance_writers.sha256_file(path)
139
+ provenance_writers._resolve_file_sha256(
140
+ path,
141
+ checksum_policy=checksum_policy,
142
+ max_checksum_bytes=(
143
+ max_checksum_bytes
144
+ if max_checksum_bytes is not None
145
+ else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
146
+ ),
147
+ )
121
148
  if normalize_paths and path.exists()
122
149
  else None
123
150
  ),
@@ -128,9 +155,17 @@ def coerce_produced_files(
128
155
  items: Iterable[Any],
129
156
  *,
130
157
  normalize_paths: bool = True,
158
+ checksum_policy: "ChecksumPolicy" = "auto",
159
+ max_checksum_bytes: int | None = None,
131
160
  ) -> list["ProducedFile"]:
132
161
  return [
133
- coerce_produced_file(item, normalize_paths=normalize_paths) for item in items
162
+ coerce_produced_file(
163
+ item,
164
+ normalize_paths=normalize_paths,
165
+ checksum_policy=checksum_policy,
166
+ max_checksum_bytes=max_checksum_bytes,
167
+ )
168
+ for item in items
134
169
  ]
135
170
 
136
171
 
@@ -1,4 +1,6 @@
1
+ from collections.abc import Mapping
1
2
  from functools import wraps
3
+ from pathlib import Path
2
4
  from typing import Any, Callable
3
5
 
4
6
  from data_annotations._decorators import (
@@ -12,7 +14,8 @@ from data_annotations._decorators import (
12
14
  split_child_bundles,
13
15
  )
14
16
  from data_annotations.description.models import DocumentedArtifact, FieldDefinition
15
- from data_annotations.provenance.models import ArtifactKind
17
+ from data_annotations.provenance import writers as provenance_writers
18
+ from data_annotations.provenance.models import ArtifactKind, ChecksumPolicy
16
19
 
17
20
  from .writers import annotate_directory, annotate_file
18
21
 
@@ -29,10 +32,14 @@ def record_file_annotation(
29
32
  acquisition_context: dict[str, Any] | None = None,
30
33
  generation_context: dict[str, Any] | None = None,
31
34
  artifact_kind: ArtifactKind = "other",
35
+ artifact_sha256: str | None = None,
32
36
  write_readme: bool = True,
33
37
  write_schema: bool | None = None,
34
38
  annotation_suffix: str = ".annotation.json",
35
39
  readme_suffix: str = ".README.md",
40
+ checksum_policy: ChecksumPolicy = "auto",
41
+ max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
42
+ checksum_overrides: Mapping[str | Path, str] | None = None,
36
43
  ):
37
44
  """
38
45
  Decorate a function that writes one annotated artifact.
@@ -68,6 +75,7 @@ def record_file_annotation(
68
75
  acquisition_context=acquisition_context,
69
76
  generation_context=generation_context,
70
77
  artifact_kind=artifact_kind,
78
+ artifact_sha256=artifact_sha256,
71
79
  params=params,
72
80
  inputs=inputs,
73
81
  function=fn,
@@ -75,6 +83,9 @@ def record_file_annotation(
75
83
  write_schema=write_schema,
76
84
  annotation_suffix=annotation_suffix,
77
85
  readme_suffix=readme_suffix,
86
+ checksum_policy=checksum_policy,
87
+ max_checksum_bytes=max_checksum_bytes,
88
+ checksum_overrides=checksum_overrides,
78
89
  )
79
90
  return result
80
91
 
@@ -95,6 +106,9 @@ def record_directory_annotation(
95
106
  write_schema: bool | None = None,
96
107
  annotation_filename: str = "data-annotations.json",
97
108
  readme_filename: str = "README.md",
109
+ checksum_policy: ChecksumPolicy = "auto",
110
+ max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
111
+ checksum_overrides: Mapping[str | Path, str] | None = None,
98
112
  ):
99
113
  """
100
114
  Decorate a function that writes several annotated outputs in a directory.
@@ -150,6 +164,9 @@ def record_directory_annotation(
150
164
  write_schema=write_schema,
151
165
  annotation_filename=annotation_filename,
152
166
  readme_filename=readme_filename,
167
+ checksum_policy=checksum_policy,
168
+ max_checksum_bytes=max_checksum_bytes,
169
+ checksum_overrides=checksum_overrides,
153
170
  )
154
171
  return result
155
172
 
@@ -1,4 +1,4 @@
1
- from collections.abc import Sequence
1
+ from collections.abc import Mapping, Sequence
2
2
  from pathlib import Path
3
3
  from typing import Any, Callable
4
4
 
@@ -17,6 +17,7 @@ from data_annotations.provenance import (
17
17
  ArtifactKind,
18
18
  BaseProvenance,
19
19
  ChildBundle,
20
+ ChecksumPolicy,
20
21
  ProducedFile,
21
22
  )
22
23
  from data_annotations.provenance import writers as provenance_writers
@@ -154,22 +155,30 @@ def _build_file_annotation_document(
154
155
  acquisition_context: dict[str, Any] | None = None,
155
156
  generation_context: dict[str, Any] | None = None,
156
157
  artifact_kind: ArtifactKind = "other",
158
+ artifact_sha256: str | None = None,
157
159
  params: dict[str, Any] | None = None,
158
160
  inputs: Sequence[str | Path] | None = None,
159
161
  function: Callable[..., Any] | None = None,
160
162
  capture_mode: str = "runtime",
161
163
  provenance_overrides: dict[str, Any] | None = None,
162
164
  normalize_inputs: bool = True,
165
+ checksum_policy: ChecksumPolicy = "auto",
166
+ max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
167
+ checksum_overrides: Mapping[str | Path, str] | None = None,
163
168
  ) -> FileAnnotationDocument:
164
169
  manifest = provenance_writers._build_file_manifest(
165
170
  artifact_path,
166
171
  artifact_kind=artifact_kind,
172
+ artifact_sha256=artifact_sha256,
167
173
  params=params,
168
174
  inputs=inputs,
169
175
  function=function,
170
176
  capture_mode=capture_mode,
171
177
  overrides=provenance_overrides,
172
178
  normalize_inputs=normalize_inputs,
179
+ checksum_policy=checksum_policy,
180
+ max_checksum_bytes=max_checksum_bytes,
181
+ checksum_overrides=checksum_overrides,
173
182
  )
174
183
 
175
184
  return FileAnnotationDocument(
@@ -211,6 +220,9 @@ def _build_directory_annotation_document(
211
220
  capture_mode: str = "runtime",
212
221
  provenance_overrides: dict[str, Any] | None = None,
213
222
  normalize_inputs: bool = True,
223
+ checksum_policy: ChecksumPolicy = "auto",
224
+ max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
225
+ checksum_overrides: Mapping[str | Path, str] | None = None,
214
226
  ) -> DirectoryAnnotationDocument:
215
227
  normalized_output_dir = Path(provenance_writers._normalize_local_path(output_dir))
216
228
  artifact_groups = artifact_groups or []
@@ -229,6 +241,9 @@ def _build_directory_annotation_document(
229
241
  capture_mode=capture_mode,
230
242
  overrides=provenance_overrides,
231
243
  normalize_inputs=normalize_inputs,
244
+ checksum_policy=checksum_policy,
245
+ max_checksum_bytes=max_checksum_bytes,
246
+ checksum_overrides=checksum_overrides,
232
247
  )
233
248
 
234
249
  return DirectoryAnnotationDocument(
@@ -285,6 +300,7 @@ def write_file_annotation(
285
300
  acquisition_context: dict[str, Any] | None = None,
286
301
  generation_context: dict[str, Any] | None = None,
287
302
  artifact_kind: ArtifactKind = "other",
303
+ artifact_sha256: str | None = None,
288
304
  params: dict[str, Any] | None = None,
289
305
  inputs: Sequence[str | Path] | None = None,
290
306
  function: Callable[..., Any] | None = None,
@@ -292,6 +308,9 @@ def write_file_annotation(
292
308
  provenance_overrides: dict[str, Any] | None = None,
293
309
  normalize_inputs: bool = True,
294
310
  suffix: str = ".annotation.json",
311
+ checksum_policy: ChecksumPolicy = "auto",
312
+ max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
313
+ checksum_overrides: Mapping[str | Path, str] | None = None,
295
314
  ) -> Path:
296
315
  document = _build_file_annotation_document(
297
316
  artifact_path,
@@ -303,12 +322,16 @@ def write_file_annotation(
303
322
  acquisition_context=acquisition_context,
304
323
  generation_context=generation_context,
305
324
  artifact_kind=artifact_kind,
325
+ artifact_sha256=artifact_sha256,
306
326
  params=params,
307
327
  inputs=inputs,
308
328
  function=function,
309
329
  capture_mode=capture_mode,
310
330
  provenance_overrides=provenance_overrides,
311
331
  normalize_inputs=normalize_inputs,
332
+ checksum_policy=checksum_policy,
333
+ max_checksum_bytes=max_checksum_bytes,
334
+ checksum_overrides=checksum_overrides,
312
335
  )
313
336
  annotation_path = Path(str(document.subject.path) + suffix)
314
337
  return _write_annotation_document(document, annotation_path)
@@ -331,6 +354,9 @@ def write_directory_annotation(
331
354
  provenance_overrides: dict[str, Any] | None = None,
332
355
  normalize_inputs: bool = True,
333
356
  filename: str = "data-annotations.json",
357
+ checksum_policy: ChecksumPolicy = "auto",
358
+ max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
359
+ checksum_overrides: Mapping[str | Path, str] | None = None,
334
360
  ) -> Path:
335
361
  document = _build_directory_annotation_document(
336
362
  output_dir,
@@ -347,6 +373,9 @@ def write_directory_annotation(
347
373
  capture_mode=capture_mode,
348
374
  provenance_overrides=provenance_overrides,
349
375
  normalize_inputs=normalize_inputs,
376
+ checksum_policy=checksum_policy,
377
+ max_checksum_bytes=max_checksum_bytes,
378
+ checksum_overrides=checksum_overrides,
350
379
  )
351
380
  annotation_path = Path(document.subject.path) / filename
352
381
  return _write_annotation_document(document, annotation_path)
@@ -363,6 +392,7 @@ def annotate_file(
363
392
  acquisition_context: dict[str, Any] | None = None,
364
393
  generation_context: dict[str, Any] | None = None,
365
394
  artifact_kind: ArtifactKind = "other",
395
+ artifact_sha256: str | None = None,
366
396
  params: dict[str, Any] | None = None,
367
397
  inputs: Sequence[str | Path] | None = None,
368
398
  function: Callable[..., Any] | None = None,
@@ -370,6 +400,9 @@ def annotate_file(
370
400
  write_schema: bool | None = None,
371
401
  annotation_suffix: str = ".annotation.json",
372
402
  readme_suffix: str = ".README.md",
403
+ checksum_policy: ChecksumPolicy = "auto",
404
+ max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
405
+ checksum_overrides: Mapping[str | Path, str] | None = None,
373
406
  ) -> FileAnnotationResult:
374
407
  document = _build_file_annotation_document(
375
408
  artifact_path,
@@ -381,9 +414,13 @@ def annotate_file(
381
414
  acquisition_context=acquisition_context,
382
415
  generation_context=generation_context,
383
416
  artifact_kind=artifact_kind,
417
+ artifact_sha256=artifact_sha256,
384
418
  params=params,
385
419
  inputs=inputs,
386
420
  function=function,
421
+ checksum_policy=checksum_policy,
422
+ max_checksum_bytes=max_checksum_bytes,
423
+ checksum_overrides=checksum_overrides,
387
424
  )
388
425
  artifact_path = Path(document.subject.path)
389
426
  annotation_path = _write_annotation_document(
@@ -425,6 +462,9 @@ def annotate_directory(
425
462
  write_schema: bool | None = None,
426
463
  annotation_filename: str = "data-annotations.json",
427
464
  readme_filename: str = "README.md",
465
+ checksum_policy: ChecksumPolicy = "auto",
466
+ max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
467
+ checksum_overrides: Mapping[str | Path, str] | None = None,
428
468
  ) -> DirectoryAnnotationResult:
429
469
  document = _build_directory_annotation_document(
430
470
  output_dir,
@@ -438,6 +478,9 @@ def annotate_directory(
438
478
  params=params,
439
479
  inputs=inputs,
440
480
  function=function,
481
+ checksum_policy=checksum_policy,
482
+ max_checksum_bytes=max_checksum_bytes,
483
+ checksum_overrides=checksum_overrides,
441
484
  )
442
485
  output_dir = Path(document.subject.path)
443
486
  annotation_path = _write_annotation_document(
@@ -24,6 +24,8 @@ from .helpers import (
24
24
  )
25
25
  from ..common import (
26
26
  CommandOption,
27
+ ChecksumPolicyOption,
28
+ ChecksumValuesOption,
27
29
  ForceOption,
28
30
  FunctionOption,
29
31
  GitBranchOption,
@@ -34,9 +36,11 @@ from ..common import (
34
36
  GitShaOption,
35
37
  GitTagsOption,
36
38
  InputValuesOption,
39
+ MaxChecksumBytesOption,
37
40
  ParamValuesOption,
38
41
  ScriptOption,
39
42
  ScriptRepoPathOption,
43
+ Sha256Option,
40
44
  SourceDownloadUriOption,
41
45
  SourceKindOption,
42
46
  SourcePathOption,
@@ -45,10 +49,12 @@ from ..common import (
45
49
  SourceUriOption,
46
50
  _annotation_paths_for_directory,
47
51
  _annotation_paths_for_file,
52
+ _collect_checksums,
48
53
  _discover_directory_entries,
49
54
  _ensure_annotation_outputs_available,
50
55
  _error,
51
56
  _validate_artifact_kind,
57
+ _validate_checksum_policy,
52
58
  )
53
59
 
54
60
  DEFAULT_ANNOTATION_INSTRUCTIONS = """
@@ -119,6 +125,10 @@ def annotate_file_command(
119
125
  source_path: SourcePathOption = None,
120
126
  source_revision: SourceRevisionOption = None,
121
127
  source_sha256: SourceSha256Option = None,
128
+ checksum_policy: ChecksumPolicyOption = "auto",
129
+ max_checksum_bytes: MaxChecksumBytesOption = 10 * 1024**3,
130
+ sha256: Sha256Option = None,
131
+ checksum_values: ChecksumValuesOption = None,
122
132
  force: ForceOption = False,
123
133
  ) -> None:
124
134
  file_answers = _load_file_answers(answers_path)
@@ -209,6 +219,13 @@ def annotate_file_command(
209
219
  if artifact_kind in {"dataset", "table"}:
210
220
  if is_interactive and file_answers is None:
211
221
  fields, primary_key, missing_value_codes = prompts._prompt_schema_details()
222
+ selected_checksum_policy = _validate_checksum_policy(checksum_policy)
223
+ checksum_overrides: dict[str | Path, str] = _collect_checksums(checksum_values)
224
+ artifact_sha256 = (
225
+ sha256
226
+ if sha256 is not None
227
+ else (file_answers.sha256 if file_answers else None)
228
+ )
212
229
 
213
230
  annotation_path, readme_path = _write_post_hoc_file_bundle(
214
231
  artifact_path,
@@ -221,6 +238,10 @@ def annotate_file_command(
221
238
  inputs=inputs,
222
239
  params=params,
223
240
  provenance_overrides=provenance_overrides,
241
+ artifact_sha256=artifact_sha256,
242
+ checksum_policy=selected_checksum_policy,
243
+ max_checksum_bytes=max_checksum_bytes,
244
+ checksum_overrides=checksum_overrides,
224
245
  )
225
246
  typer.echo("")
226
247
  typer.echo(f"Annotation: {annotation_path}")
@@ -297,6 +318,9 @@ def annotate_directory_command(
297
318
  "--group-kind",
298
319
  help="Artifact kind for the corresponding --group-selector.",
299
320
  ),
321
+ checksum_policy: ChecksumPolicyOption = "auto",
322
+ max_checksum_bytes: MaxChecksumBytesOption = 10 * 1024**3,
323
+ checksum_values: ChecksumValuesOption = None,
300
324
  force: ForceOption = False,
301
325
  ) -> None:
302
326
  directory_answers = _load_directory_answers(answers_path)
@@ -407,6 +431,11 @@ def annotate_directory_command(
407
431
  )
408
432
 
409
433
  default_kind = _validate_artifact_kind(kind) if kind is not None else None
434
+ selected_checksum_policy = _validate_checksum_policy(checksum_policy)
435
+ checksum_overrides: dict[str | Path, str] = (
436
+ dict(directory_answers.checksums) if directory_answers is not None else {}
437
+ )
438
+ checksum_overrides.update(_collect_checksums(checksum_values))
410
439
  if directory_answers is not None and has_answers_inventory:
411
440
  artifacts = _documented_artifacts_from_answers(directory_answers.artifacts)
412
441
  if not group_selector_values:
@@ -486,6 +515,9 @@ def annotate_directory_command(
486
515
  inputs=inputs,
487
516
  params=params,
488
517
  provenance_overrides=provenance_overrides,
518
+ checksum_policy=selected_checksum_policy,
519
+ max_checksum_bytes=max_checksum_bytes,
520
+ checksum_overrides=checksum_overrides,
489
521
  )
490
522
 
491
523
  typer.echo(f"Annotation: {annotation_path}")