data-annotations 2.5.0__tar.gz → 2.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_annotations-2.5.0 → data_annotations-2.6.0}/PKG-INFO +85 -1
- {data_annotations-2.5.0 → data_annotations-2.6.0}/README.md +84 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/pyproject.toml +1 -1
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/_decorators.py +39 -4
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/annotations/decorators.py +18 -1
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/annotations/writers.py +44 -1
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/annotate/__init__.py +32 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/annotate/helpers.py +17 -1
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/answers.py +14 -1
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/common.py +57 -1
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/provenance_commands.py +15 -1
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/__init__.py +4 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/decorators.py +17 -1
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/models.py +1 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/chain.py +60 -11
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/matching.py +76 -15
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/writers.py +200 -18
- {data_annotations-2.5.0 → data_annotations-2.6.0}/LICENSE +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/__init__.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/annotations/__init__.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/annotations/models.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/__init__.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/prompts.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/publish.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/description/__init__.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/description/decorators.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/description/models.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/description/writers.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/git.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/__init__.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/manifest.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/sources.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/recovery/types.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/provenance/runtime.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/publish.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-annotations
|
|
3
|
-
Version: 2.5.0
|
|
3
|
+
Version: 2.6.0
|
|
4
4
|
Summary: Annotate data artifacts with provenance and descriptions
|
|
5
5
|
Keywords: annotations,data,metadata,provenance,reproducibility
|
|
6
6
|
Author: Rodrigo C. G. Pena
|
|
@@ -109,6 +109,11 @@ Every annotation document includes provenance with:
|
|
|
109
109
|
directory content digests, and upstream annotation sidecar references when
|
|
110
110
|
present
|
|
111
111
|
|
|
112
|
+
Local file hashing defaults to checksum policy `auto`: existing files are hashed
|
|
113
|
+
only when they are at most `10 * 1024**3` bytes (10 GiB) in size. Larger files are still recorded, but
|
|
114
|
+
their `sha256` or directory `content_digest` is left unset unless you provide a
|
|
115
|
+
precomputed checksum yourself.
|
|
116
|
+
|
|
112
117
|
You can also attach your own parameters, input file paths, and function names.
|
|
113
118
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
114
119
|
such as `s3://...` or `https://...` are preserved as provided.
|
|
@@ -502,6 +507,75 @@ README.
|
|
|
502
507
|
If you want the direct writer approach instead, use `write_file_manifest(...)` and
|
|
503
508
|
`write_directory_manifest(...)` (see `examples/`).
|
|
504
509
|
|
|
510
|
+
## Checksum Policy
|
|
511
|
+
|
|
512
|
+
All provenance and annotation entry points that hash local files support the same
|
|
513
|
+
policy controls:
|
|
514
|
+
|
|
515
|
+
- `checksum_policy="auto"`: hash existing local files only when they are at or
|
|
516
|
+
below `max_checksum_bytes`. This is the default, and
|
|
517
|
+
`max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
|
|
518
|
+
- `checksum_policy="always"`: hash existing local files regardless of size.
|
|
519
|
+
- `checksum_policy="never"`: never hash local files automatically. Checksums are
|
|
520
|
+
recorded only when you supply them explicitly.
|
|
521
|
+
|
|
522
|
+
When a checksum is skipped, JSON sidecars keep the same schema and simply store
|
|
523
|
+
`sha256: null`. Directory `content_digest` is also left unset when any tracked
|
|
524
|
+
member file lacks a checksum.
|
|
525
|
+
|
|
526
|
+
You can change the policy from Python:
|
|
527
|
+
|
|
528
|
+
```python
|
|
529
|
+
from data_annotations.annotations import annotate_file
|
|
530
|
+
from data_annotations.provenance import write_file_manifest
|
|
531
|
+
|
|
532
|
+
write_file_manifest(
|
|
533
|
+
"outputs/summary.txt",
|
|
534
|
+
checksum_policy="always",
|
|
535
|
+
)
|
|
536
|
+
|
|
537
|
+
annotate_file(
|
|
538
|
+
"outputs/summary.txt",
|
|
539
|
+
title="Run Summary",
|
|
540
|
+
summary="Post-hoc summary.",
|
|
541
|
+
artifact_sha256="precomputed-sha256",
|
|
542
|
+
checksum_policy="never",
|
|
543
|
+
)
|
|
544
|
+
```
|
|
545
|
+
|
|
546
|
+
You can also inject precomputed checksums directly:
|
|
547
|
+
|
|
548
|
+
- File APIs: pass `artifact_sha256=...`.
|
|
549
|
+
- File or directory APIs: pass `checksum_overrides={path: sha256}`. For
|
|
550
|
+
directory outputs, keys can be relative to the output directory or absolute
|
|
551
|
+
paths.
|
|
552
|
+
- Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
|
|
553
|
+
`record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
|
|
554
|
+
same checksum-policy arguments.
|
|
555
|
+
|
|
556
|
+
From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
|
|
557
|
+
repeatable `--checksum PATH=SHA256`:
|
|
558
|
+
|
|
559
|
+
```bash
|
|
560
|
+
data-annotations annotate file path/to/summary.txt \
|
|
561
|
+
--title "Run Summary" \
|
|
562
|
+
--summary "Post-hoc summary." \
|
|
563
|
+
--kind report \
|
|
564
|
+
--checksum-policy never \
|
|
565
|
+
--sha256 0123456789abcdef...
|
|
566
|
+
|
|
567
|
+
data-annotations annotate directory path/to/run-001 \
|
|
568
|
+
--title "Processing outputs" \
|
|
569
|
+
--summary "Directory-level outputs." \
|
|
570
|
+
--checksum-policy never \
|
|
571
|
+
--checksum processed.csv=0123456789abcdef...
|
|
572
|
+
|
|
573
|
+
data-annotations provenance chain path/to/run-001 \
|
|
574
|
+
--checksum-policy always
|
|
575
|
+
```
|
|
576
|
+
|
|
577
|
+
For a complete runnable workflow, see `examples/checksum_policy.py`.
|
|
578
|
+
|
|
505
579
|
## Description Layer
|
|
506
580
|
|
|
507
581
|
The `data_annotations.description` sub-package provides the structured description
|
|
@@ -628,6 +702,7 @@ target: path/to/participants.csv
|
|
|
628
702
|
title: Participant Cohort
|
|
629
703
|
summary: Participant-level cohort assignments.
|
|
630
704
|
kind: dataset
|
|
705
|
+
sha256: 0123456789abcdef...
|
|
631
706
|
|
|
632
707
|
inputs:
|
|
633
708
|
- ${DATA_ROOT}/raw/participants.csv
|
|
@@ -670,6 +745,9 @@ provenance:
|
|
|
670
745
|
command: bash process_from_instrument.sh
|
|
671
746
|
script: process_from_instrument.sh
|
|
672
747
|
|
|
748
|
+
checksums:
|
|
749
|
+
processed.csv: 0123456789abcdef...
|
|
750
|
+
|
|
673
751
|
artifacts:
|
|
674
752
|
- path: processed.csv
|
|
675
753
|
kind: dataset
|
|
@@ -765,6 +843,11 @@ resolving an older installed command. From a source checkout, use
|
|
|
765
843
|
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
766
844
|
updated source before using the bare `data-annotations` command.
|
|
767
845
|
|
|
846
|
+
Both `match` and `chain` also accept `--checksum-policy` and
|
|
847
|
+
`--max-checksum-bytes`. Use `--checksum-policy always` when you want full
|
|
848
|
+
verification of large local files, and leave the default `auto` when you prefer
|
|
849
|
+
to avoid long checksum passes on very large artifacts.
|
|
850
|
+
|
|
768
851
|
### Run With `uvx`
|
|
769
852
|
|
|
770
853
|
```bash
|
|
@@ -886,6 +969,7 @@ uv run python examples/record_file_description.py
|
|
|
886
969
|
uv run python examples/record_directory_description.py
|
|
887
970
|
uv run python examples/annotate_file.py
|
|
888
971
|
uv run python examples/annotate_directory.py
|
|
972
|
+
uv run python examples/checksum_policy.py
|
|
889
973
|
uv run python examples/annotate_file_answers_cli.py
|
|
890
974
|
uv run python examples/write_file_manifest.py
|
|
891
975
|
uv run python examples/write_directory_manifest.py
|
|
@@ -79,6 +79,11 @@ Every annotation document includes provenance with:
|
|
|
79
79
|
directory content digests, and upstream annotation sidecar references when
|
|
80
80
|
present
|
|
81
81
|
|
|
82
|
+
Local file hashing defaults to checksum policy `auto`: existing files are hashed
|
|
83
|
+
only when they are at most `10 * 1024**3` bytes (10 GiB) in size. Larger files are still recorded, but
|
|
84
|
+
their `sha256` or directory `content_digest` is left unset unless you provide a
|
|
85
|
+
precomputed checksum yourself.
|
|
86
|
+
|
|
82
87
|
You can also attach your own parameters, input file paths, and function names.
|
|
83
88
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
84
89
|
such as `s3://...` or `https://...` are preserved as provided.
|
|
@@ -472,6 +477,75 @@ README.
|
|
|
472
477
|
If you want the direct writer approach instead, use `write_file_manifest(...)` and
|
|
473
478
|
`write_directory_manifest(...)` (see `examples/`).
|
|
474
479
|
|
|
480
|
+
## Checksum Policy
|
|
481
|
+
|
|
482
|
+
All provenance and annotation entry points that hash local files support the same
|
|
483
|
+
policy controls:
|
|
484
|
+
|
|
485
|
+
- `checksum_policy="auto"`: hash existing local files only when they are at or
|
|
486
|
+
below `max_checksum_bytes`. This is the default, and
|
|
487
|
+
`max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
|
|
488
|
+
- `checksum_policy="always"`: hash existing local files regardless of size.
|
|
489
|
+
- `checksum_policy="never"`: never hash local files automatically. Checksums are
|
|
490
|
+
recorded only when you supply them explicitly.
|
|
491
|
+
|
|
492
|
+
When a checksum is skipped, JSON sidecars keep the same schema and simply store
|
|
493
|
+
`sha256: null`. Directory `content_digest` is also left unset when any tracked
|
|
494
|
+
member file lacks a checksum.
|
|
495
|
+
|
|
496
|
+
You can change the policy from Python:
|
|
497
|
+
|
|
498
|
+
```python
|
|
499
|
+
from data_annotations.annotations import annotate_file
|
|
500
|
+
from data_annotations.provenance import write_file_manifest
|
|
501
|
+
|
|
502
|
+
write_file_manifest(
|
|
503
|
+
"outputs/summary.txt",
|
|
504
|
+
checksum_policy="always",
|
|
505
|
+
)
|
|
506
|
+
|
|
507
|
+
annotate_file(
|
|
508
|
+
"outputs/summary.txt",
|
|
509
|
+
title="Run Summary",
|
|
510
|
+
summary="Post-hoc summary.",
|
|
511
|
+
artifact_sha256="precomputed-sha256",
|
|
512
|
+
checksum_policy="never",
|
|
513
|
+
)
|
|
514
|
+
```
|
|
515
|
+
|
|
516
|
+
You can also inject precomputed checksums directly:
|
|
517
|
+
|
|
518
|
+
- File APIs: pass `artifact_sha256=...`.
|
|
519
|
+
- File or directory APIs: pass `checksum_overrides={path: sha256}`. For
|
|
520
|
+
directory outputs, keys can be relative to the output directory or absolute
|
|
521
|
+
paths.
|
|
522
|
+
- Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
|
|
523
|
+
`record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
|
|
524
|
+
same checksum-policy arguments.
|
|
525
|
+
|
|
526
|
+
From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
|
|
527
|
+
repeatable `--checksum PATH=SHA256`:
|
|
528
|
+
|
|
529
|
+
```bash
|
|
530
|
+
data-annotations annotate file path/to/summary.txt \
|
|
531
|
+
--title "Run Summary" \
|
|
532
|
+
--summary "Post-hoc summary." \
|
|
533
|
+
--kind report \
|
|
534
|
+
--checksum-policy never \
|
|
535
|
+
--sha256 0123456789abcdef...
|
|
536
|
+
|
|
537
|
+
data-annotations annotate directory path/to/run-001 \
|
|
538
|
+
--title "Processing outputs" \
|
|
539
|
+
--summary "Directory-level outputs." \
|
|
540
|
+
--checksum-policy never \
|
|
541
|
+
--checksum processed.csv=0123456789abcdef...
|
|
542
|
+
|
|
543
|
+
data-annotations provenance chain path/to/run-001 \
|
|
544
|
+
--checksum-policy always
|
|
545
|
+
```
|
|
546
|
+
|
|
547
|
+
For a complete runnable workflow, see `examples/checksum_policy.py`.
|
|
548
|
+
|
|
475
549
|
## Description Layer
|
|
476
550
|
|
|
477
551
|
The `data_annotations.description` sub-package provides the structured description
|
|
@@ -598,6 +672,7 @@ target: path/to/participants.csv
|
|
|
598
672
|
title: Participant Cohort
|
|
599
673
|
summary: Participant-level cohort assignments.
|
|
600
674
|
kind: dataset
|
|
675
|
+
sha256: 0123456789abcdef...
|
|
601
676
|
|
|
602
677
|
inputs:
|
|
603
678
|
- ${DATA_ROOT}/raw/participants.csv
|
|
@@ -640,6 +715,9 @@ provenance:
|
|
|
640
715
|
command: bash process_from_instrument.sh
|
|
641
716
|
script: process_from_instrument.sh
|
|
642
717
|
|
|
718
|
+
checksums:
|
|
719
|
+
processed.csv: 0123456789abcdef...
|
|
720
|
+
|
|
643
721
|
artifacts:
|
|
644
722
|
- path: processed.csv
|
|
645
723
|
kind: dataset
|
|
@@ -735,6 +813,11 @@ resolving an older installed command. From a source checkout, use
|
|
|
735
813
|
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
736
814
|
updated source before using the bare `data-annotations` command.
|
|
737
815
|
|
|
816
|
+
Both `match` and `chain` also accept `--checksum-policy` and
|
|
817
|
+
`--max-checksum-bytes`. Use `--checksum-policy always` when you want full
|
|
818
|
+
verification of large local files, and leave the default `auto` when you prefer
|
|
819
|
+
to avoid long checksum passes on very large artifacts.
|
|
820
|
+
|
|
738
821
|
### Run With `uvx`
|
|
739
822
|
|
|
740
823
|
```bash
|
|
@@ -856,6 +939,7 @@ uv run python examples/record_file_description.py
|
|
|
856
939
|
uv run python examples/record_directory_description.py
|
|
857
940
|
uv run python examples/annotate_file.py
|
|
858
941
|
uv run python examples/annotate_directory.py
|
|
942
|
+
uv run python examples/checksum_policy.py
|
|
859
943
|
uv run python examples/annotate_file_answers_cli.py
|
|
860
944
|
uv run python examples/write_file_manifest.py
|
|
861
945
|
uv run python examples/write_directory_manifest.py
|
|
@@ -9,6 +9,7 @@ if TYPE_CHECKING:
|
|
|
9
9
|
DocumentedArtifactGroup,
|
|
10
10
|
)
|
|
11
11
|
from data_annotations.provenance.models import ChildBundle, ProducedFile
|
|
12
|
+
from data_annotations.provenance.models import ChecksumPolicy
|
|
12
13
|
|
|
13
14
|
DEFAULT_INPUT_ARGS = ("input_path", "input_paths")
|
|
14
15
|
|
|
@@ -78,6 +79,8 @@ def coerce_produced_file(
|
|
|
78
79
|
item: Any,
|
|
79
80
|
*,
|
|
80
81
|
normalize_paths: bool = True,
|
|
82
|
+
checksum_policy: "ChecksumPolicy" = "auto",
|
|
83
|
+
max_checksum_bytes: int | None = None,
|
|
81
84
|
) -> "ProducedFile":
|
|
82
85
|
from data_annotations.description.models import DocumentedArtifact
|
|
83
86
|
from data_annotations.provenance import writers as provenance_writers
|
|
@@ -89,7 +92,15 @@ def coerce_produced_file(
|
|
|
89
92
|
path=str(path),
|
|
90
93
|
kind=item.kind,
|
|
91
94
|
sha256=(
|
|
92
|
-
provenance_writers.
|
|
95
|
+
provenance_writers._resolve_file_sha256(
|
|
96
|
+
path,
|
|
97
|
+
checksum_policy=checksum_policy,
|
|
98
|
+
max_checksum_bytes=(
|
|
99
|
+
max_checksum_bytes
|
|
100
|
+
if max_checksum_bytes is not None
|
|
101
|
+
else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
|
|
102
|
+
),
|
|
103
|
+
)
|
|
93
104
|
if normalize_paths and path.exists()
|
|
94
105
|
else None
|
|
95
106
|
),
|
|
@@ -106,7 +117,15 @@ def coerce_produced_file(
|
|
|
106
117
|
path=str(normalized),
|
|
107
118
|
kind=kind,
|
|
108
119
|
sha256=(
|
|
109
|
-
provenance_writers.
|
|
120
|
+
provenance_writers._resolve_file_sha256(
|
|
121
|
+
normalized,
|
|
122
|
+
checksum_policy=checksum_policy,
|
|
123
|
+
max_checksum_bytes=(
|
|
124
|
+
max_checksum_bytes
|
|
125
|
+
if max_checksum_bytes is not None
|
|
126
|
+
else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
|
|
127
|
+
),
|
|
128
|
+
)
|
|
110
129
|
if normalize_paths and normalized.exists()
|
|
111
130
|
else None
|
|
112
131
|
),
|
|
@@ -117,7 +136,15 @@ def coerce_produced_file(
|
|
|
117
136
|
path=str(path),
|
|
118
137
|
kind="other",
|
|
119
138
|
sha256=(
|
|
120
|
-
provenance_writers.
|
|
139
|
+
provenance_writers._resolve_file_sha256(
|
|
140
|
+
path,
|
|
141
|
+
checksum_policy=checksum_policy,
|
|
142
|
+
max_checksum_bytes=(
|
|
143
|
+
max_checksum_bytes
|
|
144
|
+
if max_checksum_bytes is not None
|
|
145
|
+
else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
|
|
146
|
+
),
|
|
147
|
+
)
|
|
121
148
|
if normalize_paths and path.exists()
|
|
122
149
|
else None
|
|
123
150
|
),
|
|
@@ -128,9 +155,17 @@ def coerce_produced_files(
|
|
|
128
155
|
items: Iterable[Any],
|
|
129
156
|
*,
|
|
130
157
|
normalize_paths: bool = True,
|
|
158
|
+
checksum_policy: "ChecksumPolicy" = "auto",
|
|
159
|
+
max_checksum_bytes: int | None = None,
|
|
131
160
|
) -> list["ProducedFile"]:
|
|
132
161
|
return [
|
|
133
|
-
coerce_produced_file(item, normalize_paths=normalize_paths) for item in items
|
|
162
|
+
coerce_produced_file(
|
|
163
|
+
item,
|
|
164
|
+
normalize_paths=normalize_paths,
|
|
165
|
+
checksum_policy=checksum_policy,
|
|
166
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
167
|
+
)
|
|
168
|
+
for item in items
|
|
134
169
|
]
|
|
135
170
|
|
|
136
171
|
|
{data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/annotations/decorators.py
RENAMED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
+
from collections.abc import Mapping
|
|
1
2
|
from functools import wraps
|
|
3
|
+
from pathlib import Path
|
|
2
4
|
from typing import Any, Callable
|
|
3
5
|
|
|
4
6
|
from data_annotations._decorators import (
|
|
@@ -12,7 +14,8 @@ from data_annotations._decorators import (
|
|
|
12
14
|
split_child_bundles,
|
|
13
15
|
)
|
|
14
16
|
from data_annotations.description.models import DocumentedArtifact, FieldDefinition
|
|
15
|
-
from data_annotations.provenance.models import ArtifactKind
|
|
17
|
+
from data_annotations.provenance import writers as provenance_writers
|
|
18
|
+
from data_annotations.provenance.models import ArtifactKind, ChecksumPolicy
|
|
16
19
|
|
|
17
20
|
from .writers import annotate_directory, annotate_file
|
|
18
21
|
|
|
@@ -29,10 +32,14 @@ def record_file_annotation(
|
|
|
29
32
|
acquisition_context: dict[str, Any] | None = None,
|
|
30
33
|
generation_context: dict[str, Any] | None = None,
|
|
31
34
|
artifact_kind: ArtifactKind = "other",
|
|
35
|
+
artifact_sha256: str | None = None,
|
|
32
36
|
write_readme: bool = True,
|
|
33
37
|
write_schema: bool | None = None,
|
|
34
38
|
annotation_suffix: str = ".annotation.json",
|
|
35
39
|
readme_suffix: str = ".README.md",
|
|
40
|
+
checksum_policy: ChecksumPolicy = "auto",
|
|
41
|
+
max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
|
|
42
|
+
checksum_overrides: Mapping[str | Path, str] | None = None,
|
|
36
43
|
):
|
|
37
44
|
"""
|
|
38
45
|
Decorate a function that writes one annotated artifact.
|
|
@@ -68,6 +75,7 @@ def record_file_annotation(
|
|
|
68
75
|
acquisition_context=acquisition_context,
|
|
69
76
|
generation_context=generation_context,
|
|
70
77
|
artifact_kind=artifact_kind,
|
|
78
|
+
artifact_sha256=artifact_sha256,
|
|
71
79
|
params=params,
|
|
72
80
|
inputs=inputs,
|
|
73
81
|
function=fn,
|
|
@@ -75,6 +83,9 @@ def record_file_annotation(
|
|
|
75
83
|
write_schema=write_schema,
|
|
76
84
|
annotation_suffix=annotation_suffix,
|
|
77
85
|
readme_suffix=readme_suffix,
|
|
86
|
+
checksum_policy=checksum_policy,
|
|
87
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
88
|
+
checksum_overrides=checksum_overrides,
|
|
78
89
|
)
|
|
79
90
|
return result
|
|
80
91
|
|
|
@@ -95,6 +106,9 @@ def record_directory_annotation(
|
|
|
95
106
|
write_schema: bool | None = None,
|
|
96
107
|
annotation_filename: str = "data-annotations.json",
|
|
97
108
|
readme_filename: str = "README.md",
|
|
109
|
+
checksum_policy: ChecksumPolicy = "auto",
|
|
110
|
+
max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
|
|
111
|
+
checksum_overrides: Mapping[str | Path, str] | None = None,
|
|
98
112
|
):
|
|
99
113
|
"""
|
|
100
114
|
Decorate a function that writes several annotated outputs in a directory.
|
|
@@ -150,6 +164,9 @@ def record_directory_annotation(
|
|
|
150
164
|
write_schema=write_schema,
|
|
151
165
|
annotation_filename=annotation_filename,
|
|
152
166
|
readme_filename=readme_filename,
|
|
167
|
+
checksum_policy=checksum_policy,
|
|
168
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
169
|
+
checksum_overrides=checksum_overrides,
|
|
153
170
|
)
|
|
154
171
|
return result
|
|
155
172
|
|
{data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/annotations/writers.py
RENAMED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from collections.abc import Sequence
|
|
1
|
+
from collections.abc import Mapping, Sequence
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from typing import Any, Callable
|
|
4
4
|
|
|
@@ -17,6 +17,7 @@ from data_annotations.provenance import (
|
|
|
17
17
|
ArtifactKind,
|
|
18
18
|
BaseProvenance,
|
|
19
19
|
ChildBundle,
|
|
20
|
+
ChecksumPolicy,
|
|
20
21
|
ProducedFile,
|
|
21
22
|
)
|
|
22
23
|
from data_annotations.provenance import writers as provenance_writers
|
|
@@ -154,22 +155,30 @@ def _build_file_annotation_document(
|
|
|
154
155
|
acquisition_context: dict[str, Any] | None = None,
|
|
155
156
|
generation_context: dict[str, Any] | None = None,
|
|
156
157
|
artifact_kind: ArtifactKind = "other",
|
|
158
|
+
artifact_sha256: str | None = None,
|
|
157
159
|
params: dict[str, Any] | None = None,
|
|
158
160
|
inputs: Sequence[str | Path] | None = None,
|
|
159
161
|
function: Callable[..., Any] | None = None,
|
|
160
162
|
capture_mode: str = "runtime",
|
|
161
163
|
provenance_overrides: dict[str, Any] | None = None,
|
|
162
164
|
normalize_inputs: bool = True,
|
|
165
|
+
checksum_policy: ChecksumPolicy = "auto",
|
|
166
|
+
max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
|
|
167
|
+
checksum_overrides: Mapping[str | Path, str] | None = None,
|
|
163
168
|
) -> FileAnnotationDocument:
|
|
164
169
|
manifest = provenance_writers._build_file_manifest(
|
|
165
170
|
artifact_path,
|
|
166
171
|
artifact_kind=artifact_kind,
|
|
172
|
+
artifact_sha256=artifact_sha256,
|
|
167
173
|
params=params,
|
|
168
174
|
inputs=inputs,
|
|
169
175
|
function=function,
|
|
170
176
|
capture_mode=capture_mode,
|
|
171
177
|
overrides=provenance_overrides,
|
|
172
178
|
normalize_inputs=normalize_inputs,
|
|
179
|
+
checksum_policy=checksum_policy,
|
|
180
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
181
|
+
checksum_overrides=checksum_overrides,
|
|
173
182
|
)
|
|
174
183
|
|
|
175
184
|
return FileAnnotationDocument(
|
|
@@ -211,6 +220,9 @@ def _build_directory_annotation_document(
|
|
|
211
220
|
capture_mode: str = "runtime",
|
|
212
221
|
provenance_overrides: dict[str, Any] | None = None,
|
|
213
222
|
normalize_inputs: bool = True,
|
|
223
|
+
checksum_policy: ChecksumPolicy = "auto",
|
|
224
|
+
max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
|
|
225
|
+
checksum_overrides: Mapping[str | Path, str] | None = None,
|
|
214
226
|
) -> DirectoryAnnotationDocument:
|
|
215
227
|
normalized_output_dir = Path(provenance_writers._normalize_local_path(output_dir))
|
|
216
228
|
artifact_groups = artifact_groups or []
|
|
@@ -229,6 +241,9 @@ def _build_directory_annotation_document(
|
|
|
229
241
|
capture_mode=capture_mode,
|
|
230
242
|
overrides=provenance_overrides,
|
|
231
243
|
normalize_inputs=normalize_inputs,
|
|
244
|
+
checksum_policy=checksum_policy,
|
|
245
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
246
|
+
checksum_overrides=checksum_overrides,
|
|
232
247
|
)
|
|
233
248
|
|
|
234
249
|
return DirectoryAnnotationDocument(
|
|
@@ -285,6 +300,7 @@ def write_file_annotation(
|
|
|
285
300
|
acquisition_context: dict[str, Any] | None = None,
|
|
286
301
|
generation_context: dict[str, Any] | None = None,
|
|
287
302
|
artifact_kind: ArtifactKind = "other",
|
|
303
|
+
artifact_sha256: str | None = None,
|
|
288
304
|
params: dict[str, Any] | None = None,
|
|
289
305
|
inputs: Sequence[str | Path] | None = None,
|
|
290
306
|
function: Callable[..., Any] | None = None,
|
|
@@ -292,6 +308,9 @@ def write_file_annotation(
|
|
|
292
308
|
provenance_overrides: dict[str, Any] | None = None,
|
|
293
309
|
normalize_inputs: bool = True,
|
|
294
310
|
suffix: str = ".annotation.json",
|
|
311
|
+
checksum_policy: ChecksumPolicy = "auto",
|
|
312
|
+
max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
|
|
313
|
+
checksum_overrides: Mapping[str | Path, str] | None = None,
|
|
295
314
|
) -> Path:
|
|
296
315
|
document = _build_file_annotation_document(
|
|
297
316
|
artifact_path,
|
|
@@ -303,12 +322,16 @@ def write_file_annotation(
|
|
|
303
322
|
acquisition_context=acquisition_context,
|
|
304
323
|
generation_context=generation_context,
|
|
305
324
|
artifact_kind=artifact_kind,
|
|
325
|
+
artifact_sha256=artifact_sha256,
|
|
306
326
|
params=params,
|
|
307
327
|
inputs=inputs,
|
|
308
328
|
function=function,
|
|
309
329
|
capture_mode=capture_mode,
|
|
310
330
|
provenance_overrides=provenance_overrides,
|
|
311
331
|
normalize_inputs=normalize_inputs,
|
|
332
|
+
checksum_policy=checksum_policy,
|
|
333
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
334
|
+
checksum_overrides=checksum_overrides,
|
|
312
335
|
)
|
|
313
336
|
annotation_path = Path(str(document.subject.path) + suffix)
|
|
314
337
|
return _write_annotation_document(document, annotation_path)
|
|
@@ -331,6 +354,9 @@ def write_directory_annotation(
|
|
|
331
354
|
provenance_overrides: dict[str, Any] | None = None,
|
|
332
355
|
normalize_inputs: bool = True,
|
|
333
356
|
filename: str = "data-annotations.json",
|
|
357
|
+
checksum_policy: ChecksumPolicy = "auto",
|
|
358
|
+
max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
|
|
359
|
+
checksum_overrides: Mapping[str | Path, str] | None = None,
|
|
334
360
|
) -> Path:
|
|
335
361
|
document = _build_directory_annotation_document(
|
|
336
362
|
output_dir,
|
|
@@ -347,6 +373,9 @@ def write_directory_annotation(
|
|
|
347
373
|
capture_mode=capture_mode,
|
|
348
374
|
provenance_overrides=provenance_overrides,
|
|
349
375
|
normalize_inputs=normalize_inputs,
|
|
376
|
+
checksum_policy=checksum_policy,
|
|
377
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
378
|
+
checksum_overrides=checksum_overrides,
|
|
350
379
|
)
|
|
351
380
|
annotation_path = Path(document.subject.path) / filename
|
|
352
381
|
return _write_annotation_document(document, annotation_path)
|
|
@@ -363,6 +392,7 @@ def annotate_file(
|
|
|
363
392
|
acquisition_context: dict[str, Any] | None = None,
|
|
364
393
|
generation_context: dict[str, Any] | None = None,
|
|
365
394
|
artifact_kind: ArtifactKind = "other",
|
|
395
|
+
artifact_sha256: str | None = None,
|
|
366
396
|
params: dict[str, Any] | None = None,
|
|
367
397
|
inputs: Sequence[str | Path] | None = None,
|
|
368
398
|
function: Callable[..., Any] | None = None,
|
|
@@ -370,6 +400,9 @@ def annotate_file(
|
|
|
370
400
|
write_schema: bool | None = None,
|
|
371
401
|
annotation_suffix: str = ".annotation.json",
|
|
372
402
|
readme_suffix: str = ".README.md",
|
|
403
|
+
checksum_policy: ChecksumPolicy = "auto",
|
|
404
|
+
max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
|
|
405
|
+
checksum_overrides: Mapping[str | Path, str] | None = None,
|
|
373
406
|
) -> FileAnnotationResult:
|
|
374
407
|
document = _build_file_annotation_document(
|
|
375
408
|
artifact_path,
|
|
@@ -381,9 +414,13 @@ def annotate_file(
|
|
|
381
414
|
acquisition_context=acquisition_context,
|
|
382
415
|
generation_context=generation_context,
|
|
383
416
|
artifact_kind=artifact_kind,
|
|
417
|
+
artifact_sha256=artifact_sha256,
|
|
384
418
|
params=params,
|
|
385
419
|
inputs=inputs,
|
|
386
420
|
function=function,
|
|
421
|
+
checksum_policy=checksum_policy,
|
|
422
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
423
|
+
checksum_overrides=checksum_overrides,
|
|
387
424
|
)
|
|
388
425
|
artifact_path = Path(document.subject.path)
|
|
389
426
|
annotation_path = _write_annotation_document(
|
|
@@ -425,6 +462,9 @@ def annotate_directory(
|
|
|
425
462
|
write_schema: bool | None = None,
|
|
426
463
|
annotation_filename: str = "data-annotations.json",
|
|
427
464
|
readme_filename: str = "README.md",
|
|
465
|
+
checksum_policy: ChecksumPolicy = "auto",
|
|
466
|
+
max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
|
|
467
|
+
checksum_overrides: Mapping[str | Path, str] | None = None,
|
|
428
468
|
) -> DirectoryAnnotationResult:
|
|
429
469
|
document = _build_directory_annotation_document(
|
|
430
470
|
output_dir,
|
|
@@ -438,6 +478,9 @@ def annotate_directory(
|
|
|
438
478
|
params=params,
|
|
439
479
|
inputs=inputs,
|
|
440
480
|
function=function,
|
|
481
|
+
checksum_policy=checksum_policy,
|
|
482
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
483
|
+
checksum_overrides=checksum_overrides,
|
|
441
484
|
)
|
|
442
485
|
output_dir = Path(document.subject.path)
|
|
443
486
|
annotation_path = _write_annotation_document(
|
{data_annotations-2.5.0 → data_annotations-2.6.0}/src/data_annotations/cli_app/annotate/__init__.py
RENAMED
|
@@ -24,6 +24,8 @@ from .helpers import (
|
|
|
24
24
|
)
|
|
25
25
|
from ..common import (
|
|
26
26
|
CommandOption,
|
|
27
|
+
ChecksumPolicyOption,
|
|
28
|
+
ChecksumValuesOption,
|
|
27
29
|
ForceOption,
|
|
28
30
|
FunctionOption,
|
|
29
31
|
GitBranchOption,
|
|
@@ -34,9 +36,11 @@ from ..common import (
|
|
|
34
36
|
GitShaOption,
|
|
35
37
|
GitTagsOption,
|
|
36
38
|
InputValuesOption,
|
|
39
|
+
MaxChecksumBytesOption,
|
|
37
40
|
ParamValuesOption,
|
|
38
41
|
ScriptOption,
|
|
39
42
|
ScriptRepoPathOption,
|
|
43
|
+
Sha256Option,
|
|
40
44
|
SourceDownloadUriOption,
|
|
41
45
|
SourceKindOption,
|
|
42
46
|
SourcePathOption,
|
|
@@ -45,10 +49,12 @@ from ..common import (
|
|
|
45
49
|
SourceUriOption,
|
|
46
50
|
_annotation_paths_for_directory,
|
|
47
51
|
_annotation_paths_for_file,
|
|
52
|
+
_collect_checksums,
|
|
48
53
|
_discover_directory_entries,
|
|
49
54
|
_ensure_annotation_outputs_available,
|
|
50
55
|
_error,
|
|
51
56
|
_validate_artifact_kind,
|
|
57
|
+
_validate_checksum_policy,
|
|
52
58
|
)
|
|
53
59
|
|
|
54
60
|
DEFAULT_ANNOTATION_INSTRUCTIONS = """
|
|
@@ -119,6 +125,10 @@ def annotate_file_command(
|
|
|
119
125
|
source_path: SourcePathOption = None,
|
|
120
126
|
source_revision: SourceRevisionOption = None,
|
|
121
127
|
source_sha256: SourceSha256Option = None,
|
|
128
|
+
checksum_policy: ChecksumPolicyOption = "auto",
|
|
129
|
+
max_checksum_bytes: MaxChecksumBytesOption = 10 * 1024**3,
|
|
130
|
+
sha256: Sha256Option = None,
|
|
131
|
+
checksum_values: ChecksumValuesOption = None,
|
|
122
132
|
force: ForceOption = False,
|
|
123
133
|
) -> None:
|
|
124
134
|
file_answers = _load_file_answers(answers_path)
|
|
@@ -209,6 +219,13 @@ def annotate_file_command(
|
|
|
209
219
|
if artifact_kind in {"dataset", "table"}:
|
|
210
220
|
if is_interactive and file_answers is None:
|
|
211
221
|
fields, primary_key, missing_value_codes = prompts._prompt_schema_details()
|
|
222
|
+
selected_checksum_policy = _validate_checksum_policy(checksum_policy)
|
|
223
|
+
checksum_overrides: dict[str | Path, str] = _collect_checksums(checksum_values)
|
|
224
|
+
artifact_sha256 = (
|
|
225
|
+
sha256
|
|
226
|
+
if sha256 is not None
|
|
227
|
+
else (file_answers.sha256 if file_answers else None)
|
|
228
|
+
)
|
|
212
229
|
|
|
213
230
|
annotation_path, readme_path = _write_post_hoc_file_bundle(
|
|
214
231
|
artifact_path,
|
|
@@ -221,6 +238,10 @@ def annotate_file_command(
|
|
|
221
238
|
inputs=inputs,
|
|
222
239
|
params=params,
|
|
223
240
|
provenance_overrides=provenance_overrides,
|
|
241
|
+
artifact_sha256=artifact_sha256,
|
|
242
|
+
checksum_policy=selected_checksum_policy,
|
|
243
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
244
|
+
checksum_overrides=checksum_overrides,
|
|
224
245
|
)
|
|
225
246
|
typer.echo("")
|
|
226
247
|
typer.echo(f"Annotation: {annotation_path}")
|
|
@@ -297,6 +318,9 @@ def annotate_directory_command(
|
|
|
297
318
|
"--group-kind",
|
|
298
319
|
help="Artifact kind for the corresponding --group-selector.",
|
|
299
320
|
),
|
|
321
|
+
checksum_policy: ChecksumPolicyOption = "auto",
|
|
322
|
+
max_checksum_bytes: MaxChecksumBytesOption = 10 * 1024**3,
|
|
323
|
+
checksum_values: ChecksumValuesOption = None,
|
|
300
324
|
force: ForceOption = False,
|
|
301
325
|
) -> None:
|
|
302
326
|
directory_answers = _load_directory_answers(answers_path)
|
|
@@ -407,6 +431,11 @@ def annotate_directory_command(
|
|
|
407
431
|
)
|
|
408
432
|
|
|
409
433
|
default_kind = _validate_artifact_kind(kind) if kind is not None else None
|
|
434
|
+
selected_checksum_policy = _validate_checksum_policy(checksum_policy)
|
|
435
|
+
checksum_overrides: dict[str | Path, str] = (
|
|
436
|
+
dict(directory_answers.checksums) if directory_answers is not None else {}
|
|
437
|
+
)
|
|
438
|
+
checksum_overrides.update(_collect_checksums(checksum_values))
|
|
410
439
|
if directory_answers is not None and has_answers_inventory:
|
|
411
440
|
artifacts = _documented_artifacts_from_answers(directory_answers.artifacts)
|
|
412
441
|
if not group_selector_values:
|
|
@@ -486,6 +515,9 @@ def annotate_directory_command(
|
|
|
486
515
|
inputs=inputs,
|
|
487
516
|
params=params,
|
|
488
517
|
provenance_overrides=provenance_overrides,
|
|
518
|
+
checksum_policy=selected_checksum_policy,
|
|
519
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
520
|
+
checksum_overrides=checksum_overrides,
|
|
489
521
|
)
|
|
490
522
|
|
|
491
523
|
typer.echo(f"Annotation: {annotation_path}")
|