data-annotations 2.5.0__tar.gz → 2.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_annotations-2.5.0 → data_annotations-2.7.0}/PKG-INFO +148 -4
- {data_annotations-2.5.0 → data_annotations-2.7.0}/README.md +146 -2
- {data_annotations-2.5.0 → data_annotations-2.7.0}/pyproject.toml +3 -3
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/_decorators.py +39 -4
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/annotations/__init__.py +12 -0
- {data_annotations-2.5.0/src/data_annotations/cli_app → data_annotations-2.7.0/src/data_annotations/annotations}/answers.py +151 -11
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/annotations/decorators.py +104 -8
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/annotations/writers.py +326 -27
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/annotate/__init__.py +32 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/annotate/helpers.py +94 -3
- data_annotations-2.7.0/src/data_annotations/cli_app/answers.py +35 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/common.py +57 -1
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/provenance_commands.py +15 -1
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/__init__.py +4 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/decorators.py +17 -1
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/models.py +1 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/chain.py +60 -11
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/matching.py +76 -15
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/writers.py +200 -18
- {data_annotations-2.5.0 → data_annotations-2.7.0}/LICENSE +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/__init__.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/annotations/models.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/__init__.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/prompts.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/publish.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/description/__init__.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/description/decorators.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/description/models.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/description/writers.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/git.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/__init__.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/manifest.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/sources.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/types.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/provenance/runtime.py +0 -0
- {data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/publish.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-annotations
|
|
3
|
-
Version: 2.5.0
|
|
3
|
+
Version: 2.7.0
|
|
4
4
|
Summary: Annotate data artifacts with provenance and descriptions
|
|
5
5
|
Keywords: annotations,data,metadata,provenance,reproducibility
|
|
6
6
|
Author: Rodrigo C. G. Pena
|
|
@@ -18,7 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
18
18
|
Classifier: Topic :: Scientific/Engineering
|
|
19
19
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
20
|
Requires-Dist: pydantic>=2.13.1
|
|
21
|
-
Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
|
|
21
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
22
22
|
Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
|
|
23
23
|
Requires-Dist: typer>=0.16.0 ; extra == 'cli'
|
|
24
24
|
Requires-Python: >=3.12
|
|
@@ -71,8 +71,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
|
|
|
71
71
|
uv add data-annotations
|
|
72
72
|
```
|
|
73
73
|
|
|
74
|
-
The command-line interface uses optional dependencies
|
|
75
|
-
CLI support when you want to run
|
|
74
|
+
The command-line interface uses optional dependencies for prompting and command
|
|
75
|
+
parsing. Install the package with CLI support when you want to run
|
|
76
|
+
`data-annotations` commands:
|
|
76
77
|
|
|
77
78
|
```bash
|
|
78
79
|
pip install "data-annotations[cli]"
|
|
@@ -109,6 +110,11 @@ Every annotation document includes provenance with:
|
|
|
109
110
|
directory content digests, and upstream annotation sidecar references when
|
|
110
111
|
present
|
|
111
112
|
|
|
113
|
+
Local file hashing defaults to checksum policy `auto`: existing files are hashed
|
|
114
|
+
only up to `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
|
|
115
|
+
their `sha256` or directory `content_digest` is left unset unless you provide a
|
|
116
|
+
precomputed checksum yourself.
|
|
117
|
+
|
|
112
118
|
You can also attach your own parameters, input file paths, and function names.
|
|
113
119
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
114
120
|
such as `s3://...` or `https://...` are preserved as provided.
|
|
@@ -373,6 +379,39 @@ metadata to vary per call instead of staying fixed at decoration time, use
|
|
|
373
379
|
`write_directory_annotation(...)` directly instead. See the example gallery in
|
|
374
380
|
`examples/` for runnable examples of all approaches.
|
|
375
381
|
|
|
382
|
+
The Python API can also load the same YAML answers payloads used by the
|
|
383
|
+
CLI:
|
|
384
|
+
|
|
385
|
+
```python
|
|
386
|
+
from data_annotations.annotations import (
|
|
387
|
+
annotate_directory,
|
|
388
|
+
annotate_file,
|
|
389
|
+
record_file_annotation,
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
annotate_file(answers="participants.yaml")
|
|
393
|
+
annotate_directory(answers="run-001.yaml")
|
|
394
|
+
|
|
395
|
+
annotate_file(
|
|
396
|
+
"outputs/summary.txt",
|
|
397
|
+
answers={"title": "Run Summary", "summary": "Validation run summary."},
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
@record_file_annotation(answers="participants.yaml")
|
|
401
|
+
def write_participants(artifact_path, input_path):
|
|
402
|
+
...
|
|
403
|
+
```
|
|
404
|
+
|
|
405
|
+
If an answers payload includes `target`, the positional artifact path or directory
|
|
406
|
+
may be omitted. When both are provided, they must resolve to the same path.
|
|
407
|
+
Explicit Python keyword arguments override values from `answers`. Environment
|
|
408
|
+
variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
|
|
409
|
+
values in both YAML files and mapping payloads.
|
|
410
|
+
|
|
411
|
+
For directory decorators, the wrapped function still provides the produced output
|
|
412
|
+
inventory. Matching `answers.artifacts` entries can supply titles, summaries,
|
|
413
|
+
kinds, fields, primary keys, and missing-value codes for those returned paths.
|
|
414
|
+
|
|
376
415
|
### When To Use Decorators Vs Direct Functions
|
|
377
416
|
|
|
378
417
|
If a function is only a final serializer for already-prepared data, prefer the
|
|
@@ -502,6 +541,75 @@ README.
|
|
|
502
541
|
If you want the direct writer approach instead, use `write_file_manifest(...)` and
|
|
503
542
|
`write_directory_manifest(...)` (see `examples/`).
|
|
504
543
|
|
|
544
|
+
## Checksum Policy
|
|
545
|
+
|
|
546
|
+
All provenance and annotation entry points that hash local files support the same
|
|
547
|
+
policy controls:
|
|
548
|
+
|
|
549
|
+
- `checksum_policy="auto"`: hash existing local files only when they are at or
|
|
550
|
+
below `max_checksum_bytes`. This is the default, and
|
|
551
|
+
`max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
|
|
552
|
+
- `checksum_policy="always"`: hash existing local files regardless of size.
|
|
553
|
+
- `checksum_policy="never"`: never hash local files automatically. Checksums are
|
|
554
|
+
recorded only when you supply them explicitly.
|
|
555
|
+
|
|
556
|
+
When a checksum is skipped, JSON sidecars keep the same schema and simply store
|
|
557
|
+
`sha256: null`. Directory `content_digest` is also left unset when any tracked
|
|
558
|
+
member file lacks a checksum.
|
|
559
|
+
|
|
560
|
+
You can change the policy from Python:
|
|
561
|
+
|
|
562
|
+
```python
|
|
563
|
+
from data_annotations.annotations import annotate_file
|
|
564
|
+
from data_annotations.provenance import write_file_manifest
|
|
565
|
+
|
|
566
|
+
write_file_manifest(
|
|
567
|
+
"outputs/summary.txt",
|
|
568
|
+
checksum_policy="always",
|
|
569
|
+
)
|
|
570
|
+
|
|
571
|
+
annotate_file(
|
|
572
|
+
"outputs/summary.txt",
|
|
573
|
+
title="Run Summary",
|
|
574
|
+
summary="Post-hoc summary.",
|
|
575
|
+
artifact_sha256="precomputed-sha256",
|
|
576
|
+
checksum_policy="never",
|
|
577
|
+
)
|
|
578
|
+
```
|
|
579
|
+
|
|
580
|
+
You can also inject precomputed checksums directly:
|
|
581
|
+
|
|
582
|
+
- File APIs: pass `artifact_sha256=...`.
|
|
583
|
+
- File or directory APIs: pass `checksum_overrides={path: sha256}`. For
|
|
584
|
+
directory outputs, keys can be relative to the output directory or absolute
|
|
585
|
+
paths.
|
|
586
|
+
- Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
|
|
587
|
+
`record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
|
|
588
|
+
same checksum-policy arguments.
|
|
589
|
+
|
|
590
|
+
From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
|
|
591
|
+
repeatable `--checksum PATH=SHA256`:
|
|
592
|
+
|
|
593
|
+
```bash
|
|
594
|
+
data-annotations annotate file path/to/summary.txt \
|
|
595
|
+
--title "Run Summary" \
|
|
596
|
+
--summary "Post-hoc summary." \
|
|
597
|
+
--kind report \
|
|
598
|
+
--checksum-policy never \
|
|
599
|
+
--sha256 0123456789abcdef...
|
|
600
|
+
|
|
601
|
+
data-annotations annotate directory path/to/run-001 \
|
|
602
|
+
--title "Processing outputs" \
|
|
603
|
+
--summary "Directory-level outputs." \
|
|
604
|
+
--checksum-policy never \
|
|
605
|
+
--checksum processed.csv=0123456789abcdef...
|
|
606
|
+
|
|
607
|
+
data-annotations provenance chain path/to/run-001 \
|
|
608
|
+
--checksum-policy always
|
|
609
|
+
```
|
|
610
|
+
|
|
611
|
+
For a complete runnable workflow, see `examples/checksum_policy.py`.
|
|
612
|
+
|
|
505
613
|
## Description Layer
|
|
506
614
|
|
|
507
615
|
The `data_annotations.description` sub-package provides the structured description
|
|
@@ -628,6 +736,7 @@ target: path/to/participants.csv
|
|
|
628
736
|
title: Participant Cohort
|
|
629
737
|
summary: Participant-level cohort assignments.
|
|
630
738
|
kind: dataset
|
|
739
|
+
sha256: 0123456789abcdef...
|
|
631
740
|
|
|
632
741
|
inputs:
|
|
633
742
|
- ${DATA_ROOT}/raw/participants.csv
|
|
@@ -670,6 +779,9 @@ provenance:
|
|
|
670
779
|
command: bash process_from_instrument.sh
|
|
671
780
|
script: process_from_instrument.sh
|
|
672
781
|
|
|
782
|
+
checksums:
|
|
783
|
+
processed.csv: 0123456789abcdef...
|
|
784
|
+
|
|
673
785
|
artifacts:
|
|
674
786
|
- path: processed.csv
|
|
675
787
|
kind: dataset
|
|
@@ -694,6 +806,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
|
|
|
694
806
|
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
695
807
|
and `provenance.params`.
|
|
696
808
|
|
|
809
|
+
Answers can request selected provenance fields from the current runtime instead
|
|
810
|
+
of taking them from the payload:
|
|
811
|
+
|
|
812
|
+
```yaml
|
|
813
|
+
target: path/to/run-001
|
|
814
|
+
title: Processing outputs
|
|
815
|
+
summary: Files produced by the shell processing workflow.
|
|
816
|
+
|
|
817
|
+
provenance:
|
|
818
|
+
command: bash generate_some_data_artifact.sh
|
|
819
|
+
script: generate_some_data_artifact.sh
|
|
820
|
+
infer_from_runtime:
|
|
821
|
+
- runtime
|
|
822
|
+
- git
|
|
823
|
+
- source_code
|
|
824
|
+
```
|
|
825
|
+
|
|
826
|
+
`runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
|
|
827
|
+
covers Git commit, branch, dirty state, remote, tags, and `git describe`.
|
|
828
|
+
`source_code` lets the source-code reference be derived from runtime Git metadata.
|
|
829
|
+
This is especially useful for timestamps, host/user and SLURM context, Git state,
|
|
830
|
+
and derived `source_code`. Provide generation `command` and `script` explicitly
|
|
831
|
+
in CLI answers files, because the runtime command and script would describe the
|
|
832
|
+
`data-annotations annotate ...` invocation rather than the script that generated
|
|
833
|
+
the artifact.
|
|
834
|
+
|
|
697
835
|
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
698
836
|
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
699
837
|
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
@@ -765,6 +903,11 @@ resolving an older installed command. From a source checkout, use
|
|
|
765
903
|
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
766
904
|
updated source before using the bare `data-annotations` command.
|
|
767
905
|
|
|
906
|
+
Both `match` and `chain` also accept `--checksum-policy` and
|
|
907
|
+
`--max-checksum-bytes`. Use `--checksum-policy always` when you want full
|
|
908
|
+
verification of large local files, and leave the default `auto` when you prefer
|
|
909
|
+
to avoid long checksum passes on very large artifacts.
|
|
910
|
+
|
|
768
911
|
### Run With `uvx`
|
|
769
912
|
|
|
770
913
|
```bash
|
|
@@ -886,6 +1029,7 @@ uv run python examples/record_file_description.py
|
|
|
886
1029
|
uv run python examples/record_directory_description.py
|
|
887
1030
|
uv run python examples/annotate_file.py
|
|
888
1031
|
uv run python examples/annotate_directory.py
|
|
1032
|
+
uv run python examples/checksum_policy.py
|
|
889
1033
|
uv run python examples/annotate_file_answers_cli.py
|
|
890
1034
|
uv run python examples/write_file_manifest.py
|
|
891
1035
|
uv run python examples/write_directory_manifest.py
|
|
@@ -41,8 +41,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
|
|
|
41
41
|
uv add data-annotations
|
|
42
42
|
```
|
|
43
43
|
|
|
44
|
-
The command-line interface uses optional dependencies
|
|
45
|
-
CLI support when you want to run
|
|
44
|
+
The command-line interface uses optional dependencies for prompting and command
|
|
45
|
+
parsing. Install the package with CLI support when you want to run
|
|
46
|
+
`data-annotations` commands:
|
|
46
47
|
|
|
47
48
|
```bash
|
|
48
49
|
pip install "data-annotations[cli]"
|
|
@@ -79,6 +80,11 @@ Every annotation document includes provenance with:
|
|
|
79
80
|
directory content digests, and upstream annotation sidecar references when
|
|
80
81
|
present
|
|
81
82
|
|
|
83
|
+
Local file hashing defaults to checksum policy `auto`: existing files are hashed
|
|
84
|
+
only up to `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
|
|
85
|
+
their `sha256` or directory `content_digest` is left unset unless you provide a
|
|
86
|
+
precomputed checksum yourself.
|
|
87
|
+
|
|
82
88
|
You can also attach your own parameters, input file paths, and function names.
|
|
83
89
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
84
90
|
such as `s3://...` or `https://...` are preserved as provided.
|
|
@@ -343,6 +349,39 @@ metadata to vary per call instead of staying fixed at decoration time, use
|
|
|
343
349
|
`write_directory_annotation(...)` directly instead. See the example gallery in
|
|
344
350
|
`examples/` for runnable examples of all approaches.
|
|
345
351
|
|
|
352
|
+
The Python API can also load the same YAML answers payloads used by the
|
|
353
|
+
CLI:
|
|
354
|
+
|
|
355
|
+
```python
|
|
356
|
+
from data_annotations.annotations import (
|
|
357
|
+
annotate_directory,
|
|
358
|
+
annotate_file,
|
|
359
|
+
record_file_annotation,
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
annotate_file(answers="participants.yaml")
|
|
363
|
+
annotate_directory(answers="run-001.yaml")
|
|
364
|
+
|
|
365
|
+
annotate_file(
|
|
366
|
+
"outputs/summary.txt",
|
|
367
|
+
answers={"title": "Run Summary", "summary": "Validation run summary."},
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
@record_file_annotation(answers="participants.yaml")
|
|
371
|
+
def write_participants(artifact_path, input_path):
|
|
372
|
+
...
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
If an answers payload includes `target`, the positional artifact path or directory
|
|
376
|
+
may be omitted. When both are provided, they must resolve to the same path.
|
|
377
|
+
Explicit Python keyword arguments override values from `answers`. Environment
|
|
378
|
+
variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
|
|
379
|
+
values in both YAML files and mapping payloads.
|
|
380
|
+
|
|
381
|
+
For directory decorators, the wrapped function still provides the produced output
|
|
382
|
+
inventory. Matching `answers.artifacts` entries can supply titles, summaries,
|
|
383
|
+
kinds, fields, primary keys, and missing-value codes for those returned paths.
|
|
384
|
+
|
|
346
385
|
### When To Use Decorators Vs Direct Functions
|
|
347
386
|
|
|
348
387
|
If a function is only a final serializer for already-prepared data, prefer the
|
|
@@ -472,6 +511,75 @@ README.
|
|
|
472
511
|
If you want the direct writer approach instead, use `write_file_manifest(...)` and
|
|
473
512
|
`write_directory_manifest(...)` (see `examples/`).
|
|
474
513
|
|
|
514
|
+
## Checksum Policy
|
|
515
|
+
|
|
516
|
+
All provenance and annotation entry points that hash local files support the same
|
|
517
|
+
policy controls:
|
|
518
|
+
|
|
519
|
+
- `checksum_policy="auto"`: hash existing local files only when they are at or
|
|
520
|
+
below `max_checksum_bytes`. This is the default, and
|
|
521
|
+
`max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
|
|
522
|
+
- `checksum_policy="always"`: hash existing local files regardless of size.
|
|
523
|
+
- `checksum_policy="never"`: never hash local files automatically. Checksums are
|
|
524
|
+
recorded only when you supply them explicitly.
|
|
525
|
+
|
|
526
|
+
When a checksum is skipped, JSON sidecars keep the same schema and simply store
|
|
527
|
+
`sha256: null`. Directory `content_digest` is also left unset when any tracked
|
|
528
|
+
member file lacks a checksum.
|
|
529
|
+
|
|
530
|
+
You can change the policy from Python:
|
|
531
|
+
|
|
532
|
+
```python
|
|
533
|
+
from data_annotations.annotations import annotate_file
|
|
534
|
+
from data_annotations.provenance import write_file_manifest
|
|
535
|
+
|
|
536
|
+
write_file_manifest(
|
|
537
|
+
"outputs/summary.txt",
|
|
538
|
+
checksum_policy="always",
|
|
539
|
+
)
|
|
540
|
+
|
|
541
|
+
annotate_file(
|
|
542
|
+
"outputs/summary.txt",
|
|
543
|
+
title="Run Summary",
|
|
544
|
+
summary="Post-hoc summary.",
|
|
545
|
+
artifact_sha256="precomputed-sha256",
|
|
546
|
+
checksum_policy="never",
|
|
547
|
+
)
|
|
548
|
+
```
|
|
549
|
+
|
|
550
|
+
You can also inject precomputed checksums directly:
|
|
551
|
+
|
|
552
|
+
- File APIs: pass `artifact_sha256=...`.
|
|
553
|
+
- File or directory APIs: pass `checksum_overrides={path: sha256}`. For
|
|
554
|
+
directory outputs, keys can be relative to the output directory or absolute
|
|
555
|
+
paths.
|
|
556
|
+
- Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
|
|
557
|
+
`record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
|
|
558
|
+
same checksum-policy arguments.
|
|
559
|
+
|
|
560
|
+
From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
|
|
561
|
+
repeatable `--checksum PATH=SHA256`:
|
|
562
|
+
|
|
563
|
+
```bash
|
|
564
|
+
data-annotations annotate file path/to/summary.txt \
|
|
565
|
+
--title "Run Summary" \
|
|
566
|
+
--summary "Post-hoc summary." \
|
|
567
|
+
--kind report \
|
|
568
|
+
--checksum-policy never \
|
|
569
|
+
--sha256 0123456789abcdef...
|
|
570
|
+
|
|
571
|
+
data-annotations annotate directory path/to/run-001 \
|
|
572
|
+
--title "Processing outputs" \
|
|
573
|
+
--summary "Directory-level outputs." \
|
|
574
|
+
--checksum-policy never \
|
|
575
|
+
--checksum processed.csv=0123456789abcdef...
|
|
576
|
+
|
|
577
|
+
data-annotations provenance chain path/to/run-001 \
|
|
578
|
+
--checksum-policy always
|
|
579
|
+
```
|
|
580
|
+
|
|
581
|
+
For a complete runnable workflow, see `examples/checksum_policy.py`.
|
|
582
|
+
|
|
475
583
|
## Description Layer
|
|
476
584
|
|
|
477
585
|
The `data_annotations.description` sub-package provides the structured description
|
|
@@ -598,6 +706,7 @@ target: path/to/participants.csv
|
|
|
598
706
|
title: Participant Cohort
|
|
599
707
|
summary: Participant-level cohort assignments.
|
|
600
708
|
kind: dataset
|
|
709
|
+
sha256: 0123456789abcdef...
|
|
601
710
|
|
|
602
711
|
inputs:
|
|
603
712
|
- ${DATA_ROOT}/raw/participants.csv
|
|
@@ -640,6 +749,9 @@ provenance:
|
|
|
640
749
|
command: bash process_from_instrument.sh
|
|
641
750
|
script: process_from_instrument.sh
|
|
642
751
|
|
|
752
|
+
checksums:
|
|
753
|
+
processed.csv: 0123456789abcdef...
|
|
754
|
+
|
|
643
755
|
artifacts:
|
|
644
756
|
- path: processed.csv
|
|
645
757
|
kind: dataset
|
|
@@ -664,6 +776,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
|
|
|
664
776
|
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
665
777
|
and `provenance.params`.
|
|
666
778
|
|
|
779
|
+
Answers can request selected provenance fields from the current runtime instead
|
|
780
|
+
of taking them from the payload:
|
|
781
|
+
|
|
782
|
+
```yaml
|
|
783
|
+
target: path/to/run-001
|
|
784
|
+
title: Processing outputs
|
|
785
|
+
summary: Files produced by the shell processing workflow.
|
|
786
|
+
|
|
787
|
+
provenance:
|
|
788
|
+
command: bash generate_some_data_artifact.sh
|
|
789
|
+
script: generate_some_data_artifact.sh
|
|
790
|
+
infer_from_runtime:
|
|
791
|
+
- runtime
|
|
792
|
+
- git
|
|
793
|
+
- source_code
|
|
794
|
+
```
|
|
795
|
+
|
|
796
|
+
`runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
|
|
797
|
+
covers Git commit, branch, dirty state, remote, tags, and `git describe`.
|
|
798
|
+
`source_code` lets the source-code reference be derived from runtime Git metadata.
|
|
799
|
+
This is especially useful for timestamps, host/user and SLURM context, Git state,
|
|
800
|
+
and derived `source_code`. Provide generation `command` and `script` explicitly
|
|
801
|
+
in CLI answers files, because the runtime command and script would describe the
|
|
802
|
+
`data-annotations annotate ...` invocation rather than the script that generated
|
|
803
|
+
the artifact.
|
|
804
|
+
|
|
667
805
|
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
668
806
|
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
669
807
|
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
@@ -735,6 +873,11 @@ resolving an older installed command. From a source checkout, use
|
|
|
735
873
|
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
736
874
|
updated source before using the bare `data-annotations` command.
|
|
737
875
|
|
|
876
|
+
Both `match` and `chain` also accept `--checksum-policy` and
|
|
877
|
+
`--max-checksum-bytes`. Use `--checksum-policy always` when you want full
|
|
878
|
+
verification of large local files, and leave the default `auto` when you prefer
|
|
879
|
+
to avoid long checksum passes on very large artifacts.
|
|
880
|
+
|
|
738
881
|
### Run With `uvx`
|
|
739
882
|
|
|
740
883
|
```bash
|
|
@@ -856,6 +999,7 @@ uv run python examples/record_file_description.py
|
|
|
856
999
|
uv run python examples/record_directory_description.py
|
|
857
1000
|
uv run python examples/annotate_file.py
|
|
858
1001
|
uv run python examples/annotate_directory.py
|
|
1002
|
+
uv run python examples/checksum_policy.py
|
|
859
1003
|
uv run python examples/annotate_file_answers_cli.py
|
|
860
1004
|
uv run python examples/write_file_manifest.py
|
|
861
1005
|
uv run python examples/write_directory_manifest.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data-annotations"
|
|
3
|
-
version = "2.5.0"
|
|
3
|
+
version = "2.7.0"
|
|
4
4
|
description = "Annotate data artifacts with provenance and descriptions"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
@@ -9,7 +9,7 @@ authors = [
|
|
|
9
9
|
license = "BSD-3-Clause"
|
|
10
10
|
license-files = ["LICENSE"]
|
|
11
11
|
requires-python = ">=3.12"
|
|
12
|
-
dependencies = ["pydantic>=2.13.1"]
|
|
12
|
+
dependencies = ["pydantic>=2.13.1", "PyYAML>=6.0.2"]
|
|
13
13
|
keywords = ["annotations", "data", "metadata", "provenance", "reproducibility"]
|
|
14
14
|
classifiers = [
|
|
15
15
|
"Development Status :: 4 - Beta",
|
|
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
|
|
|
30
30
|
Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
|
|
31
31
|
|
|
32
32
|
[project.optional-dependencies]
|
|
33
|
-
cli = ["
|
|
33
|
+
cli = ["questionary>=2.1.1", "typer>=0.16.0"]
|
|
34
34
|
|
|
35
35
|
[project.scripts]
|
|
36
36
|
data-annotations = "data_annotations.cli:main"
|
|
@@ -9,6 +9,7 @@ if TYPE_CHECKING:
|
|
|
9
9
|
DocumentedArtifactGroup,
|
|
10
10
|
)
|
|
11
11
|
from data_annotations.provenance.models import ChildBundle, ProducedFile
|
|
12
|
+
from data_annotations.provenance.models import ChecksumPolicy
|
|
12
13
|
|
|
13
14
|
DEFAULT_INPUT_ARGS = ("input_path", "input_paths")
|
|
14
15
|
|
|
@@ -78,6 +79,8 @@ def coerce_produced_file(
|
|
|
78
79
|
item: Any,
|
|
79
80
|
*,
|
|
80
81
|
normalize_paths: bool = True,
|
|
82
|
+
checksum_policy: "ChecksumPolicy" = "auto",
|
|
83
|
+
max_checksum_bytes: int | None = None,
|
|
81
84
|
) -> "ProducedFile":
|
|
82
85
|
from data_annotations.description.models import DocumentedArtifact
|
|
83
86
|
from data_annotations.provenance import writers as provenance_writers
|
|
@@ -89,7 +92,15 @@ def coerce_produced_file(
|
|
|
89
92
|
path=str(path),
|
|
90
93
|
kind=item.kind,
|
|
91
94
|
sha256=(
|
|
92
|
-
provenance_writers.
|
|
95
|
+
provenance_writers._resolve_file_sha256(
|
|
96
|
+
path,
|
|
97
|
+
checksum_policy=checksum_policy,
|
|
98
|
+
max_checksum_bytes=(
|
|
99
|
+
max_checksum_bytes
|
|
100
|
+
if max_checksum_bytes is not None
|
|
101
|
+
else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
|
|
102
|
+
),
|
|
103
|
+
)
|
|
93
104
|
if normalize_paths and path.exists()
|
|
94
105
|
else None
|
|
95
106
|
),
|
|
@@ -106,7 +117,15 @@ def coerce_produced_file(
|
|
|
106
117
|
path=str(normalized),
|
|
107
118
|
kind=kind,
|
|
108
119
|
sha256=(
|
|
109
|
-
provenance_writers.
|
|
120
|
+
provenance_writers._resolve_file_sha256(
|
|
121
|
+
normalized,
|
|
122
|
+
checksum_policy=checksum_policy,
|
|
123
|
+
max_checksum_bytes=(
|
|
124
|
+
max_checksum_bytes
|
|
125
|
+
if max_checksum_bytes is not None
|
|
126
|
+
else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
|
|
127
|
+
),
|
|
128
|
+
)
|
|
110
129
|
if normalize_paths and normalized.exists()
|
|
111
130
|
else None
|
|
112
131
|
),
|
|
@@ -117,7 +136,15 @@ def coerce_produced_file(
|
|
|
117
136
|
path=str(path),
|
|
118
137
|
kind="other",
|
|
119
138
|
sha256=(
|
|
120
|
-
provenance_writers.
|
|
139
|
+
provenance_writers._resolve_file_sha256(
|
|
140
|
+
path,
|
|
141
|
+
checksum_policy=checksum_policy,
|
|
142
|
+
max_checksum_bytes=(
|
|
143
|
+
max_checksum_bytes
|
|
144
|
+
if max_checksum_bytes is not None
|
|
145
|
+
else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
|
|
146
|
+
),
|
|
147
|
+
)
|
|
121
148
|
if normalize_paths and path.exists()
|
|
122
149
|
else None
|
|
123
150
|
),
|
|
@@ -128,9 +155,17 @@ def coerce_produced_files(
|
|
|
128
155
|
items: Iterable[Any],
|
|
129
156
|
*,
|
|
130
157
|
normalize_paths: bool = True,
|
|
158
|
+
checksum_policy: "ChecksumPolicy" = "auto",
|
|
159
|
+
max_checksum_bytes: int | None = None,
|
|
131
160
|
) -> list["ProducedFile"]:
|
|
132
161
|
return [
|
|
133
|
-
coerce_produced_file(
|
|
162
|
+
coerce_produced_file(
|
|
163
|
+
item,
|
|
164
|
+
normalize_paths=normalize_paths,
|
|
165
|
+
checksum_policy=checksum_policy,
|
|
166
|
+
max_checksum_bytes=max_checksum_bytes,
|
|
167
|
+
)
|
|
168
|
+
for item in items
|
|
134
169
|
]
|
|
135
170
|
|
|
136
171
|
|
{data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/annotations/__init__.py
RENAMED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
from .answers import (
|
|
2
|
+
AnswersError,
|
|
3
|
+
DirectoryAnswers,
|
|
4
|
+
FileAnswers,
|
|
5
|
+
load_directory_answers,
|
|
6
|
+
load_file_answers,
|
|
7
|
+
)
|
|
1
8
|
from .models import (
|
|
2
9
|
DirectoryAnnotationDocument,
|
|
3
10
|
DirectoryAnnotationResult,
|
|
@@ -17,13 +24,18 @@ from .writers import (
|
|
|
17
24
|
__all__ = [
|
|
18
25
|
"annotate_directory",
|
|
19
26
|
"annotate_file",
|
|
27
|
+
"load_directory_answers",
|
|
28
|
+
"load_file_answers",
|
|
20
29
|
"record_directory_annotation",
|
|
21
30
|
"record_file_annotation",
|
|
22
31
|
"write_directory_annotation",
|
|
23
32
|
"write_file_annotation",
|
|
33
|
+
"AnswersError",
|
|
34
|
+
"DirectoryAnswers",
|
|
24
35
|
"DirectoryAnnotationDocument",
|
|
25
36
|
"DirectoryAnnotationResult",
|
|
26
37
|
"DirectoryArtifactSubject",
|
|
38
|
+
"FileAnswers",
|
|
27
39
|
"FileAnnotationDocument",
|
|
28
40
|
"FileAnnotationResult",
|
|
29
41
|
"FileArtifactSubject",
|