data-annotations 2.6.0__tar.gz → 2.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_annotations-2.6.0 → data_annotations-2.8.0}/PKG-INFO +84 -12
- {data_annotations-2.6.0 → data_annotations-2.8.0}/README.md +82 -10
- {data_annotations-2.6.0 → data_annotations-2.8.0}/pyproject.toml +3 -3
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/annotations/__init__.py +12 -0
- {data_annotations-2.6.0/src/data_annotations/cli_app → data_annotations-2.8.0/src/data_annotations/annotations}/answers.py +137 -10
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/annotations/decorators.py +88 -9
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/annotations/writers.py +287 -31
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/annotate/__init__.py +20 -4
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/annotate/helpers.py +102 -21
- data_annotations-2.8.0/src/data_annotations/cli_app/answers.py +35 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/LICENSE +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/_decorators.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/annotations/models.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/common.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/prompts.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/provenance_commands.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/publish.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/description/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/description/decorators.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/description/models.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/description/writers.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/decorators.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/git.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/models.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/chain.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/manifest.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/matching.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/sources.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/types.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/runtime.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/writers.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/publish.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-annotations
|
|
3
|
-
Version: 2.6.0
|
|
3
|
+
Version: 2.8.0
|
|
4
4
|
Summary: Annotate data artifacts with provenance and descriptions
|
|
5
5
|
Keywords: annotations,data,metadata,provenance,reproducibility
|
|
6
6
|
Author: Rodrigo C. G. Pena
|
|
@@ -18,7 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
18
18
|
Classifier: Topic :: Scientific/Engineering
|
|
19
19
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
20
|
Requires-Dist: pydantic>=2.13.1
|
|
21
|
-
Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
|
|
21
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
22
22
|
Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
|
|
23
23
|
Requires-Dist: typer>=0.16.0 ; extra == 'cli'
|
|
24
24
|
Requires-Python: >=3.12
|
|
@@ -71,8 +71,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
|
|
|
71
71
|
uv add data-annotations
|
|
72
72
|
```
|
|
73
73
|
|
|
74
|
-
The command-line interface uses optional dependencies
|
|
75
|
-
CLI support when you want to run
|
|
74
|
+
The command-line interface uses optional dependencies for prompting and command
|
|
75
|
+
parsing. Install the package with CLI support when you want to run
|
|
76
|
+
`data-annotations` commands:
|
|
76
77
|
|
|
77
78
|
```bash
|
|
78
79
|
pip install "data-annotations[cli]"
|
|
@@ -125,13 +126,13 @@ Git tags and `git_describe` are human-friendly hints only. For Git sources,
|
|
|
125
126
|
The recommended way to annotate your data artifacts is to decorate pipeline
|
|
126
127
|
functions that consume some inputs and parameters, then write those artifacts.
|
|
127
128
|
This keeps the artifact-writing logic explicit while letting `data-annotations` capture
|
|
128
|
-
provenance and emit
|
|
129
|
+
provenance and emit the annotation JSON sidecar automatically.
|
|
129
130
|
|
|
130
131
|
For example, here is a complete file-level annotation workflow using the
|
|
131
132
|
`record_file_annotation(...)` decorator. Once `write_participants` is called, it
|
|
132
|
-
automatically generates
|
|
133
|
-
|
|
134
|
-
|
|
133
|
+
automatically generates `participants.csv.annotation.json`. Set
|
|
134
|
+
`write_readme=True` when you also want `participants.csv.README.md`, a
|
|
135
|
+
human-friendly Markdown rendering of the description provided in the decorator.
|
|
135
136
|
|
|
136
137
|
```python
|
|
137
138
|
from pathlib import Path
|
|
@@ -164,6 +165,7 @@ from data_annotations.description import AllowedValue, FieldDefinition
|
|
|
164
165
|
artifact_kind="dataset",
|
|
165
166
|
acquisition_context={"source": "Study A registry export"},
|
|
166
167
|
generation_context={"pipeline": "baseline-v1"},
|
|
168
|
+
write_readme=True,
|
|
167
169
|
)
|
|
168
170
|
def write_participants(
|
|
169
171
|
artifact_path: Path,
|
|
@@ -284,6 +286,7 @@ from data_annotations.provenance import ProducedFile
|
|
|
284
286
|
summary="Directory-level documentation for the validation run outputs.",
|
|
285
287
|
acquisition_context={"source": "Study A registry export"},
|
|
286
288
|
generation_context={"pipeline": "baseline-v1"},
|
|
289
|
+
write_readme=True,
|
|
287
290
|
)
|
|
288
291
|
def build_outputs(
|
|
289
292
|
output_dir: Path,
|
|
@@ -378,6 +381,43 @@ metadata to vary per call instead of staying fixed at decoration time, use
|
|
|
378
381
|
`write_directory_annotation(...)` directly instead. See the example gallery in
|
|
379
382
|
`examples/` for runnable examples of all approaches.
|
|
380
383
|
|
|
384
|
+
The Python API can also load the same YAML answers payloads used by the
|
|
385
|
+
CLI:
|
|
386
|
+
|
|
387
|
+
```python
|
|
388
|
+
from data_annotations.annotations import (
|
|
389
|
+
annotate_directory,
|
|
390
|
+
annotate_file,
|
|
391
|
+
record_file_annotation,
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
annotate_file(answers="participants.yaml")
|
|
395
|
+
annotate_directory(answers="run-001.yaml")
|
|
396
|
+
|
|
397
|
+
annotate_file(
|
|
398
|
+
"outputs/summary.txt",
|
|
399
|
+
answers={"title": "Run Summary", "summary": "Validation run summary."},
|
|
400
|
+
)
|
|
401
|
+
|
|
402
|
+
# Add write_readme=True when you also want Markdown README sidecars.
|
|
403
|
+
annotate_file(answers="participants.yaml", write_readme=True)
|
|
404
|
+
annotate_directory(answers="run-001.yaml", write_readme=True)
|
|
405
|
+
|
|
406
|
+
@record_file_annotation(answers="participants.yaml", write_readme=True)
|
|
407
|
+
def write_participants(artifact_path, input_path):
|
|
408
|
+
...
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
If an answers payload includes `target`, the positional artifact path or directory
|
|
412
|
+
may be omitted. When both are provided, they must resolve to the same path.
|
|
413
|
+
Explicit Python keyword arguments override values from `answers`. Environment
|
|
414
|
+
variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
|
|
415
|
+
values in both YAML files and mapping payloads.
|
|
416
|
+
|
|
417
|
+
For directory decorators, the wrapped function still provides the produced output
|
|
418
|
+
inventory. Matching `answers.artifacts` entries can supply titles, summaries,
|
|
419
|
+
kinds, fields, primary keys, and missing-value codes for those returned paths.
|
|
420
|
+
|
|
381
421
|
### When To Use Decorators Vs Direct Functions
|
|
382
422
|
|
|
383
423
|
If a function is only a final serializer for already-prepared data, prefer the
|
|
@@ -673,9 +713,14 @@ data-annotations annotate directory path/to/run-001 \
|
|
|
673
713
|
--group-kind plot
|
|
674
714
|
```
|
|
675
715
|
|
|
676
|
-
These commands prompt for missing details
|
|
677
|
-
|
|
678
|
-
`capture_mode="post_hoc"`.
|
|
716
|
+
These commands prompt for missing details and write `*.annotation.json` or
|
|
717
|
+
`data-annotations.json`. Post-hoc records are marked with
|
|
718
|
+
`capture_mode="post_hoc"`. README sidecars are opt-in:
|
|
719
|
+
|
|
720
|
+
```bash
|
|
721
|
+
data-annotations annotate file path/to/participants.csv --write-readme
|
|
722
|
+
data-annotations annotate directory path/to/run-001 --write-readme
|
|
723
|
+
```
|
|
679
724
|
|
|
680
725
|
For shell workflows, you can move the prompt answers into a YAML file and run
|
|
681
726
|
the command non-interactively:
|
|
@@ -772,6 +817,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
|
|
|
772
817
|
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
773
818
|
and `provenance.params`.
|
|
774
819
|
|
|
820
|
+
Answers can request selected provenance fields from the current runtime instead
|
|
821
|
+
of taking them from the payload:
|
|
822
|
+
|
|
823
|
+
```yaml
|
|
824
|
+
target: path/to/run-001
|
|
825
|
+
title: Processing outputs
|
|
826
|
+
summary: Files produced by the shell processing workflow.
|
|
827
|
+
|
|
828
|
+
provenance:
|
|
829
|
+
command: bash generate_some_data_artifact.sh
|
|
830
|
+
script: generate_some_data_artifact.sh
|
|
831
|
+
infer_from_runtime:
|
|
832
|
+
- runtime
|
|
833
|
+
- git
|
|
834
|
+
- source_code
|
|
835
|
+
```
|
|
836
|
+
|
|
837
|
+
`runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
|
|
838
|
+
covers Git commit, branch, dirty state, remote, tags, and `git describe`.
|
|
839
|
+
`source_code` leaves the source-code reference derived from runtime Git metadata.
|
|
840
|
+
This is especially useful for timestamps, host/user and SLURM context, Git state,
|
|
841
|
+
and derived `source_code`. Provide generation `command` and `script` explicitly
|
|
842
|
+
in CLI answers files, because the runtime command and script would describe the
|
|
843
|
+
`data-annotations annotate ...` invocation rather than the script that generated
|
|
844
|
+
the artifact.
|
|
845
|
+
|
|
775
846
|
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
776
847
|
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
777
848
|
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
@@ -984,4 +1055,5 @@ uv run python examples/publish_cli.py
|
|
|
984
1055
|
```
|
|
985
1056
|
|
|
986
1057
|
Each example writes its outputs to a fresh temporary directory and prints the
|
|
987
|
-
location so you can inspect the generated annotation documents and
|
|
1058
|
+
location so you can inspect the generated annotation documents and any requested
|
|
1059
|
+
README sidecars.
|
|
@@ -41,8 +41,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
|
|
|
41
41
|
uv add data-annotations
|
|
42
42
|
```
|
|
43
43
|
|
|
44
|
-
The command-line interface uses optional dependencies
|
|
45
|
-
CLI support when you want to run
|
|
44
|
+
The command-line interface uses optional dependencies for prompting and command
|
|
45
|
+
parsing. Install the package with CLI support when you want to run
|
|
46
|
+
`data-annotations` commands:
|
|
46
47
|
|
|
47
48
|
```bash
|
|
48
49
|
pip install "data-annotations[cli]"
|
|
@@ -95,13 +96,13 @@ Git tags and `git_describe` are human-friendly hints only. For Git sources,
|
|
|
95
96
|
The recommended way to annotate your data artifacts is to decorate pipeline
|
|
96
97
|
functions that consume some inputs and parameters, then write those artifacts.
|
|
97
98
|
This keeps the artifact-writing logic explicit while letting `data-annotations` capture
|
|
98
|
-
provenance and emit
|
|
99
|
+
provenance and emit the annotation JSON sidecar automatically.
|
|
99
100
|
|
|
100
101
|
For example, here is a complete file-level annotation workflow using the
|
|
101
102
|
`record_file_annotation(...)` decorator. Once `write_participants` is called, it
|
|
102
|
-
automatically generates
|
|
103
|
-
|
|
104
|
-
|
|
103
|
+
automatically generates `participants.csv.annotation.json`. Set
|
|
104
|
+
`write_readme=True` when you also want `participants.csv.README.md`, a
|
|
105
|
+
human-friendly Markdown rendering of the description provided in the decorator.
|
|
105
106
|
|
|
106
107
|
```python
|
|
107
108
|
from pathlib import Path
|
|
@@ -134,6 +135,7 @@ from data_annotations.description import AllowedValue, FieldDefinition
|
|
|
134
135
|
artifact_kind="dataset",
|
|
135
136
|
acquisition_context={"source": "Study A registry export"},
|
|
136
137
|
generation_context={"pipeline": "baseline-v1"},
|
|
138
|
+
write_readme=True,
|
|
137
139
|
)
|
|
138
140
|
def write_participants(
|
|
139
141
|
artifact_path: Path,
|
|
@@ -254,6 +256,7 @@ from data_annotations.provenance import ProducedFile
|
|
|
254
256
|
summary="Directory-level documentation for the validation run outputs.",
|
|
255
257
|
acquisition_context={"source": "Study A registry export"},
|
|
256
258
|
generation_context={"pipeline": "baseline-v1"},
|
|
259
|
+
write_readme=True,
|
|
257
260
|
)
|
|
258
261
|
def build_outputs(
|
|
259
262
|
output_dir: Path,
|
|
@@ -348,6 +351,43 @@ metadata to vary per call instead of staying fixed at decoration time, use
|
|
|
348
351
|
`write_directory_annotation(...)` directly instead. See the example gallery in
|
|
349
352
|
`examples/` for runnable examples of all approaches.
|
|
350
353
|
|
|
354
|
+
The Python API can also load the same YAML answers payloads used by the
|
|
355
|
+
CLI:
|
|
356
|
+
|
|
357
|
+
```python
|
|
358
|
+
from data_annotations.annotations import (
|
|
359
|
+
annotate_directory,
|
|
360
|
+
annotate_file,
|
|
361
|
+
record_file_annotation,
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
annotate_file(answers="participants.yaml")
|
|
365
|
+
annotate_directory(answers="run-001.yaml")
|
|
366
|
+
|
|
367
|
+
annotate_file(
|
|
368
|
+
"outputs/summary.txt",
|
|
369
|
+
answers={"title": "Run Summary", "summary": "Validation run summary."},
|
|
370
|
+
)
|
|
371
|
+
|
|
372
|
+
# Add write_readme=True when you also want Markdown README sidecars.
|
|
373
|
+
annotate_file(answers="participants.yaml", write_readme=True)
|
|
374
|
+
annotate_directory(answers="run-001.yaml", write_readme=True)
|
|
375
|
+
|
|
376
|
+
@record_file_annotation(answers="participants.yaml", write_readme=True)
|
|
377
|
+
def write_participants(artifact_path, input_path):
|
|
378
|
+
...
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
If an answers payload includes `target`, the positional artifact path or directory
|
|
382
|
+
may be omitted. When both are provided, they must resolve to the same path.
|
|
383
|
+
Explicit Python keyword arguments override values from `answers`. Environment
|
|
384
|
+
variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
|
|
385
|
+
values in both YAML files and mapping payloads.
|
|
386
|
+
|
|
387
|
+
For directory decorators, the wrapped function still provides the produced output
|
|
388
|
+
inventory. Matching `answers.artifacts` entries can supply titles, summaries,
|
|
389
|
+
kinds, fields, primary keys, and missing-value codes for those returned paths.
|
|
390
|
+
|
|
351
391
|
### When To Use Decorators Vs Direct Functions
|
|
352
392
|
|
|
353
393
|
If a function is only a final serializer for already-prepared data, prefer the
|
|
@@ -643,9 +683,14 @@ data-annotations annotate directory path/to/run-001 \
|
|
|
643
683
|
--group-kind plot
|
|
644
684
|
```
|
|
645
685
|
|
|
646
|
-
These commands prompt for missing details
|
|
647
|
-
|
|
648
|
-
`capture_mode="post_hoc"`.
|
|
686
|
+
These commands prompt for missing details and write `*.annotation.json` or
|
|
687
|
+
`data-annotations.json`. Post-hoc records are marked with
|
|
688
|
+
`capture_mode="post_hoc"`. README sidecars are opt-in:
|
|
689
|
+
|
|
690
|
+
```bash
|
|
691
|
+
data-annotations annotate file path/to/participants.csv --write-readme
|
|
692
|
+
data-annotations annotate directory path/to/run-001 --write-readme
|
|
693
|
+
```
|
|
649
694
|
|
|
650
695
|
For shell workflows, you can move the prompt answers into a YAML file and run
|
|
651
696
|
the command non-interactively:
|
|
@@ -742,6 +787,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
|
|
|
742
787
|
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
743
788
|
and `provenance.params`.
|
|
744
789
|
|
|
790
|
+
Answers can request selected provenance fields from the current runtime instead
|
|
791
|
+
of taking them from the payload:
|
|
792
|
+
|
|
793
|
+
```yaml
|
|
794
|
+
target: path/to/run-001
|
|
795
|
+
title: Processing outputs
|
|
796
|
+
summary: Files produced by the shell processing workflow.
|
|
797
|
+
|
|
798
|
+
provenance:
|
|
799
|
+
command: bash generate_some_data_artifact.sh
|
|
800
|
+
script: generate_some_data_artifact.sh
|
|
801
|
+
infer_from_runtime:
|
|
802
|
+
- runtime
|
|
803
|
+
- git
|
|
804
|
+
- source_code
|
|
805
|
+
```
|
|
806
|
+
|
|
807
|
+
`runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
|
|
808
|
+
covers Git commit, branch, dirty state, remote, tags, and `git describe`.
|
|
809
|
+
`source_code` leaves the source-code reference derived from runtime Git metadata.
|
|
810
|
+
This is especially useful for timestamps, host/user and SLURM context, Git state,
|
|
811
|
+
and derived `source_code`. Provide generation `command` and `script` explicitly
|
|
812
|
+
in CLI answers files, because the runtime command and script would describe the
|
|
813
|
+
`data-annotations annotate ...` invocation rather than the script that generated
|
|
814
|
+
the artifact.
|
|
815
|
+
|
|
745
816
|
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
746
817
|
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
747
818
|
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
@@ -954,4 +1025,5 @@ uv run python examples/publish_cli.py
|
|
|
954
1025
|
```
|
|
955
1026
|
|
|
956
1027
|
Each example writes its outputs to a fresh temporary directory and prints the
|
|
957
|
-
location so you can inspect the generated annotation documents and
|
|
1028
|
+
location so you can inspect the generated annotation documents and any requested
|
|
1029
|
+
README sidecars.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data-annotations"
|
|
3
|
-
version = "2.6.0"
|
|
3
|
+
version = "2.8.0"
|
|
4
4
|
description = "Annotate data artifacts with provenance and descriptions"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
@@ -9,7 +9,7 @@ authors = [
|
|
|
9
9
|
license = "BSD-3-Clause"
|
|
10
10
|
license-files = ["LICENSE"]
|
|
11
11
|
requires-python = ">=3.12"
|
|
12
|
-
dependencies = ["pydantic>=2.13.1"]
|
|
12
|
+
dependencies = ["pydantic>=2.13.1", "PyYAML>=6.0.2"]
|
|
13
13
|
keywords = ["annotations", "data", "metadata", "provenance", "reproducibility"]
|
|
14
14
|
classifiers = [
|
|
15
15
|
"Development Status :: 4 - Beta",
|
|
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
|
|
|
30
30
|
Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
|
|
31
31
|
|
|
32
32
|
[project.optional-dependencies]
|
|
33
|
-
cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
|
|
33
|
+
cli = ["questionary>=2.1.1", "typer>=0.16.0"]
|
|
34
34
|
|
|
35
35
|
[project.scripts]
|
|
36
36
|
data-annotations = "data_annotations.cli:main"
|
{data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/annotations/__init__.py
RENAMED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
from .answers import (
|
|
2
|
+
AnswersError,
|
|
3
|
+
DirectoryAnswers,
|
|
4
|
+
FileAnswers,
|
|
5
|
+
load_directory_answers,
|
|
6
|
+
load_file_answers,
|
|
7
|
+
)
|
|
1
8
|
from .models import (
|
|
2
9
|
DirectoryAnnotationDocument,
|
|
3
10
|
DirectoryAnnotationResult,
|
|
@@ -17,13 +24,18 @@ from .writers import (
|
|
|
17
24
|
__all__ = [
|
|
18
25
|
"annotate_directory",
|
|
19
26
|
"annotate_file",
|
|
27
|
+
"load_directory_answers",
|
|
28
|
+
"load_file_answers",
|
|
20
29
|
"record_directory_annotation",
|
|
21
30
|
"record_file_annotation",
|
|
22
31
|
"write_directory_annotation",
|
|
23
32
|
"write_file_annotation",
|
|
33
|
+
"AnswersError",
|
|
34
|
+
"DirectoryAnswers",
|
|
24
35
|
"DirectoryAnnotationDocument",
|
|
25
36
|
"DirectoryAnnotationResult",
|
|
26
37
|
"DirectoryArtifactSubject",
|
|
38
|
+
"FileAnswers",
|
|
27
39
|
"FileAnnotationDocument",
|
|
28
40
|
"FileAnnotationResult",
|
|
29
41
|
"FileArtifactSubject",
|
|
@@ -1,18 +1,20 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import re
|
|
3
3
|
import shlex
|
|
4
|
+
from collections.abc import Mapping
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import Any, Literal
|
|
6
|
+
from typing import Any, Literal, TypeAlias
|
|
6
7
|
|
|
7
8
|
import yaml
|
|
8
9
|
from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator
|
|
10
|
+
from pydantic import model_validator
|
|
9
11
|
|
|
10
12
|
from data_annotations.description import FieldDefinition
|
|
11
13
|
from data_annotations.provenance.models import ArtifactKind, SourceCodeReference
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
class AnswersError(ValueError):
|
|
15
|
-
"""Raised when
|
|
17
|
+
"""Raised when an annotation answers payload cannot be used."""
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
_ENV_VAR_PATTERN = re.compile(
|
|
@@ -79,6 +81,46 @@ _PROVENANCE_KEYS = {
|
|
|
79
81
|
"git_tags",
|
|
80
82
|
"git_describe",
|
|
81
83
|
"source_code",
|
|
84
|
+
"infer_from_runtime",
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
_RUNTIME_CONTEXT_FIELDS = {
|
|
88
|
+
"created_at",
|
|
89
|
+
"hostname",
|
|
90
|
+
"username",
|
|
91
|
+
"slurm_job_id",
|
|
92
|
+
}
|
|
93
|
+
_GIT_RUNTIME_FIELDS = {
|
|
94
|
+
"git_sha",
|
|
95
|
+
"git_branch",
|
|
96
|
+
"git_dirty",
|
|
97
|
+
"git_remote_name",
|
|
98
|
+
"git_remote_url",
|
|
99
|
+
"git_tags",
|
|
100
|
+
"git_describe",
|
|
101
|
+
}
|
|
102
|
+
_RUNTIME_INFERENCE_GROUPS = {
|
|
103
|
+
"runtime": _RUNTIME_CONTEXT_FIELDS,
|
|
104
|
+
"git": _GIT_RUNTIME_FIELDS,
|
|
105
|
+
}
|
|
106
|
+
_RUNTIME_INFERENCE_FIELDS = (
|
|
107
|
+
_RUNTIME_CONTEXT_FIELDS
|
|
108
|
+
| _GIT_RUNTIME_FIELDS
|
|
109
|
+
| set(_RUNTIME_INFERENCE_GROUPS)
|
|
110
|
+
| {"source_code"}
|
|
111
|
+
)
|
|
112
|
+
_UNSUPPORTED_RUNTIME_INFERENCE_FIELDS = {
|
|
113
|
+
"command",
|
|
114
|
+
"script",
|
|
115
|
+
"script_repo_path",
|
|
116
|
+
}
|
|
117
|
+
_EXPLICIT_PROVENANCE_OVERRIDE_FIELDS = {
|
|
118
|
+
"command",
|
|
119
|
+
"script",
|
|
120
|
+
"script_repo_path",
|
|
121
|
+
"function",
|
|
122
|
+
*_GIT_RUNTIME_FIELDS,
|
|
123
|
+
"source_code",
|
|
82
124
|
}
|
|
83
125
|
|
|
84
126
|
|
|
@@ -97,6 +139,50 @@ class ProvenanceAnswers(BaseModel):
|
|
|
97
139
|
git_tags: list[str] | None = None
|
|
98
140
|
git_describe: str | None = None
|
|
99
141
|
source_code: SourceCodeReference | None = None
|
|
142
|
+
infer_from_runtime: list[str] = Field(default_factory=list)
|
|
143
|
+
|
|
144
|
+
@field_validator("infer_from_runtime", mode="before")
|
|
145
|
+
@classmethod
|
|
146
|
+
def _coerce_runtime_inference_fields(cls, value: Any) -> Any:
|
|
147
|
+
if value is None:
|
|
148
|
+
return []
|
|
149
|
+
if isinstance(value, str):
|
|
150
|
+
return [value]
|
|
151
|
+
return value
|
|
152
|
+
|
|
153
|
+
@field_validator("infer_from_runtime")
|
|
154
|
+
@classmethod
|
|
155
|
+
def _validate_runtime_inference_fields(cls, values: list[str]) -> list[str]:
|
|
156
|
+
normalized: list[str] = []
|
|
157
|
+
for value in values:
|
|
158
|
+
if value in _UNSUPPORTED_RUNTIME_INFERENCE_FIELDS:
|
|
159
|
+
raise ValueError(
|
|
160
|
+
"runtime inference is not supported for "
|
|
161
|
+
f"provenance.{value}; provide it explicitly"
|
|
162
|
+
)
|
|
163
|
+
if value not in _RUNTIME_INFERENCE_FIELDS:
|
|
164
|
+
allowed = sorted(_RUNTIME_INFERENCE_FIELDS)
|
|
165
|
+
raise ValueError(
|
|
166
|
+
f"unknown runtime inference field {value!r}; "
|
|
167
|
+
"expected one of: " + ", ".join(allowed)
|
|
168
|
+
)
|
|
169
|
+
if value not in normalized:
|
|
170
|
+
normalized.append(value)
|
|
171
|
+
return normalized
|
|
172
|
+
|
|
173
|
+
@model_validator(mode="after")
|
|
174
|
+
def _validate_runtime_inference_conflicts(self) -> "ProvenanceAnswers":
|
|
175
|
+
inferred = self.runtime_inference_fields()
|
|
176
|
+
conflicts = sorted(
|
|
177
|
+
field
|
|
178
|
+
for field in inferred & _EXPLICIT_PROVENANCE_OVERRIDE_FIELDS
|
|
179
|
+
if field in self.model_fields_set
|
|
180
|
+
)
|
|
181
|
+
if conflicts:
|
|
182
|
+
raise ValueError(
|
|
183
|
+
"cannot both set and infer provenance field(s): " + ", ".join(conflicts)
|
|
184
|
+
)
|
|
185
|
+
return self
|
|
100
186
|
|
|
101
187
|
def command_tokens(self) -> list[str] | None:
|
|
102
188
|
if self.command is None:
|
|
@@ -108,6 +194,12 @@ class ProvenanceAnswers(BaseModel):
|
|
|
108
194
|
except ValueError as exc:
|
|
109
195
|
raise AnswersError(f"invalid provenance.command: {exc}") from exc
|
|
110
196
|
|
|
197
|
+
def runtime_inference_fields(self) -> set[str]:
|
|
198
|
+
fields: set[str] = set()
|
|
199
|
+
for field in self.infer_from_runtime:
|
|
200
|
+
fields.update(_RUNTIME_INFERENCE_GROUPS.get(field, {field}))
|
|
201
|
+
return fields
|
|
202
|
+
|
|
111
203
|
|
|
112
204
|
class BaseAnswers(BaseModel):
|
|
113
205
|
model_config = ConfigDict(extra="forbid")
|
|
@@ -177,12 +269,16 @@ class DirectoryAnswers(BaseAnswers):
|
|
|
177
269
|
checksums: dict[str, str] = Field(default_factory=dict)
|
|
178
270
|
|
|
179
271
|
|
|
180
|
-
|
|
181
|
-
|
|
272
|
+
FileAnswersInput: TypeAlias = str | Path | Mapping[str, Any] | FileAnswers
|
|
273
|
+
DirectoryAnswersInput: TypeAlias = str | Path | Mapping[str, Any] | DirectoryAnswers
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def load_file_answers(source: FileAnswersInput) -> FileAnswers:
|
|
277
|
+
return _validate_answers(source, mode="file")
|
|
182
278
|
|
|
183
279
|
|
|
184
|
-
def load_directory_answers(
|
|
185
|
-
return _validate_answers(
|
|
280
|
+
def load_directory_answers(source: DirectoryAnswersInput) -> DirectoryAnswers:
|
|
281
|
+
return _validate_answers(source, mode="directory")
|
|
186
282
|
|
|
187
283
|
|
|
188
284
|
def check_answers(path: str | Path) -> tuple[Literal["file", "directory"], Path]:
|
|
@@ -230,17 +326,29 @@ def require_complete_directory_answers(
|
|
|
230
326
|
|
|
231
327
|
|
|
232
328
|
def _validate_answers(
|
|
233
|
-
|
|
329
|
+
source: FileAnswersInput | DirectoryAnswersInput,
|
|
234
330
|
*,
|
|
235
331
|
mode: Literal["file", "directory"],
|
|
236
332
|
) -> Any:
|
|
237
|
-
|
|
333
|
+
if mode == "file" and isinstance(source, FileAnswers):
|
|
334
|
+
return source
|
|
335
|
+
if mode == "directory" and isinstance(source, DirectoryAnswers):
|
|
336
|
+
return source
|
|
337
|
+
|
|
338
|
+
normalized = _normalize_answers(_load_raw_answers(source))
|
|
238
339
|
model = FileAnswers if mode == "file" else DirectoryAnswers
|
|
239
340
|
return _model_validate(model, normalized)
|
|
240
341
|
|
|
241
342
|
|
|
242
|
-
def _load_raw_answers(
|
|
243
|
-
|
|
343
|
+
def _load_raw_answers(
|
|
344
|
+
source: str | Path | Mapping[str, Any] | BaseAnswers,
|
|
345
|
+
) -> dict[str, Any]:
|
|
346
|
+
if isinstance(source, BaseAnswers):
|
|
347
|
+
return source.model_dump()
|
|
348
|
+
if isinstance(source, Mapping):
|
|
349
|
+
return _expand_env_vars(dict(source), path="$")
|
|
350
|
+
|
|
351
|
+
answers_path = Path(source).expanduser()
|
|
244
352
|
if not answers_path.is_file():
|
|
245
353
|
raise AnswersError(f"answers file not found: {answers_path}")
|
|
246
354
|
try:
|
|
@@ -416,3 +524,22 @@ def _missing_required_common_fields(
|
|
|
416
524
|
|
|
417
525
|
def _has_text(value: Any) -> bool:
|
|
418
526
|
return isinstance(value, str) and bool(value.strip())
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
__all__ = [
|
|
530
|
+
"AnswersError",
|
|
531
|
+
"BaseAnswers",
|
|
532
|
+
"ChildBundleAnswers",
|
|
533
|
+
"DirectoryAnswers",
|
|
534
|
+
"DirectoryAnswersInput",
|
|
535
|
+
"DirectoryArtifactAnswers",
|
|
536
|
+
"DirectoryArtifactGroupAnswers",
|
|
537
|
+
"FileAnswers",
|
|
538
|
+
"FileAnswersInput",
|
|
539
|
+
"ProvenanceAnswers",
|
|
540
|
+
"check_answers",
|
|
541
|
+
"load_directory_answers",
|
|
542
|
+
"load_file_answers",
|
|
543
|
+
"require_complete_directory_answers",
|
|
544
|
+
"require_complete_file_answers",
|
|
545
|
+
]
|