data-annotations 2.6.0__tar.gz → 2.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_annotations-2.6.0 → data_annotations-2.7.0}/PKG-INFO +64 -4
- {data_annotations-2.6.0 → data_annotations-2.7.0}/README.md +62 -2
- {data_annotations-2.6.0 → data_annotations-2.7.0}/pyproject.toml +3 -3
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/__init__.py +12 -0
- {data_annotations-2.6.0/src/data_annotations/cli_app → data_annotations-2.7.0/src/data_annotations/annotations}/answers.py +137 -10
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/decorators.py +86 -7
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/writers.py +285 -29
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/annotate/helpers.py +77 -2
- data_annotations-2.7.0/src/data_annotations/cli_app/answers.py +35 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/LICENSE +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/_decorators.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/models.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/annotate/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/common.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/prompts.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/provenance_commands.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/publish.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/decorators.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/models.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/writers.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/decorators.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/git.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/models.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/__init__.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/chain.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/manifest.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/matching.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/sources.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/types.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/runtime.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/writers.py +0 -0
- {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/publish.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-annotations
|
|
3
|
-
Version: 2.6.0
|
|
3
|
+
Version: 2.7.0
|
|
4
4
|
Summary: Annotate data artifacts with provenance and descriptions
|
|
5
5
|
Keywords: annotations,data,metadata,provenance,reproducibility
|
|
6
6
|
Author: Rodrigo C. G. Pena
|
|
@@ -18,7 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
18
18
|
Classifier: Topic :: Scientific/Engineering
|
|
19
19
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
20
|
Requires-Dist: pydantic>=2.13.1
|
|
21
|
-
Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
|
|
21
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
22
22
|
Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
|
|
23
23
|
Requires-Dist: typer>=0.16.0 ; extra == 'cli'
|
|
24
24
|
Requires-Python: >=3.12
|
|
@@ -71,8 +71,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
|
|
|
71
71
|
uv add data-annotations
|
|
72
72
|
```
|
|
73
73
|
|
|
74
|
-
The command-line interface uses optional dependencies
|
|
75
|
-
CLI support when you want to run
|
|
74
|
+
The command-line interface uses optional dependencies for prompting and command
|
|
75
|
+
parsing. Install the package with CLI support when you want to run
|
|
76
|
+
`data-annotations` commands:
|
|
76
77
|
|
|
77
78
|
```bash
|
|
78
79
|
pip install "data-annotations[cli]"
|
|
@@ -378,6 +379,39 @@ metadata to vary per call instead of staying fixed at decoration time, use
|
|
|
378
379
|
`write_directory_annotation(...)` directly instead. See the example gallery in
|
|
379
380
|
`examples/` for runnable examples of all approaches.
|
|
380
381
|
|
|
382
|
+
The Python API can also load the same YAML answers payloads used by the
|
|
383
|
+
CLI:
|
|
384
|
+
|
|
385
|
+
```python
|
|
386
|
+
from data_annotations.annotations import (
|
|
387
|
+
annotate_directory,
|
|
388
|
+
annotate_file,
|
|
389
|
+
record_file_annotation,
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
annotate_file(answers="participants.yaml")
|
|
393
|
+
annotate_directory(answers="run-001.yaml")
|
|
394
|
+
|
|
395
|
+
annotate_file(
|
|
396
|
+
"outputs/summary.txt",
|
|
397
|
+
answers={"title": "Run Summary", "summary": "Validation run summary."},
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
@record_file_annotation(answers="participants.yaml")
|
|
401
|
+
def write_participants(artifact_path, input_path):
|
|
402
|
+
...
|
|
403
|
+
```
|
|
404
|
+
|
|
405
|
+
If an answers payload includes `target`, the positional artifact path or directory
|
|
406
|
+
may be omitted. When both are provided, they must resolve to the same path.
|
|
407
|
+
Explicit Python keyword arguments override values from `answers`. Environment
|
|
408
|
+
variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
|
|
409
|
+
values in both YAML files and mapping payloads.
|
|
410
|
+
|
|
411
|
+
For directory decorators, the wrapped function still provides the produced output
|
|
412
|
+
inventory. Matching `answers.artifacts` entries can supply titles, summaries,
|
|
413
|
+
kinds, fields, primary keys, and missing-value codes for those returned paths.
|
|
414
|
+
|
|
381
415
|
### When To Use Decorators Vs Direct Functions
|
|
382
416
|
|
|
383
417
|
If a function is only a final serializer for already-prepared data, prefer the
|
|
@@ -772,6 +806,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
|
|
|
772
806
|
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
773
807
|
and `provenance.params`.
|
|
774
808
|
|
|
809
|
+
Answers can request selected provenance fields from the current runtime instead
|
|
810
|
+
of taking them from the payload:
|
|
811
|
+
|
|
812
|
+
```yaml
|
|
813
|
+
target: path/to/run-001
|
|
814
|
+
title: Processing outputs
|
|
815
|
+
summary: Files produced by the shell processing workflow.
|
|
816
|
+
|
|
817
|
+
provenance:
|
|
818
|
+
command: bash generate_some_data_artifact.sh
|
|
819
|
+
script: generate_some_data_artifact.sh
|
|
820
|
+
infer_from_runtime:
|
|
821
|
+
- runtime
|
|
822
|
+
- git
|
|
823
|
+
- source_code
|
|
824
|
+
```
|
|
825
|
+
|
|
826
|
+
`runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
|
|
827
|
+
covers Git commit, branch, dirty state, remote, tags, and `git describe`.
|
|
828
|
+
`source_code` leaves the source-code reference derived from runtime Git metadata.
|
|
829
|
+
This is especially useful for timestamps, host/user and SLURM context, Git state,
|
|
830
|
+
and derived `source_code`. Provide generation `command` and `script` explicitly
|
|
831
|
+
in CLI answers files, because the runtime command and script would describe the
|
|
832
|
+
`data-annotations annotate ...` invocation rather than the script that generated
|
|
833
|
+
the artifact.
|
|
834
|
+
|
|
775
835
|
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
776
836
|
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
777
837
|
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
@@ -41,8 +41,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
|
|
|
41
41
|
uv add data-annotations
|
|
42
42
|
```
|
|
43
43
|
|
|
44
|
-
The command-line interface uses optional dependencies
|
|
45
|
-
CLI support when you want to run
|
|
44
|
+
The command-line interface uses optional dependencies for prompting and command
|
|
45
|
+
parsing. Install the package with CLI support when you want to run
|
|
46
|
+
`data-annotations` commands:
|
|
46
47
|
|
|
47
48
|
```bash
|
|
48
49
|
pip install "data-annotations[cli]"
|
|
@@ -348,6 +349,39 @@ metadata to vary per call instead of staying fixed at decoration time, use
|
|
|
348
349
|
`write_directory_annotation(...)` directly instead. See the example gallery in
|
|
349
350
|
`examples/` for runnable examples of all approaches.
|
|
350
351
|
|
|
352
|
+
The Python API can also load the same YAML answers payloads used by the
|
|
353
|
+
CLI:
|
|
354
|
+
|
|
355
|
+
```python
|
|
356
|
+
from data_annotations.annotations import (
|
|
357
|
+
annotate_directory,
|
|
358
|
+
annotate_file,
|
|
359
|
+
record_file_annotation,
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
annotate_file(answers="participants.yaml")
|
|
363
|
+
annotate_directory(answers="run-001.yaml")
|
|
364
|
+
|
|
365
|
+
annotate_file(
|
|
366
|
+
"outputs/summary.txt",
|
|
367
|
+
answers={"title": "Run Summary", "summary": "Validation run summary."},
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
@record_file_annotation(answers="participants.yaml")
|
|
371
|
+
def write_participants(artifact_path, input_path):
|
|
372
|
+
...
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
If an answers payload includes `target`, the positional artifact path or directory
|
|
376
|
+
may be omitted. When both are provided, they must resolve to the same path.
|
|
377
|
+
Explicit Python keyword arguments override values from `answers`. Environment
|
|
378
|
+
variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
|
|
379
|
+
values in both YAML files and mapping payloads.
|
|
380
|
+
|
|
381
|
+
For directory decorators, the wrapped function still provides the produced output
|
|
382
|
+
inventory. Matching `answers.artifacts` entries can supply titles, summaries,
|
|
383
|
+
kinds, fields, primary keys, and missing-value codes for those returned paths.
|
|
384
|
+
|
|
351
385
|
### When To Use Decorators Vs Direct Functions
|
|
352
386
|
|
|
353
387
|
If a function is only a final serializer for already-prepared data, prefer the
|
|
@@ -742,6 +776,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
|
|
|
742
776
|
`description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
|
|
743
777
|
and `provenance.params`.
|
|
744
778
|
|
|
779
|
+
Answers can request selected provenance fields from the current runtime instead
|
|
780
|
+
of taking them from the payload:
|
|
781
|
+
|
|
782
|
+
```yaml
|
|
783
|
+
target: path/to/run-001
|
|
784
|
+
title: Processing outputs
|
|
785
|
+
summary: Files produced by the shell processing workflow.
|
|
786
|
+
|
|
787
|
+
provenance:
|
|
788
|
+
command: bash generate_some_data_artifact.sh
|
|
789
|
+
script: generate_some_data_artifact.sh
|
|
790
|
+
infer_from_runtime:
|
|
791
|
+
- runtime
|
|
792
|
+
- git
|
|
793
|
+
- source_code
|
|
794
|
+
```
|
|
795
|
+
|
|
796
|
+
`runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
|
|
797
|
+
covers Git commit, branch, dirty state, remote, tags, and `git describe`.
|
|
798
|
+
`source_code` leaves the source-code reference derived from runtime Git metadata.
|
|
799
|
+
This is especially useful for timestamps, host/user and SLURM context, Git state,
|
|
800
|
+
and derived `source_code`. Provide generation `command` and `script` explicitly
|
|
801
|
+
in CLI answers files, because the runtime command and script would describe the
|
|
802
|
+
`data-annotations annotate ...` invocation rather than the script that generated
|
|
803
|
+
the artifact.
|
|
804
|
+
|
|
745
805
|
For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
|
|
746
806
|
`file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
|
|
747
807
|
sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data-annotations"
|
|
3
|
-
version = "2.6.0"
|
|
3
|
+
version = "2.7.0"
|
|
4
4
|
description = "Annotate data artifacts with provenance and descriptions"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
@@ -9,7 +9,7 @@ authors = [
|
|
|
9
9
|
license = "BSD-3-Clause"
|
|
10
10
|
license-files = ["LICENSE"]
|
|
11
11
|
requires-python = ">=3.12"
|
|
12
|
-
dependencies = ["pydantic>=2.13.1"]
|
|
12
|
+
dependencies = ["pydantic>=2.13.1", "PyYAML>=6.0.2"]
|
|
13
13
|
keywords = ["annotations", "data", "metadata", "provenance", "reproducibility"]
|
|
14
14
|
classifiers = [
|
|
15
15
|
"Development Status :: 4 - Beta",
|
|
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
|
|
|
30
30
|
Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
|
|
31
31
|
|
|
32
32
|
[project.optional-dependencies]
|
|
33
|
-
cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
|
|
33
|
+
cli = ["questionary>=2.1.1", "typer>=0.16.0"]
|
|
34
34
|
|
|
35
35
|
[project.scripts]
|
|
36
36
|
data-annotations = "data_annotations.cli:main"
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/__init__.py
RENAMED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
from .answers import (
|
|
2
|
+
AnswersError,
|
|
3
|
+
DirectoryAnswers,
|
|
4
|
+
FileAnswers,
|
|
5
|
+
load_directory_answers,
|
|
6
|
+
load_file_answers,
|
|
7
|
+
)
|
|
1
8
|
from .models import (
|
|
2
9
|
DirectoryAnnotationDocument,
|
|
3
10
|
DirectoryAnnotationResult,
|
|
@@ -17,13 +24,18 @@ from .writers import (
|
|
|
17
24
|
__all__ = [
|
|
18
25
|
"annotate_directory",
|
|
19
26
|
"annotate_file",
|
|
27
|
+
"load_directory_answers",
|
|
28
|
+
"load_file_answers",
|
|
20
29
|
"record_directory_annotation",
|
|
21
30
|
"record_file_annotation",
|
|
22
31
|
"write_directory_annotation",
|
|
23
32
|
"write_file_annotation",
|
|
33
|
+
"AnswersError",
|
|
34
|
+
"DirectoryAnswers",
|
|
24
35
|
"DirectoryAnnotationDocument",
|
|
25
36
|
"DirectoryAnnotationResult",
|
|
26
37
|
"DirectoryArtifactSubject",
|
|
38
|
+
"FileAnswers",
|
|
27
39
|
"FileAnnotationDocument",
|
|
28
40
|
"FileAnnotationResult",
|
|
29
41
|
"FileArtifactSubject",
|
|
@@ -1,18 +1,20 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import re
|
|
3
3
|
import shlex
|
|
4
|
+
from collections.abc import Mapping
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import Any, Literal
|
|
6
|
+
from typing import Any, Literal, TypeAlias
|
|
6
7
|
|
|
7
8
|
import yaml
|
|
8
9
|
from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator
|
|
10
|
+
from pydantic import model_validator
|
|
9
11
|
|
|
10
12
|
from data_annotations.description import FieldDefinition
|
|
11
13
|
from data_annotations.provenance.models import ArtifactKind, SourceCodeReference
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
class AnswersError(ValueError):
|
|
15
|
-
"""Raised when
|
|
17
|
+
"""Raised when an annotation answers payload cannot be used."""
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
_ENV_VAR_PATTERN = re.compile(
|
|
@@ -79,6 +81,46 @@ _PROVENANCE_KEYS = {
|
|
|
79
81
|
"git_tags",
|
|
80
82
|
"git_describe",
|
|
81
83
|
"source_code",
|
|
84
|
+
"infer_from_runtime",
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
_RUNTIME_CONTEXT_FIELDS = {
|
|
88
|
+
"created_at",
|
|
89
|
+
"hostname",
|
|
90
|
+
"username",
|
|
91
|
+
"slurm_job_id",
|
|
92
|
+
}
|
|
93
|
+
_GIT_RUNTIME_FIELDS = {
|
|
94
|
+
"git_sha",
|
|
95
|
+
"git_branch",
|
|
96
|
+
"git_dirty",
|
|
97
|
+
"git_remote_name",
|
|
98
|
+
"git_remote_url",
|
|
99
|
+
"git_tags",
|
|
100
|
+
"git_describe",
|
|
101
|
+
}
|
|
102
|
+
_RUNTIME_INFERENCE_GROUPS = {
|
|
103
|
+
"runtime": _RUNTIME_CONTEXT_FIELDS,
|
|
104
|
+
"git": _GIT_RUNTIME_FIELDS,
|
|
105
|
+
}
|
|
106
|
+
_RUNTIME_INFERENCE_FIELDS = (
|
|
107
|
+
_RUNTIME_CONTEXT_FIELDS
|
|
108
|
+
| _GIT_RUNTIME_FIELDS
|
|
109
|
+
| set(_RUNTIME_INFERENCE_GROUPS)
|
|
110
|
+
| {"source_code"}
|
|
111
|
+
)
|
|
112
|
+
_UNSUPPORTED_RUNTIME_INFERENCE_FIELDS = {
|
|
113
|
+
"command",
|
|
114
|
+
"script",
|
|
115
|
+
"script_repo_path",
|
|
116
|
+
}
|
|
117
|
+
_EXPLICIT_PROVENANCE_OVERRIDE_FIELDS = {
|
|
118
|
+
"command",
|
|
119
|
+
"script",
|
|
120
|
+
"script_repo_path",
|
|
121
|
+
"function",
|
|
122
|
+
*_GIT_RUNTIME_FIELDS,
|
|
123
|
+
"source_code",
|
|
82
124
|
}
|
|
83
125
|
|
|
84
126
|
|
|
@@ -97,6 +139,50 @@ class ProvenanceAnswers(BaseModel):
|
|
|
97
139
|
git_tags: list[str] | None = None
|
|
98
140
|
git_describe: str | None = None
|
|
99
141
|
source_code: SourceCodeReference | None = None
|
|
142
|
+
infer_from_runtime: list[str] = Field(default_factory=list)
|
|
143
|
+
|
|
144
|
+
@field_validator("infer_from_runtime", mode="before")
|
|
145
|
+
@classmethod
|
|
146
|
+
def _coerce_runtime_inference_fields(cls, value: Any) -> Any:
|
|
147
|
+
if value is None:
|
|
148
|
+
return []
|
|
149
|
+
if isinstance(value, str):
|
|
150
|
+
return [value]
|
|
151
|
+
return value
|
|
152
|
+
|
|
153
|
+
@field_validator("infer_from_runtime")
|
|
154
|
+
@classmethod
|
|
155
|
+
def _validate_runtime_inference_fields(cls, values: list[str]) -> list[str]:
|
|
156
|
+
normalized: list[str] = []
|
|
157
|
+
for value in values:
|
|
158
|
+
if value in _UNSUPPORTED_RUNTIME_INFERENCE_FIELDS:
|
|
159
|
+
raise ValueError(
|
|
160
|
+
"runtime inference is not supported for "
|
|
161
|
+
f"provenance.{value}; provide it explicitly"
|
|
162
|
+
)
|
|
163
|
+
if value not in _RUNTIME_INFERENCE_FIELDS:
|
|
164
|
+
allowed = sorted(_RUNTIME_INFERENCE_FIELDS)
|
|
165
|
+
raise ValueError(
|
|
166
|
+
f"unknown runtime inference field {value!r}; "
|
|
167
|
+
"expected one of: " + ", ".join(allowed)
|
|
168
|
+
)
|
|
169
|
+
if value not in normalized:
|
|
170
|
+
normalized.append(value)
|
|
171
|
+
return normalized
|
|
172
|
+
|
|
173
|
+
@model_validator(mode="after")
|
|
174
|
+
def _validate_runtime_inference_conflicts(self) -> "ProvenanceAnswers":
|
|
175
|
+
inferred = self.runtime_inference_fields()
|
|
176
|
+
conflicts = sorted(
|
|
177
|
+
field
|
|
178
|
+
for field in inferred & _EXPLICIT_PROVENANCE_OVERRIDE_FIELDS
|
|
179
|
+
if field in self.model_fields_set
|
|
180
|
+
)
|
|
181
|
+
if conflicts:
|
|
182
|
+
raise ValueError(
|
|
183
|
+
"cannot both set and infer provenance field(s): " + ", ".join(conflicts)
|
|
184
|
+
)
|
|
185
|
+
return self
|
|
100
186
|
|
|
101
187
|
def command_tokens(self) -> list[str] | None:
|
|
102
188
|
if self.command is None:
|
|
@@ -108,6 +194,12 @@ class ProvenanceAnswers(BaseModel):
|
|
|
108
194
|
except ValueError as exc:
|
|
109
195
|
raise AnswersError(f"invalid provenance.command: {exc}") from exc
|
|
110
196
|
|
|
197
|
+
def runtime_inference_fields(self) -> set[str]:
|
|
198
|
+
fields: set[str] = set()
|
|
199
|
+
for field in self.infer_from_runtime:
|
|
200
|
+
fields.update(_RUNTIME_INFERENCE_GROUPS.get(field, {field}))
|
|
201
|
+
return fields
|
|
202
|
+
|
|
111
203
|
|
|
112
204
|
class BaseAnswers(BaseModel):
|
|
113
205
|
model_config = ConfigDict(extra="forbid")
|
|
@@ -177,12 +269,16 @@ class DirectoryAnswers(BaseAnswers):
|
|
|
177
269
|
checksums: dict[str, str] = Field(default_factory=dict)
|
|
178
270
|
|
|
179
271
|
|
|
180
|
-
|
|
181
|
-
|
|
272
|
+
FileAnswersInput: TypeAlias = str | Path | Mapping[str, Any] | FileAnswers
|
|
273
|
+
DirectoryAnswersInput: TypeAlias = str | Path | Mapping[str, Any] | DirectoryAnswers
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def load_file_answers(source: FileAnswersInput) -> FileAnswers:
|
|
277
|
+
return _validate_answers(source, mode="file")
|
|
182
278
|
|
|
183
279
|
|
|
184
|
-
def load_directory_answers(
|
|
185
|
-
return _validate_answers(
|
|
280
|
+
def load_directory_answers(source: DirectoryAnswersInput) -> DirectoryAnswers:
|
|
281
|
+
return _validate_answers(source, mode="directory")
|
|
186
282
|
|
|
187
283
|
|
|
188
284
|
def check_answers(path: str | Path) -> tuple[Literal["file", "directory"], Path]:
|
|
@@ -230,17 +326,29 @@ def require_complete_directory_answers(
|
|
|
230
326
|
|
|
231
327
|
|
|
232
328
|
def _validate_answers(
|
|
233
|
-
|
|
329
|
+
source: FileAnswersInput | DirectoryAnswersInput,
|
|
234
330
|
*,
|
|
235
331
|
mode: Literal["file", "directory"],
|
|
236
332
|
) -> Any:
|
|
237
|
-
|
|
333
|
+
if mode == "file" and isinstance(source, FileAnswers):
|
|
334
|
+
return source
|
|
335
|
+
if mode == "directory" and isinstance(source, DirectoryAnswers):
|
|
336
|
+
return source
|
|
337
|
+
|
|
338
|
+
normalized = _normalize_answers(_load_raw_answers(source))
|
|
238
339
|
model = FileAnswers if mode == "file" else DirectoryAnswers
|
|
239
340
|
return _model_validate(model, normalized)
|
|
240
341
|
|
|
241
342
|
|
|
242
|
-
def _load_raw_answers(
|
|
243
|
-
|
|
343
|
+
def _load_raw_answers(
|
|
344
|
+
source: str | Path | Mapping[str, Any] | BaseAnswers,
|
|
345
|
+
) -> dict[str, Any]:
|
|
346
|
+
if isinstance(source, BaseAnswers):
|
|
347
|
+
return source.model_dump()
|
|
348
|
+
if isinstance(source, Mapping):
|
|
349
|
+
return _expand_env_vars(dict(source), path="$")
|
|
350
|
+
|
|
351
|
+
answers_path = Path(source).expanduser()
|
|
244
352
|
if not answers_path.is_file():
|
|
245
353
|
raise AnswersError(f"answers file not found: {answers_path}")
|
|
246
354
|
try:
|
|
@@ -416,3 +524,22 @@ def _missing_required_common_fields(
|
|
|
416
524
|
|
|
417
525
|
def _has_text(value: Any) -> bool:
|
|
418
526
|
return isinstance(value, str) and bool(value.strip())
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
__all__ = [
|
|
530
|
+
"AnswersError",
|
|
531
|
+
"BaseAnswers",
|
|
532
|
+
"ChildBundleAnswers",
|
|
533
|
+
"DirectoryAnswers",
|
|
534
|
+
"DirectoryAnswersInput",
|
|
535
|
+
"DirectoryArtifactAnswers",
|
|
536
|
+
"DirectoryArtifactGroupAnswers",
|
|
537
|
+
"FileAnswers",
|
|
538
|
+
"FileAnswersInput",
|
|
539
|
+
"ProvenanceAnswers",
|
|
540
|
+
"check_answers",
|
|
541
|
+
"load_directory_answers",
|
|
542
|
+
"load_file_answers",
|
|
543
|
+
"require_complete_directory_answers",
|
|
544
|
+
"require_complete_file_answers",
|
|
545
|
+
]
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/decorators.py
RENAMED
|
@@ -17,11 +17,81 @@ from data_annotations.description.models import DocumentedArtifact, FieldDefinit
|
|
|
17
17
|
from data_annotations.provenance import writers as provenance_writers
|
|
18
18
|
from data_annotations.provenance.models import ArtifactKind, ChecksumPolicy
|
|
19
19
|
|
|
20
|
+
from . import answers as answer_payloads
|
|
20
21
|
from .writers import annotate_directory, annotate_file
|
|
21
22
|
|
|
22
23
|
|
|
24
|
+
def _documented_artifact_from_answer(
|
|
25
|
+
artifact: answer_payloads.DirectoryArtifactAnswers,
|
|
26
|
+
) -> DocumentedArtifact:
|
|
27
|
+
return DocumentedArtifact(
|
|
28
|
+
path=artifact.path,
|
|
29
|
+
kind=artifact.kind,
|
|
30
|
+
title=artifact.title,
|
|
31
|
+
summary=artifact.summary,
|
|
32
|
+
fields=list(artifact.fields),
|
|
33
|
+
primary_key=list(artifact.primary_key),
|
|
34
|
+
missing_value_codes=dict(artifact.missing_value_codes),
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _directory_relative_label(path: str | Path, output_dir: Path) -> str:
|
|
39
|
+
resolved_path = provenance_writers._resolve_directory_member_path(path, output_dir)
|
|
40
|
+
return provenance_writers._directory_relative_label(resolved_path, output_dir)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _merge_artifact_answer(
|
|
44
|
+
artifact: DocumentedArtifact,
|
|
45
|
+
answer: answer_payloads.DirectoryArtifactAnswers,
|
|
46
|
+
) -> DocumentedArtifact:
|
|
47
|
+
updates: dict[str, Any] = {}
|
|
48
|
+
if "kind" in answer.model_fields_set:
|
|
49
|
+
updates["kind"] = answer.kind
|
|
50
|
+
if "title" in answer.model_fields_set:
|
|
51
|
+
updates["title"] = answer.title
|
|
52
|
+
if "summary" in answer.model_fields_set:
|
|
53
|
+
updates["summary"] = answer.summary
|
|
54
|
+
if "fields" in answer.model_fields_set:
|
|
55
|
+
updates["fields"] = list(answer.fields)
|
|
56
|
+
if "primary_key" in answer.model_fields_set:
|
|
57
|
+
updates["primary_key"] = list(answer.primary_key)
|
|
58
|
+
if "missing_value_codes" in answer.model_fields_set:
|
|
59
|
+
updates["missing_value_codes"] = dict(answer.missing_value_codes)
|
|
60
|
+
return artifact.model_copy(update=updates)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _merge_directory_artifacts_from_answers(
|
|
64
|
+
artifacts: list[DocumentedArtifact],
|
|
65
|
+
answers: answer_payloads.DirectoryAnswers | None,
|
|
66
|
+
*,
|
|
67
|
+
output_dir: Path,
|
|
68
|
+
) -> list[DocumentedArtifact]:
|
|
69
|
+
if answers is None or not answers.artifacts:
|
|
70
|
+
return artifacts
|
|
71
|
+
|
|
72
|
+
answer_by_path = {
|
|
73
|
+
_directory_relative_label(answer.path, output_dir): answer
|
|
74
|
+
for answer in answers.artifacts
|
|
75
|
+
}
|
|
76
|
+
if not artifacts:
|
|
77
|
+
return [
|
|
78
|
+
_documented_artifact_from_answer(answer) for answer in answers.artifacts
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
merged: list[DocumentedArtifact] = []
|
|
82
|
+
for artifact in artifacts:
|
|
83
|
+
answer = answer_by_path.get(
|
|
84
|
+
_directory_relative_label(artifact.path, output_dir)
|
|
85
|
+
)
|
|
86
|
+
merged.append(
|
|
87
|
+
_merge_artifact_answer(artifact, answer) if answer is not None else artifact
|
|
88
|
+
)
|
|
89
|
+
return merged
|
|
90
|
+
|
|
91
|
+
|
|
23
92
|
def record_file_annotation(
|
|
24
93
|
*,
|
|
94
|
+
answers: answer_payloads.FileAnswersInput | None = None,
|
|
25
95
|
artifact_path_arg: str = "artifact_path",
|
|
26
96
|
input_args: tuple[str, ...] = DEFAULT_INPUT_ARGS,
|
|
27
97
|
title: str | None = None,
|
|
@@ -31,10 +101,9 @@ def record_file_annotation(
|
|
|
31
101
|
missing_value_codes: dict[str, str] | None = None,
|
|
32
102
|
acquisition_context: dict[str, Any] | None = None,
|
|
33
103
|
generation_context: dict[str, Any] | None = None,
|
|
34
|
-
artifact_kind: ArtifactKind =
|
|
104
|
+
artifact_kind: ArtifactKind | None = None,
|
|
35
105
|
artifact_sha256: str | None = None,
|
|
36
106
|
write_readme: bool = True,
|
|
37
|
-
write_schema: bool | None = None,
|
|
38
107
|
annotation_suffix: str = ".annotation.json",
|
|
39
108
|
readme_suffix: str = ".README.md",
|
|
40
109
|
checksum_policy: ChecksumPolicy = "auto",
|
|
@@ -67,6 +136,7 @@ def record_file_annotation(
|
|
|
67
136
|
inputs = extract_inputs(bound, input_args=input_args)
|
|
68
137
|
annotate_file(
|
|
69
138
|
artifact_path,
|
|
139
|
+
answers=answers,
|
|
70
140
|
title=title,
|
|
71
141
|
summary=summary,
|
|
72
142
|
fields=fields,
|
|
@@ -80,7 +150,6 @@ def record_file_annotation(
|
|
|
80
150
|
inputs=inputs,
|
|
81
151
|
function=fn,
|
|
82
152
|
write_readme=write_readme,
|
|
83
|
-
write_schema=write_schema,
|
|
84
153
|
annotation_suffix=annotation_suffix,
|
|
85
154
|
readme_suffix=readme_suffix,
|
|
86
155
|
checksum_policy=checksum_policy,
|
|
@@ -96,6 +165,7 @@ def record_file_annotation(
|
|
|
96
165
|
|
|
97
166
|
def record_directory_annotation(
|
|
98
167
|
*,
|
|
168
|
+
answers: answer_payloads.DirectoryAnswersInput | None = None,
|
|
99
169
|
output_dir_arg: str = "output_dir",
|
|
100
170
|
input_args: tuple[str, ...] = DEFAULT_INPUT_ARGS,
|
|
101
171
|
title: str | None = None,
|
|
@@ -103,7 +173,6 @@ def record_directory_annotation(
|
|
|
103
173
|
acquisition_context: dict[str, Any] | None = None,
|
|
104
174
|
generation_context: dict[str, Any] | None = None,
|
|
105
175
|
write_readme: bool = True,
|
|
106
|
-
write_schema: bool | None = None,
|
|
107
176
|
annotation_filename: str = "data-annotations.json",
|
|
108
177
|
readme_filename: str = "README.md",
|
|
109
178
|
checksum_policy: ChecksumPolicy = "auto",
|
|
@@ -138,10 +207,20 @@ def record_directory_annotation(
|
|
|
138
207
|
non_child_items, child_bundles = split_child_bundles(items)
|
|
139
208
|
artifact_items, artifact_groups = split_artifact_groups(non_child_items)
|
|
140
209
|
output_dir = argument_path(bound, argument_name=output_dir_arg)
|
|
210
|
+
directory_answers = (
|
|
211
|
+
answer_payloads.load_directory_answers(answers)
|
|
212
|
+
if answers is not None
|
|
213
|
+
else None
|
|
214
|
+
)
|
|
141
215
|
artifacts: list[DocumentedArtifact] = coerce_documented_artifacts(
|
|
142
216
|
artifact_items,
|
|
143
217
|
normalize_paths=False,
|
|
144
218
|
)
|
|
219
|
+
artifacts = _merge_directory_artifacts_from_answers(
|
|
220
|
+
artifacts,
|
|
221
|
+
directory_answers,
|
|
222
|
+
output_dir=output_dir,
|
|
223
|
+
)
|
|
145
224
|
params = extract_params(
|
|
146
225
|
bound,
|
|
147
226
|
target_args=(output_dir_arg,),
|
|
@@ -150,9 +229,10 @@ def record_directory_annotation(
|
|
|
150
229
|
inputs = extract_inputs(bound, input_args=input_args)
|
|
151
230
|
annotate_directory(
|
|
152
231
|
output_dir,
|
|
232
|
+
answers=directory_answers,
|
|
153
233
|
artifacts=artifacts,
|
|
154
|
-
artifact_groups=artifact_groups,
|
|
155
|
-
child_bundles=child_bundles,
|
|
234
|
+
artifact_groups=artifact_groups or None,
|
|
235
|
+
child_bundles=child_bundles or None,
|
|
156
236
|
title=title,
|
|
157
237
|
summary=summary,
|
|
158
238
|
acquisition_context=acquisition_context,
|
|
@@ -161,7 +241,6 @@ def record_directory_annotation(
|
|
|
161
241
|
inputs=inputs,
|
|
162
242
|
function=fn,
|
|
163
243
|
write_readme=write_readme,
|
|
164
|
-
write_schema=write_schema,
|
|
165
244
|
annotation_filename=annotation_filename,
|
|
166
245
|
readme_filename=readme_filename,
|
|
167
246
|
checksum_policy=checksum_policy,
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/writers.py
RENAMED
|
@@ -3,8 +3,8 @@ from pathlib import Path
|
|
|
3
3
|
from typing import Any, Callable
|
|
4
4
|
|
|
5
5
|
from data_annotations.description import (
|
|
6
|
-
ArtifactGroupDescription,
|
|
7
6
|
ArtifactDescription,
|
|
7
|
+
ArtifactGroupDescription,
|
|
8
8
|
DirectoryDescription,
|
|
9
9
|
DocumentedArtifact,
|
|
10
10
|
DocumentedArtifactGroup,
|
|
@@ -16,12 +16,13 @@ from data_annotations.description import (
|
|
|
16
16
|
from data_annotations.provenance import (
|
|
17
17
|
ArtifactKind,
|
|
18
18
|
BaseProvenance,
|
|
19
|
-
ChildBundle,
|
|
20
19
|
ChecksumPolicy,
|
|
20
|
+
ChildBundle,
|
|
21
21
|
ProducedFile,
|
|
22
22
|
)
|
|
23
23
|
from data_annotations.provenance import writers as provenance_writers
|
|
24
24
|
|
|
25
|
+
from . import answers as answer_payloads
|
|
25
26
|
from .models import (
|
|
26
27
|
DirectoryAnnotationDocument,
|
|
27
28
|
DirectoryAnnotationResult,
|
|
@@ -31,6 +32,21 @@ from .models import (
|
|
|
31
32
|
FileArtifactSubject,
|
|
32
33
|
)
|
|
33
34
|
|
|
35
|
+
_PROVENANCE_ANSWER_OVERRIDE_FIELDS = (
|
|
36
|
+
"command",
|
|
37
|
+
"script",
|
|
38
|
+
"script_repo_path",
|
|
39
|
+
"function",
|
|
40
|
+
"git_sha",
|
|
41
|
+
"git_branch",
|
|
42
|
+
"git_dirty",
|
|
43
|
+
"git_remote_name",
|
|
44
|
+
"git_remote_url",
|
|
45
|
+
"git_tags",
|
|
46
|
+
"git_describe",
|
|
47
|
+
"source_code",
|
|
48
|
+
)
|
|
49
|
+
|
|
34
50
|
|
|
35
51
|
def _validated_file_readme_fields(
|
|
36
52
|
*, title: str | None, summary: str | None
|
|
@@ -144,6 +160,121 @@ def _coerce_fields(
|
|
|
144
160
|
return [FieldDefinition.model_validate(field) for field in (fields or [])]
|
|
145
161
|
|
|
146
162
|
|
|
163
|
+
def _target_from_answers(value: str | None) -> Path | None:
|
|
164
|
+
if value is None or not value.strip():
|
|
165
|
+
return None
|
|
166
|
+
return Path(value).expanduser().resolve()
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _resolve_answer_target(
|
|
170
|
+
explicit_target: str | Path | None,
|
|
171
|
+
answers_target: str | None,
|
|
172
|
+
*,
|
|
173
|
+
label: str,
|
|
174
|
+
) -> str | Path:
|
|
175
|
+
explicit_path = (
|
|
176
|
+
Path(explicit_target).expanduser().resolve()
|
|
177
|
+
if explicit_target is not None
|
|
178
|
+
else None
|
|
179
|
+
)
|
|
180
|
+
answers_path = _target_from_answers(answers_target)
|
|
181
|
+
if explicit_path is None and answers_path is None:
|
|
182
|
+
raise ValueError(f"{label} is required unless answers supplies target")
|
|
183
|
+
if (
|
|
184
|
+
explicit_path is not None
|
|
185
|
+
and answers_path is not None
|
|
186
|
+
and explicit_path != answers_path
|
|
187
|
+
):
|
|
188
|
+
raise ValueError(
|
|
189
|
+
f"{label} does not match answers target: {explicit_path} != {answers_path}"
|
|
190
|
+
)
|
|
191
|
+
if explicit_target is not None:
|
|
192
|
+
return explicit_target
|
|
193
|
+
if answers_path is None:
|
|
194
|
+
raise ValueError(f"{label} is required unless answers supplies target")
|
|
195
|
+
return answers_path
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _normalize_answer_git_tags(value: list[str] | None) -> list[str]:
|
|
199
|
+
if value is None:
|
|
200
|
+
return []
|
|
201
|
+
return sorted({tag.strip() for tag in value if tag.strip()})
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _provenance_overrides_from_answers(
|
|
205
|
+
answers: answer_payloads.BaseAnswers | None,
|
|
206
|
+
*,
|
|
207
|
+
function: Callable[..., Any] | None,
|
|
208
|
+
) -> dict[str, Any] | None:
|
|
209
|
+
if answers is None:
|
|
210
|
+
return None
|
|
211
|
+
|
|
212
|
+
provenance = answers.provenance
|
|
213
|
+
explicit_fields = provenance.model_fields_set
|
|
214
|
+
inferred_fields = provenance.runtime_inference_fields()
|
|
215
|
+
overrides: dict[str, Any] = {}
|
|
216
|
+
for field in _PROVENANCE_ANSWER_OVERRIDE_FIELDS:
|
|
217
|
+
if field in inferred_fields or field not in explicit_fields:
|
|
218
|
+
continue
|
|
219
|
+
if field == "function" and function is not None:
|
|
220
|
+
continue
|
|
221
|
+
if field == "command":
|
|
222
|
+
overrides[field] = provenance.command_tokens()
|
|
223
|
+
elif field == "git_tags":
|
|
224
|
+
overrides[field] = _normalize_answer_git_tags(provenance.git_tags)
|
|
225
|
+
else:
|
|
226
|
+
overrides[field] = getattr(provenance, field)
|
|
227
|
+
return overrides or None
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _documented_artifacts_from_answers(
|
|
231
|
+
artifacts: list[answer_payloads.DirectoryArtifactAnswers],
|
|
232
|
+
) -> list[DocumentedArtifact]:
|
|
233
|
+
return [
|
|
234
|
+
DocumentedArtifact(
|
|
235
|
+
path=artifact.path,
|
|
236
|
+
kind=artifact.kind,
|
|
237
|
+
title=artifact.title,
|
|
238
|
+
summary=artifact.summary,
|
|
239
|
+
fields=list(artifact.fields),
|
|
240
|
+
primary_key=list(artifact.primary_key),
|
|
241
|
+
missing_value_codes=dict(artifact.missing_value_codes),
|
|
242
|
+
)
|
|
243
|
+
for artifact in artifacts
|
|
244
|
+
]
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _documented_artifact_groups_from_answers(
|
|
248
|
+
groups: list[answer_payloads.DirectoryArtifactGroupAnswers],
|
|
249
|
+
) -> list[DocumentedArtifactGroup]:
|
|
250
|
+
return [
|
|
251
|
+
DocumentedArtifactGroup(
|
|
252
|
+
title=group.title,
|
|
253
|
+
summary=group.summary,
|
|
254
|
+
kind=group.kind,
|
|
255
|
+
paths=list(group.paths),
|
|
256
|
+
selector=group.selector,
|
|
257
|
+
fields=list(group.fields),
|
|
258
|
+
primary_key=list(group.primary_key),
|
|
259
|
+
missing_value_codes=dict(group.missing_value_codes),
|
|
260
|
+
)
|
|
261
|
+
for group in groups
|
|
262
|
+
]
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _child_bundles_from_answers(
|
|
266
|
+
child_bundles: list[answer_payloads.ChildBundleAnswers],
|
|
267
|
+
) -> list[ChildBundle]:
|
|
268
|
+
return [
|
|
269
|
+
ChildBundle(
|
|
270
|
+
path=child_bundle.path,
|
|
271
|
+
annotation_path=child_bundle.annotation_path,
|
|
272
|
+
content_digest=child_bundle.content_digest,
|
|
273
|
+
)
|
|
274
|
+
for child_bundle in child_bundles
|
|
275
|
+
]
|
|
276
|
+
|
|
277
|
+
|
|
147
278
|
def _build_file_annotation_document(
|
|
148
279
|
artifact_path: str | Path,
|
|
149
280
|
*,
|
|
@@ -382,8 +513,9 @@ def write_directory_annotation(
|
|
|
382
513
|
|
|
383
514
|
|
|
384
515
|
def annotate_file(
|
|
385
|
-
artifact_path: str | Path,
|
|
516
|
+
artifact_path: str | Path | None = None,
|
|
386
517
|
*,
|
|
518
|
+
answers: answer_payloads.FileAnswersInput | None = None,
|
|
387
519
|
title: str | None = None,
|
|
388
520
|
summary: str | None = None,
|
|
389
521
|
fields: list[FieldDefinition] | None = None,
|
|
@@ -391,33 +523,87 @@ def annotate_file(
|
|
|
391
523
|
missing_value_codes: dict[str, str] | None = None,
|
|
392
524
|
acquisition_context: dict[str, Any] | None = None,
|
|
393
525
|
generation_context: dict[str, Any] | None = None,
|
|
394
|
-
artifact_kind: ArtifactKind =
|
|
526
|
+
artifact_kind: ArtifactKind | None = None,
|
|
395
527
|
artifact_sha256: str | None = None,
|
|
396
528
|
params: dict[str, Any] | None = None,
|
|
397
529
|
inputs: Sequence[str | Path] | None = None,
|
|
398
530
|
function: Callable[..., Any] | None = None,
|
|
399
531
|
write_readme: bool = True,
|
|
400
|
-
write_schema: bool | None = None,
|
|
401
532
|
annotation_suffix: str = ".annotation.json",
|
|
402
533
|
readme_suffix: str = ".README.md",
|
|
403
534
|
checksum_policy: ChecksumPolicy = "auto",
|
|
404
535
|
max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
|
|
405
536
|
checksum_overrides: Mapping[str | Path, str] | None = None,
|
|
406
537
|
) -> FileAnnotationResult:
|
|
407
|
-
|
|
538
|
+
file_answers = (
|
|
539
|
+
answer_payloads.load_file_answers(answers) if answers is not None else None
|
|
540
|
+
)
|
|
541
|
+
selected_artifact_path = _resolve_answer_target(
|
|
408
542
|
artifact_path,
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
543
|
+
file_answers.target if file_answers is not None else None,
|
|
544
|
+
label="artifact_path",
|
|
545
|
+
)
|
|
546
|
+
title_value = (
|
|
547
|
+
title if title is not None else (file_answers.title if file_answers else None)
|
|
548
|
+
)
|
|
549
|
+
summary_value = (
|
|
550
|
+
summary
|
|
551
|
+
if summary is not None
|
|
552
|
+
else (file_answers.summary if file_answers else None)
|
|
553
|
+
)
|
|
554
|
+
fields_value = (
|
|
555
|
+
fields
|
|
556
|
+
if fields is not None
|
|
557
|
+
else (list(file_answers.fields) if file_answers else None)
|
|
558
|
+
)
|
|
559
|
+
primary_key_value = (
|
|
560
|
+
primary_key
|
|
561
|
+
if primary_key is not None
|
|
562
|
+
else (list(file_answers.primary_key) if file_answers else None)
|
|
563
|
+
)
|
|
564
|
+
missing_value_codes_value = (
|
|
565
|
+
missing_value_codes
|
|
566
|
+
if missing_value_codes is not None
|
|
567
|
+
else (dict(file_answers.missing_value_codes) if file_answers else None)
|
|
568
|
+
)
|
|
569
|
+
artifact_kind_value = (
|
|
570
|
+
artifact_kind
|
|
571
|
+
if artifact_kind is not None
|
|
572
|
+
else (file_answers.kind if file_answers is not None else "other")
|
|
573
|
+
)
|
|
574
|
+
artifact_sha256_value = (
|
|
575
|
+
artifact_sha256
|
|
576
|
+
if artifact_sha256 is not None
|
|
577
|
+
else (file_answers.sha256 if file_answers else None)
|
|
578
|
+
)
|
|
579
|
+
params_value = (
|
|
580
|
+
params
|
|
581
|
+
if params is not None
|
|
582
|
+
else (dict(file_answers.params) if file_answers else None)
|
|
583
|
+
)
|
|
584
|
+
inputs_value = (
|
|
585
|
+
inputs
|
|
586
|
+
if inputs is not None
|
|
587
|
+
else (list(file_answers.inputs) if file_answers else None)
|
|
588
|
+
)
|
|
589
|
+
document = _build_file_annotation_document(
|
|
590
|
+
selected_artifact_path,
|
|
591
|
+
title=title_value,
|
|
592
|
+
summary=summary_value,
|
|
593
|
+
fields=fields_value,
|
|
594
|
+
primary_key=primary_key_value,
|
|
595
|
+
missing_value_codes=missing_value_codes_value,
|
|
414
596
|
acquisition_context=acquisition_context,
|
|
415
597
|
generation_context=generation_context,
|
|
416
|
-
artifact_kind=
|
|
417
|
-
artifact_sha256=
|
|
418
|
-
params=
|
|
419
|
-
inputs=
|
|
598
|
+
artifact_kind=artifact_kind_value,
|
|
599
|
+
artifact_sha256=artifact_sha256_value,
|
|
600
|
+
params=params_value,
|
|
601
|
+
inputs=inputs_value,
|
|
420
602
|
function=function,
|
|
603
|
+
provenance_overrides=_provenance_overrides_from_answers(
|
|
604
|
+
file_answers,
|
|
605
|
+
function=function,
|
|
606
|
+
),
|
|
421
607
|
checksum_policy=checksum_policy,
|
|
422
608
|
max_checksum_bytes=max_checksum_bytes,
|
|
423
609
|
checksum_overrides=checksum_overrides,
|
|
@@ -430,7 +616,7 @@ def annotate_file(
|
|
|
430
616
|
|
|
431
617
|
readme_path: Path | None = None
|
|
432
618
|
if write_readme:
|
|
433
|
-
_validated_file_readme_fields(title=
|
|
619
|
+
_validated_file_readme_fields(title=title_value, summary=summary_value)
|
|
434
620
|
readme_path = write_file_readme(
|
|
435
621
|
Path(str(artifact_path) + readme_suffix),
|
|
436
622
|
artifact_path=document.subject.path,
|
|
@@ -446,9 +632,10 @@ def annotate_file(
|
|
|
446
632
|
|
|
447
633
|
|
|
448
634
|
def annotate_directory(
|
|
449
|
-
output_dir: str | Path,
|
|
635
|
+
output_dir: str | Path | None = None,
|
|
450
636
|
*,
|
|
451
|
-
|
|
637
|
+
answers: answer_payloads.DirectoryAnswersInput | None = None,
|
|
638
|
+
artifacts: list[DocumentedArtifact] | None = None,
|
|
452
639
|
artifact_groups: list[DocumentedArtifactGroup] | None = None,
|
|
453
640
|
child_bundles: list[ChildBundle] | None = None,
|
|
454
641
|
title: str | None = None,
|
|
@@ -459,28 +646,97 @@ def annotate_directory(
|
|
|
459
646
|
inputs: Sequence[str | Path] | None = None,
|
|
460
647
|
function: Callable[..., Any] | None = None,
|
|
461
648
|
write_readme: bool = True,
|
|
462
|
-
write_schema: bool | None = None,
|
|
463
649
|
annotation_filename: str = "data-annotations.json",
|
|
464
650
|
readme_filename: str = "README.md",
|
|
465
651
|
checksum_policy: ChecksumPolicy = "auto",
|
|
466
652
|
max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
|
|
467
653
|
checksum_overrides: Mapping[str | Path, str] | None = None,
|
|
468
654
|
) -> DirectoryAnnotationResult:
|
|
469
|
-
|
|
655
|
+
directory_answers = (
|
|
656
|
+
answer_payloads.load_directory_answers(answers) if answers is not None else None
|
|
657
|
+
)
|
|
658
|
+
selected_output_dir = _resolve_answer_target(
|
|
470
659
|
output_dir,
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
660
|
+
directory_answers.target if directory_answers is not None else None,
|
|
661
|
+
label="output_dir",
|
|
662
|
+
)
|
|
663
|
+
artifacts_value = (
|
|
664
|
+
artifacts
|
|
665
|
+
if artifacts is not None
|
|
666
|
+
else (
|
|
667
|
+
_documented_artifacts_from_answers(directory_answers.artifacts)
|
|
668
|
+
if directory_answers
|
|
669
|
+
else None
|
|
670
|
+
)
|
|
671
|
+
)
|
|
672
|
+
if artifacts_value is None:
|
|
673
|
+
raise ValueError("artifacts is required unless answers supplies artifacts")
|
|
674
|
+
artifact_groups_value = (
|
|
675
|
+
artifact_groups
|
|
676
|
+
if artifact_groups is not None
|
|
677
|
+
else (
|
|
678
|
+
_documented_artifact_groups_from_answers(directory_answers.artifact_groups)
|
|
679
|
+
if directory_answers
|
|
680
|
+
else None
|
|
681
|
+
)
|
|
682
|
+
)
|
|
683
|
+
child_bundles_value = (
|
|
684
|
+
child_bundles
|
|
685
|
+
if child_bundles is not None
|
|
686
|
+
else (
|
|
687
|
+
_child_bundles_from_answers(directory_answers.child_bundles)
|
|
688
|
+
if directory_answers
|
|
689
|
+
else None
|
|
690
|
+
)
|
|
691
|
+
)
|
|
692
|
+
title_value = (
|
|
693
|
+
title
|
|
694
|
+
if title is not None
|
|
695
|
+
else (directory_answers.title if directory_answers else None)
|
|
696
|
+
)
|
|
697
|
+
summary_value = (
|
|
698
|
+
summary
|
|
699
|
+
if summary is not None
|
|
700
|
+
else (directory_answers.summary if directory_answers else None)
|
|
701
|
+
)
|
|
702
|
+
params_value = (
|
|
703
|
+
params
|
|
704
|
+
if params is not None
|
|
705
|
+
else (dict(directory_answers.params) if directory_answers else None)
|
|
706
|
+
)
|
|
707
|
+
inputs_value = (
|
|
708
|
+
inputs
|
|
709
|
+
if inputs is not None
|
|
710
|
+
else (list(directory_answers.inputs) if directory_answers else None)
|
|
711
|
+
)
|
|
712
|
+
checksum_overrides_value: dict[str | Path, str] | None
|
|
713
|
+
if directory_answers is not None and directory_answers.checksums:
|
|
714
|
+
checksum_overrides_value = dict(directory_answers.checksums)
|
|
715
|
+
if checksum_overrides is not None:
|
|
716
|
+
checksum_overrides_value.update(checksum_overrides)
|
|
717
|
+
else:
|
|
718
|
+
checksum_overrides_value = (
|
|
719
|
+
dict(checksum_overrides) if checksum_overrides is not None else None
|
|
720
|
+
)
|
|
721
|
+
document = _build_directory_annotation_document(
|
|
722
|
+
selected_output_dir,
|
|
723
|
+
artifacts=artifacts_value,
|
|
724
|
+
artifact_groups=artifact_groups_value,
|
|
725
|
+
child_bundles=child_bundles_value,
|
|
726
|
+
title=title_value,
|
|
727
|
+
summary=summary_value,
|
|
476
728
|
acquisition_context=acquisition_context,
|
|
477
729
|
generation_context=generation_context,
|
|
478
|
-
params=
|
|
479
|
-
inputs=
|
|
730
|
+
params=params_value,
|
|
731
|
+
inputs=inputs_value,
|
|
480
732
|
function=function,
|
|
733
|
+
provenance_overrides=_provenance_overrides_from_answers(
|
|
734
|
+
directory_answers,
|
|
735
|
+
function=function,
|
|
736
|
+
),
|
|
481
737
|
checksum_policy=checksum_policy,
|
|
482
738
|
max_checksum_bytes=max_checksum_bytes,
|
|
483
|
-
checksum_overrides=
|
|
739
|
+
checksum_overrides=checksum_overrides_value,
|
|
484
740
|
)
|
|
485
741
|
output_dir = Path(document.subject.path)
|
|
486
742
|
annotation_path = _write_annotation_document(
|
|
@@ -490,7 +746,7 @@ def annotate_directory(
|
|
|
490
746
|
|
|
491
747
|
readme_path: Path | None = None
|
|
492
748
|
if write_readme:
|
|
493
|
-
_validated_directory_readme_fields(title=
|
|
749
|
+
_validated_directory_readme_fields(title=title_value, summary=summary_value)
|
|
494
750
|
readme_path = write_directory_readme(
|
|
495
751
|
output_dir / readme_filename,
|
|
496
752
|
output_dir=document.subject.path,
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/annotate/helpers.py
RENAMED
|
@@ -278,6 +278,11 @@ def _selected_source_code(
|
|
|
278
278
|
source_sha256,
|
|
279
279
|
]
|
|
280
280
|
if not any(value is not None for value in source_values):
|
|
281
|
+
if (
|
|
282
|
+
answers is not None
|
|
283
|
+
and "source_code" in answers.provenance.runtime_inference_fields()
|
|
284
|
+
):
|
|
285
|
+
return None
|
|
281
286
|
return answers.provenance.source_code if answers is not None else None
|
|
282
287
|
|
|
283
288
|
if source_kind is None:
|
|
@@ -297,6 +302,34 @@ def _selected_source_code(
|
|
|
297
302
|
)
|
|
298
303
|
|
|
299
304
|
|
|
305
|
+
def _runtime_inferred_fields(answers: answer_files.BaseAnswers | None) -> set[str]:
|
|
306
|
+
if answers is None:
|
|
307
|
+
return set()
|
|
308
|
+
return answers.provenance.runtime_inference_fields()
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _filter_runtime_inferred_overrides(
|
|
312
|
+
overrides: dict[str, Any],
|
|
313
|
+
*,
|
|
314
|
+
answers: answer_files.BaseAnswers | None,
|
|
315
|
+
explicit_fields: set[str],
|
|
316
|
+
) -> dict[str, Any]:
|
|
317
|
+
inferred_fields = _runtime_inferred_fields(answers)
|
|
318
|
+
if not inferred_fields:
|
|
319
|
+
return overrides
|
|
320
|
+
if "source_code" in inferred_fields:
|
|
321
|
+
inferred_fields = inferred_fields | {
|
|
322
|
+
"git_remote_url",
|
|
323
|
+
"git_sha",
|
|
324
|
+
"script_repo_path",
|
|
325
|
+
}
|
|
326
|
+
return {
|
|
327
|
+
key: value
|
|
328
|
+
for key, value in overrides.items()
|
|
329
|
+
if key not in inferred_fields or key in explicit_fields
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
|
|
300
333
|
def _validate_source_code_git_conflicts(
|
|
301
334
|
source_code: SourceCodeReference | None,
|
|
302
335
|
*,
|
|
@@ -374,6 +407,32 @@ def _collect_post_hoc_provenance_from_sources(
|
|
|
374
407
|
selected_inputs = _selected_inputs(input_values, answers)
|
|
375
408
|
selected_params = _selected_params(param_values, answers)
|
|
376
409
|
answer_command_tokens = _command_tokens_from_answers(answers)
|
|
410
|
+
source_code_cli_values = {
|
|
411
|
+
source_kind,
|
|
412
|
+
source_uri,
|
|
413
|
+
source_download_uri,
|
|
414
|
+
source_path,
|
|
415
|
+
source_revision,
|
|
416
|
+
source_sha256,
|
|
417
|
+
}
|
|
418
|
+
explicit_override_fields = {
|
|
419
|
+
field_name
|
|
420
|
+
for field_name, is_explicit in {
|
|
421
|
+
"script": script is not None,
|
|
422
|
+
"script_repo_path": script_repo_path is not None,
|
|
423
|
+
"command": command is not None,
|
|
424
|
+
"function": function is not None,
|
|
425
|
+
"git_sha": git_sha is not None,
|
|
426
|
+
"git_branch": git_branch is not None,
|
|
427
|
+
"git_remote_name": git_remote_name is not None,
|
|
428
|
+
"git_remote_url": git_remote_url is not None,
|
|
429
|
+
"git_tags": git_tags is not None,
|
|
430
|
+
"git_describe": git_describe is not None,
|
|
431
|
+
"git_dirty": git_dirty is not None,
|
|
432
|
+
"source_code": any(value is not None for value in source_code_cli_values),
|
|
433
|
+
}.items()
|
|
434
|
+
if is_explicit
|
|
435
|
+
}
|
|
377
436
|
selected_command = (
|
|
378
437
|
command
|
|
379
438
|
if command is not None
|
|
@@ -435,7 +494,15 @@ def _collect_post_hoc_provenance_from_sources(
|
|
|
435
494
|
)
|
|
436
495
|
if selected_source_code is not None:
|
|
437
496
|
overrides["source_code"] = selected_source_code
|
|
438
|
-
return
|
|
497
|
+
return (
|
|
498
|
+
inputs,
|
|
499
|
+
params,
|
|
500
|
+
_filter_runtime_inferred_overrides(
|
|
501
|
+
overrides,
|
|
502
|
+
answers=answers,
|
|
503
|
+
explicit_fields=explicit_override_fields,
|
|
504
|
+
),
|
|
505
|
+
)
|
|
439
506
|
|
|
440
507
|
command_tokens = (
|
|
441
508
|
_parse_command_string(command)
|
|
@@ -475,7 +542,15 @@ def _collect_post_hoc_provenance_from_sources(
|
|
|
475
542
|
"git_dirty": _provenance_value(git_dirty, answers, "git_dirty"),
|
|
476
543
|
"source_code": selected_source_code,
|
|
477
544
|
}
|
|
478
|
-
return
|
|
545
|
+
return (
|
|
546
|
+
selected_inputs or [],
|
|
547
|
+
selected_params or {},
|
|
548
|
+
_filter_runtime_inferred_overrides(
|
|
549
|
+
overrides,
|
|
550
|
+
answers=answers,
|
|
551
|
+
explicit_fields=explicit_override_fields,
|
|
552
|
+
),
|
|
553
|
+
)
|
|
479
554
|
|
|
480
555
|
|
|
481
556
|
def _documented_artifacts_from_answers(
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from data_annotations.annotations.answers import (
|
|
2
|
+
AnswersError,
|
|
3
|
+
BaseAnswers,
|
|
4
|
+
ChildBundleAnswers,
|
|
5
|
+
DirectoryAnswers,
|
|
6
|
+
DirectoryAnswersInput,
|
|
7
|
+
DirectoryArtifactAnswers,
|
|
8
|
+
DirectoryArtifactGroupAnswers,
|
|
9
|
+
FileAnswers,
|
|
10
|
+
FileAnswersInput,
|
|
11
|
+
ProvenanceAnswers,
|
|
12
|
+
check_answers,
|
|
13
|
+
load_directory_answers,
|
|
14
|
+
load_file_answers,
|
|
15
|
+
require_complete_directory_answers,
|
|
16
|
+
require_complete_file_answers,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"AnswersError",
|
|
21
|
+
"BaseAnswers",
|
|
22
|
+
"ChildBundleAnswers",
|
|
23
|
+
"DirectoryAnswers",
|
|
24
|
+
"DirectoryAnswersInput",
|
|
25
|
+
"DirectoryArtifactAnswers",
|
|
26
|
+
"DirectoryArtifactGroupAnswers",
|
|
27
|
+
"FileAnswers",
|
|
28
|
+
"FileAnswersInput",
|
|
29
|
+
"ProvenanceAnswers",
|
|
30
|
+
"check_answers",
|
|
31
|
+
"load_directory_answers",
|
|
32
|
+
"load_file_answers",
|
|
33
|
+
"require_complete_directory_answers",
|
|
34
|
+
"require_complete_file_answers",
|
|
35
|
+
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/annotate/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/__init__.py
RENAMED
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/decorators.py
RENAMED
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/models.py
RENAMED
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/writers.py
RENAMED
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/__init__.py
RENAMED
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/decorators.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/chain.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/types.py
RENAMED
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/runtime.py
RENAMED
|
File without changes
|
{data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/writers.py
RENAMED
|
File without changes
|
|
File without changes
|