data-annotations 2.6.0__tar.gz → 2.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {data_annotations-2.6.0 → data_annotations-2.7.0}/PKG-INFO +64 -4
  2. {data_annotations-2.6.0 → data_annotations-2.7.0}/README.md +62 -2
  3. {data_annotations-2.6.0 → data_annotations-2.7.0}/pyproject.toml +3 -3
  4. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/__init__.py +12 -0
  5. {data_annotations-2.6.0/src/data_annotations/cli_app → data_annotations-2.7.0/src/data_annotations/annotations}/answers.py +137 -10
  6. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/decorators.py +86 -7
  7. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/writers.py +285 -29
  8. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/annotate/helpers.py +77 -2
  9. data_annotations-2.7.0/src/data_annotations/cli_app/answers.py +35 -0
  10. {data_annotations-2.6.0 → data_annotations-2.7.0}/LICENSE +0 -0
  11. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/__init__.py +0 -0
  12. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/_decorators.py +0 -0
  13. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/annotations/models.py +0 -0
  14. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli.py +0 -0
  15. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/__init__.py +0 -0
  16. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/annotate/__init__.py +0 -0
  17. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/common.py +0 -0
  18. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/prompts.py +0 -0
  19. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/provenance_commands.py +0 -0
  20. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/cli_app/publish.py +0 -0
  21. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/__init__.py +0 -0
  22. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/decorators.py +0 -0
  23. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/models.py +0 -0
  24. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/description/writers.py +0 -0
  25. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/__init__.py +0 -0
  26. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/decorators.py +0 -0
  27. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/git.py +0 -0
  28. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/models.py +0 -0
  29. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/__init__.py +0 -0
  30. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/chain.py +0 -0
  31. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/manifest.py +0 -0
  32. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/matching.py +0 -0
  33. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/sources.py +0 -0
  34. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/recovery/types.py +0 -0
  35. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/runtime.py +0 -0
  36. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/provenance/writers.py +0 -0
  37. {data_annotations-2.6.0 → data_annotations-2.7.0}/src/data_annotations/publish.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-annotations
3
- Version: 2.6.0
3
+ Version: 2.7.0
4
4
  Summary: Annotate data artifacts with provenance and descriptions
5
5
  Keywords: annotations,data,metadata,provenance,reproducibility
6
6
  Author: Rodrigo C. G. Pena
@@ -18,7 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
18
18
  Classifier: Topic :: Scientific/Engineering
19
19
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
20
  Requires-Dist: pydantic>=2.13.1
21
- Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
21
+ Requires-Dist: pyyaml>=6.0.2
22
22
  Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
23
23
  Requires-Dist: typer>=0.16.0 ; extra == 'cli'
24
24
  Requires-Python: >=3.12
@@ -71,8 +71,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
71
71
  uv add data-annotations
72
72
  ```
73
73
 
74
- The command-line interface uses optional dependencies. Install the package with
75
- CLI support when you want to run `data-annotations` commands:
74
+ The command-line interface uses optional dependencies for prompting and command
75
+ parsing. Install the package with CLI support when you want to run
76
+ `data-annotations` commands:
76
77
 
77
78
  ```bash
78
79
  pip install "data-annotations[cli]"
@@ -378,6 +379,39 @@ metadata to vary per call instead of staying fixed at decoration time, use
378
379
  `write_directory_annotation(...)` directly instead. See the example gallery in
379
380
  `examples/` for runnable examples of all approaches.
380
381
 
382
+ The Python API can also load the same YAML answers payloads used by the
383
+ CLI:
384
+
385
+ ```python
386
+ from data_annotations.annotations import (
387
+ annotate_directory,
388
+ annotate_file,
389
+ record_file_annotation,
390
+ )
391
+
392
+ annotate_file(answers="participants.yaml")
393
+ annotate_directory(answers="run-001.yaml")
394
+
395
+ annotate_file(
396
+ "outputs/summary.txt",
397
+ answers={"title": "Run Summary", "summary": "Validation run summary."},
398
+ )
399
+
400
+ @record_file_annotation(answers="participants.yaml")
401
+ def write_participants(artifact_path, input_path):
402
+     ...
403
+ ```
404
+
405
+ If an answers payload includes `target`, the positional artifact path or directory
406
+ may be omitted. When both are provided, they must resolve to the same path.
407
+ Explicit Python keyword arguments override values from `answers`. Environment
408
+ variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
409
+ values in both YAML files and mapping payloads.
410
+
411
+ For directory decorators, the wrapped function still provides the produced output
412
+ inventory. Matching `answers.artifacts` entries can supply titles, summaries,
413
+ kinds, fields, primary keys, and missing-value codes for those returned paths.
414
+
381
415
  ### When To Use Decorators Vs Direct Functions
382
416
 
383
417
  If a function is only a final serializer for already-prepared data, prefer the
@@ -772,6 +806,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
772
806
  `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
773
807
  and `provenance.params`.
774
808
 
809
+ Answers can request selected provenance fields from the current runtime instead
810
+ of taking them from the payload:
811
+
812
+ ```yaml
813
+ target: path/to/run-001
814
+ title: Processing outputs
815
+ summary: Files produced by the shell processing workflow.
816
+
817
+ provenance:
818
+   command: bash generate_some_data_artifact.sh
819
+   script: generate_some_data_artifact.sh
820
+   infer_from_runtime:
821
+     - runtime
822
+     - git
823
+     - source_code
824
+ ```
825
+
826
+ `runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
827
+ covers Git commit, branch, dirty state, remote, tags, and `git describe`.
828
+ `source_code` leaves the source-code reference to be derived from runtime Git metadata.
829
+ This is especially useful for timestamps, host/user and SLURM context, Git state,
830
+ and derived `source_code`. Provide generation `command` and `script` explicitly
831
+ in CLI answers files, because the runtime command and script would describe the
832
+ `data-annotations annotate ...` invocation rather than the script that generated
833
+ the artifact.
834
+
775
835
  For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
776
836
  `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
777
837
  sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
@@ -41,8 +41,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
41
41
  uv add data-annotations
42
42
  ```
43
43
 
44
- The command-line interface uses optional dependencies. Install the package with
45
- CLI support when you want to run `data-annotations` commands:
44
+ The command-line interface uses optional dependencies for prompting and command
45
+ parsing. Install the package with CLI support when you want to run
46
+ `data-annotations` commands:
46
47
 
47
48
  ```bash
48
49
  pip install "data-annotations[cli]"
@@ -348,6 +349,39 @@ metadata to vary per call instead of staying fixed at decoration time, use
348
349
  `write_directory_annotation(...)` directly instead. See the example gallery in
349
350
  `examples/` for runnable examples of all approaches.
350
351
 
352
+ The Python API can also load the same YAML answers payloads used by the
353
+ CLI:
354
+
355
+ ```python
356
+ from data_annotations.annotations import (
357
+ annotate_directory,
358
+ annotate_file,
359
+ record_file_annotation,
360
+ )
361
+
362
+ annotate_file(answers="participants.yaml")
363
+ annotate_directory(answers="run-001.yaml")
364
+
365
+ annotate_file(
366
+ "outputs/summary.txt",
367
+ answers={"title": "Run Summary", "summary": "Validation run summary."},
368
+ )
369
+
370
+ @record_file_annotation(answers="participants.yaml")
371
+ def write_participants(artifact_path, input_path):
372
+     ...
373
+ ```
374
+
375
+ If an answers payload includes `target`, the positional artifact path or directory
376
+ may be omitted. When both are provided, they must resolve to the same path.
377
+ Explicit Python keyword arguments override values from `answers`. Environment
378
+ variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
379
+ values in both YAML files and mapping payloads.
380
+
381
+ For directory decorators, the wrapped function still provides the produced output
382
+ inventory. Matching `answers.artifacts` entries can supply titles, summaries,
383
+ kinds, fields, primary keys, and missing-value codes for those returned paths.
384
+
351
385
  ### When To Use Decorators Vs Direct Functions
352
386
 
353
387
  If a function is only a final serializer for already-prepared data, prefer the
@@ -742,6 +776,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
742
776
  `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
743
777
  and `provenance.params`.
744
778
 
779
+ Answers can request selected provenance fields from the current runtime instead
780
+ of taking them from the payload:
781
+
782
+ ```yaml
783
+ target: path/to/run-001
784
+ title: Processing outputs
785
+ summary: Files produced by the shell processing workflow.
786
+
787
+ provenance:
788
+   command: bash generate_some_data_artifact.sh
789
+   script: generate_some_data_artifact.sh
790
+   infer_from_runtime:
791
+     - runtime
792
+     - git
793
+     - source_code
794
+ ```
795
+
796
+ `runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
797
+ covers Git commit, branch, dirty state, remote, tags, and `git describe`.
798
+ `source_code` leaves the source-code reference to be derived from runtime Git metadata.
799
+ This is especially useful for timestamps, host/user and SLURM context, Git state,
800
+ and derived `source_code`. Provide generation `command` and `script` explicitly
801
+ in CLI answers files, because the runtime command and script would describe the
802
+ `data-annotations annotate ...` invocation rather than the script that generated
803
+ the artifact.
804
+
745
805
  For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
746
806
  `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
747
807
  sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data-annotations"
3
- version = "2.6.0"
3
+ version = "2.7.0"
4
4
  description = "Annotate data artifacts with provenance and descriptions"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -9,7 +9,7 @@ authors = [
9
9
  license = "BSD-3-Clause"
10
10
  license-files = ["LICENSE"]
11
11
  requires-python = ">=3.12"
12
- dependencies = ["pydantic>=2.13.1"]
12
+ dependencies = ["pydantic>=2.13.1", "PyYAML>=6.0.2"]
13
13
  keywords = ["annotations", "data", "metadata", "provenance", "reproducibility"]
14
14
  classifiers = [
15
15
  "Development Status :: 4 - Beta",
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
30
30
  Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
31
31
 
32
32
  [project.optional-dependencies]
33
- cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
33
+ cli = ["questionary>=2.1.1", "typer>=0.16.0"]
34
34
 
35
35
  [project.scripts]
36
36
  data-annotations = "data_annotations.cli:main"
@@ -1,3 +1,10 @@
1
+ from .answers import (
2
+ AnswersError,
3
+ DirectoryAnswers,
4
+ FileAnswers,
5
+ load_directory_answers,
6
+ load_file_answers,
7
+ )
1
8
  from .models import (
2
9
  DirectoryAnnotationDocument,
3
10
  DirectoryAnnotationResult,
@@ -17,13 +24,18 @@ from .writers import (
17
24
  __all__ = [
18
25
  "annotate_directory",
19
26
  "annotate_file",
27
+ "load_directory_answers",
28
+ "load_file_answers",
20
29
  "record_directory_annotation",
21
30
  "record_file_annotation",
22
31
  "write_directory_annotation",
23
32
  "write_file_annotation",
33
+ "AnswersError",
34
+ "DirectoryAnswers",
24
35
  "DirectoryAnnotationDocument",
25
36
  "DirectoryAnnotationResult",
26
37
  "DirectoryArtifactSubject",
38
+ "FileAnswers",
27
39
  "FileAnnotationDocument",
28
40
  "FileAnnotationResult",
29
41
  "FileArtifactSubject",
@@ -1,18 +1,20 @@
1
1
  import os
2
2
  import re
3
3
  import shlex
4
+ from collections.abc import Mapping
4
5
  from pathlib import Path
5
- from typing import Any, Literal
6
+ from typing import Any, Literal, TypeAlias
6
7
 
7
8
  import yaml
8
9
  from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator
10
+ from pydantic import model_validator
9
11
 
10
12
  from data_annotations.description import FieldDefinition
11
13
  from data_annotations.provenance.models import ArtifactKind, SourceCodeReference
12
14
 
13
15
 
14
16
  class AnswersError(ValueError):
15
- """Raised when a CLI answers file cannot be used."""
17
+ """Raised when an annotation answers payload cannot be used."""
16
18
 
17
19
 
18
20
  _ENV_VAR_PATTERN = re.compile(
@@ -79,6 +81,46 @@ _PROVENANCE_KEYS = {
79
81
  "git_tags",
80
82
  "git_describe",
81
83
  "source_code",
84
+ "infer_from_runtime",
85
+ }
86
+
87
+ _RUNTIME_CONTEXT_FIELDS = {
88
+ "created_at",
89
+ "hostname",
90
+ "username",
91
+ "slurm_job_id",
92
+ }
93
+ _GIT_RUNTIME_FIELDS = {
94
+ "git_sha",
95
+ "git_branch",
96
+ "git_dirty",
97
+ "git_remote_name",
98
+ "git_remote_url",
99
+ "git_tags",
100
+ "git_describe",
101
+ }
102
+ _RUNTIME_INFERENCE_GROUPS = {
103
+ "runtime": _RUNTIME_CONTEXT_FIELDS,
104
+ "git": _GIT_RUNTIME_FIELDS,
105
+ }
106
+ _RUNTIME_INFERENCE_FIELDS = (
107
+ _RUNTIME_CONTEXT_FIELDS
108
+ | _GIT_RUNTIME_FIELDS
109
+ | set(_RUNTIME_INFERENCE_GROUPS)
110
+ | {"source_code"}
111
+ )
112
+ _UNSUPPORTED_RUNTIME_INFERENCE_FIELDS = {
113
+ "command",
114
+ "script",
115
+ "script_repo_path",
116
+ }
117
+ _EXPLICIT_PROVENANCE_OVERRIDE_FIELDS = {
118
+ "command",
119
+ "script",
120
+ "script_repo_path",
121
+ "function",
122
+ *_GIT_RUNTIME_FIELDS,
123
+ "source_code",
82
124
  }
83
125
 
84
126
 
@@ -97,6 +139,50 @@ class ProvenanceAnswers(BaseModel):
97
139
  git_tags: list[str] | None = None
98
140
  git_describe: str | None = None
99
141
  source_code: SourceCodeReference | None = None
142
+ infer_from_runtime: list[str] = Field(default_factory=list)
143
+
144
+ @field_validator("infer_from_runtime", mode="before")
145
+ @classmethod
146
+ def _coerce_runtime_inference_fields(cls, value: Any) -> Any:
147
+ if value is None:
148
+ return []
149
+ if isinstance(value, str):
150
+ return [value]
151
+ return value
152
+
153
+ @field_validator("infer_from_runtime")
154
+ @classmethod
155
+ def _validate_runtime_inference_fields(cls, values: list[str]) -> list[str]:
156
+ normalized: list[str] = []
157
+ for value in values:
158
+ if value in _UNSUPPORTED_RUNTIME_INFERENCE_FIELDS:
159
+ raise ValueError(
160
+ "runtime inference is not supported for "
161
+ f"provenance.{value}; provide it explicitly"
162
+ )
163
+ if value not in _RUNTIME_INFERENCE_FIELDS:
164
+ allowed = sorted(_RUNTIME_INFERENCE_FIELDS)
165
+ raise ValueError(
166
+ f"unknown runtime inference field {value!r}; "
167
+ "expected one of: " + ", ".join(allowed)
168
+ )
169
+ if value not in normalized:
170
+ normalized.append(value)
171
+ return normalized
172
+
173
+ @model_validator(mode="after")
174
+ def _validate_runtime_inference_conflicts(self) -> "ProvenanceAnswers":
175
+ inferred = self.runtime_inference_fields()
176
+ conflicts = sorted(
177
+ field
178
+ for field in inferred & _EXPLICIT_PROVENANCE_OVERRIDE_FIELDS
179
+ if field in self.model_fields_set
180
+ )
181
+ if conflicts:
182
+ raise ValueError(
183
+ "cannot both set and infer provenance field(s): " + ", ".join(conflicts)
184
+ )
185
+ return self
100
186
 
101
187
  def command_tokens(self) -> list[str] | None:
102
188
  if self.command is None:
@@ -108,6 +194,12 @@ class ProvenanceAnswers(BaseModel):
108
194
  except ValueError as exc:
109
195
  raise AnswersError(f"invalid provenance.command: {exc}") from exc
110
196
 
197
+ def runtime_inference_fields(self) -> set[str]:
198
+ fields: set[str] = set()
199
+ for field in self.infer_from_runtime:
200
+ fields.update(_RUNTIME_INFERENCE_GROUPS.get(field, {field}))
201
+ return fields
202
+
111
203
 
112
204
  class BaseAnswers(BaseModel):
113
205
  model_config = ConfigDict(extra="forbid")
@@ -177,12 +269,16 @@ class DirectoryAnswers(BaseAnswers):
177
269
  checksums: dict[str, str] = Field(default_factory=dict)
178
270
 
179
271
 
180
- def load_file_answers(path: str | Path) -> FileAnswers:
181
- return _validate_answers(path, mode="file")
272
+ FileAnswersInput: TypeAlias = str | Path | Mapping[str, Any] | FileAnswers
273
+ DirectoryAnswersInput: TypeAlias = str | Path | Mapping[str, Any] | DirectoryAnswers
274
+
275
+
276
+ def load_file_answers(source: FileAnswersInput) -> FileAnswers:
277
+ return _validate_answers(source, mode="file")
182
278
 
183
279
 
184
- def load_directory_answers(path: str | Path) -> DirectoryAnswers:
185
- return _validate_answers(path, mode="directory")
280
+ def load_directory_answers(source: DirectoryAnswersInput) -> DirectoryAnswers:
281
+ return _validate_answers(source, mode="directory")
186
282
 
187
283
 
188
284
  def check_answers(path: str | Path) -> tuple[Literal["file", "directory"], Path]:
@@ -230,17 +326,29 @@ def require_complete_directory_answers(
230
326
 
231
327
 
232
328
  def _validate_answers(
233
- path: str | Path,
329
+ source: FileAnswersInput | DirectoryAnswersInput,
234
330
  *,
235
331
  mode: Literal["file", "directory"],
236
332
  ) -> Any:
237
- normalized = _normalize_answers(_load_raw_answers(path))
333
+ if mode == "file" and isinstance(source, FileAnswers):
334
+ return source
335
+ if mode == "directory" and isinstance(source, DirectoryAnswers):
336
+ return source
337
+
338
+ normalized = _normalize_answers(_load_raw_answers(source))
238
339
  model = FileAnswers if mode == "file" else DirectoryAnswers
239
340
  return _model_validate(model, normalized)
240
341
 
241
342
 
242
- def _load_raw_answers(path: str | Path) -> dict[str, Any]:
243
- answers_path = Path(path).expanduser()
343
+ def _load_raw_answers(
344
+ source: str | Path | Mapping[str, Any] | BaseAnswers,
345
+ ) -> dict[str, Any]:
346
+ if isinstance(source, BaseAnswers):
347
+ return source.model_dump()
348
+ if isinstance(source, Mapping):
349
+ return _expand_env_vars(dict(source), path="$")
350
+
351
+ answers_path = Path(source).expanduser()
244
352
  if not answers_path.is_file():
245
353
  raise AnswersError(f"answers file not found: {answers_path}")
246
354
  try:
@@ -416,3 +524,22 @@ def _missing_required_common_fields(
416
524
 
417
525
  def _has_text(value: Any) -> bool:
418
526
  return isinstance(value, str) and bool(value.strip())
527
+
528
+
529
+ __all__ = [
530
+ "AnswersError",
531
+ "BaseAnswers",
532
+ "ChildBundleAnswers",
533
+ "DirectoryAnswers",
534
+ "DirectoryAnswersInput",
535
+ "DirectoryArtifactAnswers",
536
+ "DirectoryArtifactGroupAnswers",
537
+ "FileAnswers",
538
+ "FileAnswersInput",
539
+ "ProvenanceAnswers",
540
+ "check_answers",
541
+ "load_directory_answers",
542
+ "load_file_answers",
543
+ "require_complete_directory_answers",
544
+ "require_complete_file_answers",
545
+ ]
@@ -17,11 +17,81 @@ from data_annotations.description.models import DocumentedArtifact, FieldDefinit
17
17
  from data_annotations.provenance import writers as provenance_writers
18
18
  from data_annotations.provenance.models import ArtifactKind, ChecksumPolicy
19
19
 
20
+ from . import answers as answer_payloads
20
21
  from .writers import annotate_directory, annotate_file
21
22
 
22
23
 
24
+ def _documented_artifact_from_answer(
25
+ artifact: answer_payloads.DirectoryArtifactAnswers,
26
+ ) -> DocumentedArtifact:
27
+ return DocumentedArtifact(
28
+ path=artifact.path,
29
+ kind=artifact.kind,
30
+ title=artifact.title,
31
+ summary=artifact.summary,
32
+ fields=list(artifact.fields),
33
+ primary_key=list(artifact.primary_key),
34
+ missing_value_codes=dict(artifact.missing_value_codes),
35
+ )
36
+
37
+
38
+ def _directory_relative_label(path: str | Path, output_dir: Path) -> str:
39
+ resolved_path = provenance_writers._resolve_directory_member_path(path, output_dir)
40
+ return provenance_writers._directory_relative_label(resolved_path, output_dir)
41
+
42
+
43
+ def _merge_artifact_answer(
44
+ artifact: DocumentedArtifact,
45
+ answer: answer_payloads.DirectoryArtifactAnswers,
46
+ ) -> DocumentedArtifact:
47
+ updates: dict[str, Any] = {}
48
+ if "kind" in answer.model_fields_set:
49
+ updates["kind"] = answer.kind
50
+ if "title" in answer.model_fields_set:
51
+ updates["title"] = answer.title
52
+ if "summary" in answer.model_fields_set:
53
+ updates["summary"] = answer.summary
54
+ if "fields" in answer.model_fields_set:
55
+ updates["fields"] = list(answer.fields)
56
+ if "primary_key" in answer.model_fields_set:
57
+ updates["primary_key"] = list(answer.primary_key)
58
+ if "missing_value_codes" in answer.model_fields_set:
59
+ updates["missing_value_codes"] = dict(answer.missing_value_codes)
60
+ return artifact.model_copy(update=updates)
61
+
62
+
63
+ def _merge_directory_artifacts_from_answers(
64
+ artifacts: list[DocumentedArtifact],
65
+ answers: answer_payloads.DirectoryAnswers | None,
66
+ *,
67
+ output_dir: Path,
68
+ ) -> list[DocumentedArtifact]:
69
+ if answers is None or not answers.artifacts:
70
+ return artifacts
71
+
72
+ answer_by_path = {
73
+ _directory_relative_label(answer.path, output_dir): answer
74
+ for answer in answers.artifacts
75
+ }
76
+ if not artifacts:
77
+ return [
78
+ _documented_artifact_from_answer(answer) for answer in answers.artifacts
79
+ ]
80
+
81
+ merged: list[DocumentedArtifact] = []
82
+ for artifact in artifacts:
83
+ answer = answer_by_path.get(
84
+ _directory_relative_label(artifact.path, output_dir)
85
+ )
86
+ merged.append(
87
+ _merge_artifact_answer(artifact, answer) if answer is not None else artifact
88
+ )
89
+ return merged
90
+
91
+
23
92
  def record_file_annotation(
24
93
  *,
94
+ answers: answer_payloads.FileAnswersInput | None = None,
25
95
  artifact_path_arg: str = "artifact_path",
26
96
  input_args: tuple[str, ...] = DEFAULT_INPUT_ARGS,
27
97
  title: str | None = None,
@@ -31,10 +101,9 @@ def record_file_annotation(
31
101
  missing_value_codes: dict[str, str] | None = None,
32
102
  acquisition_context: dict[str, Any] | None = None,
33
103
  generation_context: dict[str, Any] | None = None,
34
- artifact_kind: ArtifactKind = "other",
104
+ artifact_kind: ArtifactKind | None = None,
35
105
  artifact_sha256: str | None = None,
36
106
  write_readme: bool = True,
37
- write_schema: bool | None = None,
38
107
  annotation_suffix: str = ".annotation.json",
39
108
  readme_suffix: str = ".README.md",
40
109
  checksum_policy: ChecksumPolicy = "auto",
@@ -67,6 +136,7 @@ def record_file_annotation(
67
136
  inputs = extract_inputs(bound, input_args=input_args)
68
137
  annotate_file(
69
138
  artifact_path,
139
+ answers=answers,
70
140
  title=title,
71
141
  summary=summary,
72
142
  fields=fields,
@@ -80,7 +150,6 @@ def record_file_annotation(
80
150
  inputs=inputs,
81
151
  function=fn,
82
152
  write_readme=write_readme,
83
- write_schema=write_schema,
84
153
  annotation_suffix=annotation_suffix,
85
154
  readme_suffix=readme_suffix,
86
155
  checksum_policy=checksum_policy,
@@ -96,6 +165,7 @@ def record_file_annotation(
96
165
 
97
166
  def record_directory_annotation(
98
167
  *,
168
+ answers: answer_payloads.DirectoryAnswersInput | None = None,
99
169
  output_dir_arg: str = "output_dir",
100
170
  input_args: tuple[str, ...] = DEFAULT_INPUT_ARGS,
101
171
  title: str | None = None,
@@ -103,7 +173,6 @@ def record_directory_annotation(
103
173
  acquisition_context: dict[str, Any] | None = None,
104
174
  generation_context: dict[str, Any] | None = None,
105
175
  write_readme: bool = True,
106
- write_schema: bool | None = None,
107
176
  annotation_filename: str = "data-annotations.json",
108
177
  readme_filename: str = "README.md",
109
178
  checksum_policy: ChecksumPolicy = "auto",
@@ -138,10 +207,20 @@ def record_directory_annotation(
138
207
  non_child_items, child_bundles = split_child_bundles(items)
139
208
  artifact_items, artifact_groups = split_artifact_groups(non_child_items)
140
209
  output_dir = argument_path(bound, argument_name=output_dir_arg)
210
+ directory_answers = (
211
+ answer_payloads.load_directory_answers(answers)
212
+ if answers is not None
213
+ else None
214
+ )
141
215
  artifacts: list[DocumentedArtifact] = coerce_documented_artifacts(
142
216
  artifact_items,
143
217
  normalize_paths=False,
144
218
  )
219
+ artifacts = _merge_directory_artifacts_from_answers(
220
+ artifacts,
221
+ directory_answers,
222
+ output_dir=output_dir,
223
+ )
145
224
  params = extract_params(
146
225
  bound,
147
226
  target_args=(output_dir_arg,),
@@ -150,9 +229,10 @@ def record_directory_annotation(
150
229
  inputs = extract_inputs(bound, input_args=input_args)
151
230
  annotate_directory(
152
231
  output_dir,
232
+ answers=directory_answers,
153
233
  artifacts=artifacts,
154
- artifact_groups=artifact_groups,
155
- child_bundles=child_bundles,
234
+ artifact_groups=artifact_groups or None,
235
+ child_bundles=child_bundles or None,
156
236
  title=title,
157
237
  summary=summary,
158
238
  acquisition_context=acquisition_context,
@@ -161,7 +241,6 @@ def record_directory_annotation(
161
241
  inputs=inputs,
162
242
  function=fn,
163
243
  write_readme=write_readme,
164
- write_schema=write_schema,
165
244
  annotation_filename=annotation_filename,
166
245
  readme_filename=readme_filename,
167
246
  checksum_policy=checksum_policy,
@@ -3,8 +3,8 @@ from pathlib import Path
3
3
  from typing import Any, Callable
4
4
 
5
5
  from data_annotations.description import (
6
- ArtifactGroupDescription,
7
6
  ArtifactDescription,
7
+ ArtifactGroupDescription,
8
8
  DirectoryDescription,
9
9
  DocumentedArtifact,
10
10
  DocumentedArtifactGroup,
@@ -16,12 +16,13 @@ from data_annotations.description import (
16
16
  from data_annotations.provenance import (
17
17
  ArtifactKind,
18
18
  BaseProvenance,
19
- ChildBundle,
20
19
  ChecksumPolicy,
20
+ ChildBundle,
21
21
  ProducedFile,
22
22
  )
23
23
  from data_annotations.provenance import writers as provenance_writers
24
24
 
25
+ from . import answers as answer_payloads
25
26
  from .models import (
26
27
  DirectoryAnnotationDocument,
27
28
  DirectoryAnnotationResult,
@@ -31,6 +32,21 @@ from .models import (
31
32
  FileArtifactSubject,
32
33
  )
33
34
 
35
+ _PROVENANCE_ANSWER_OVERRIDE_FIELDS = (
36
+ "command",
37
+ "script",
38
+ "script_repo_path",
39
+ "function",
40
+ "git_sha",
41
+ "git_branch",
42
+ "git_dirty",
43
+ "git_remote_name",
44
+ "git_remote_url",
45
+ "git_tags",
46
+ "git_describe",
47
+ "source_code",
48
+ )
49
+
34
50
 
35
51
  def _validated_file_readme_fields(
36
52
  *, title: str | None, summary: str | None
@@ -144,6 +160,121 @@ def _coerce_fields(
144
160
  return [FieldDefinition.model_validate(field) for field in (fields or [])]
145
161
 
146
162
 
163
+ def _target_from_answers(value: str | None) -> Path | None:
164
+ if value is None or not value.strip():
165
+ return None
166
+ return Path(value).expanduser().resolve()
167
+
168
+
169
+ def _resolve_answer_target(
170
+ explicit_target: str | Path | None,
171
+ answers_target: str | None,
172
+ *,
173
+ label: str,
174
+ ) -> str | Path:
175
+ explicit_path = (
176
+ Path(explicit_target).expanduser().resolve()
177
+ if explicit_target is not None
178
+ else None
179
+ )
180
+ answers_path = _target_from_answers(answers_target)
181
+ if explicit_path is None and answers_path is None:
182
+ raise ValueError(f"{label} is required unless answers supplies target")
183
+ if (
184
+ explicit_path is not None
185
+ and answers_path is not None
186
+ and explicit_path != answers_path
187
+ ):
188
+ raise ValueError(
189
+ f"{label} does not match answers target: {explicit_path} != {answers_path}"
190
+ )
191
+ if explicit_target is not None:
192
+ return explicit_target
193
+ if answers_path is None:
194
+ raise ValueError(f"{label} is required unless answers supplies target")
195
+ return answers_path
196
+
197
+
198
+ def _normalize_answer_git_tags(value: list[str] | None) -> list[str]:
199
+ if value is None:
200
+ return []
201
+ return sorted({tag.strip() for tag in value if tag.strip()})
202
+
203
+
204
+ def _provenance_overrides_from_answers(
205
+ answers: answer_payloads.BaseAnswers | None,
206
+ *,
207
+ function: Callable[..., Any] | None,
208
+ ) -> dict[str, Any] | None:
209
+ if answers is None:
210
+ return None
211
+
212
+ provenance = answers.provenance
213
+ explicit_fields = provenance.model_fields_set
214
+ inferred_fields = provenance.runtime_inference_fields()
215
+ overrides: dict[str, Any] = {}
216
+ for field in _PROVENANCE_ANSWER_OVERRIDE_FIELDS:
217
+ if field in inferred_fields or field not in explicit_fields:
218
+ continue
219
+ if field == "function" and function is not None:
220
+ continue
221
+ if field == "command":
222
+ overrides[field] = provenance.command_tokens()
223
+ elif field == "git_tags":
224
+ overrides[field] = _normalize_answer_git_tags(provenance.git_tags)
225
+ else:
226
+ overrides[field] = getattr(provenance, field)
227
+ return overrides or None
228
+
229
+
230
+ def _documented_artifacts_from_answers(
231
+ artifacts: list[answer_payloads.DirectoryArtifactAnswers],
232
+ ) -> list[DocumentedArtifact]:
233
+ return [
234
+ DocumentedArtifact(
235
+ path=artifact.path,
236
+ kind=artifact.kind,
237
+ title=artifact.title,
238
+ summary=artifact.summary,
239
+ fields=list(artifact.fields),
240
+ primary_key=list(artifact.primary_key),
241
+ missing_value_codes=dict(artifact.missing_value_codes),
242
+ )
243
+ for artifact in artifacts
244
+ ]
245
+
246
+
247
+ def _documented_artifact_groups_from_answers(
248
+ groups: list[answer_payloads.DirectoryArtifactGroupAnswers],
249
+ ) -> list[DocumentedArtifactGroup]:
250
+ return [
251
+ DocumentedArtifactGroup(
252
+ title=group.title,
253
+ summary=group.summary,
254
+ kind=group.kind,
255
+ paths=list(group.paths),
256
+ selector=group.selector,
257
+ fields=list(group.fields),
258
+ primary_key=list(group.primary_key),
259
+ missing_value_codes=dict(group.missing_value_codes),
260
+ )
261
+ for group in groups
262
+ ]
263
+
264
+
265
+ def _child_bundles_from_answers(
266
+ child_bundles: list[answer_payloads.ChildBundleAnswers],
267
+ ) -> list[ChildBundle]:
268
+ return [
269
+ ChildBundle(
270
+ path=child_bundle.path,
271
+ annotation_path=child_bundle.annotation_path,
272
+ content_digest=child_bundle.content_digest,
273
+ )
274
+ for child_bundle in child_bundles
275
+ ]
276
+
277
+
147
278
  def _build_file_annotation_document(
148
279
  artifact_path: str | Path,
149
280
  *,
@@ -382,8 +513,9 @@ def write_directory_annotation(
382
513
 
383
514
 
384
515
  def annotate_file(
385
- artifact_path: str | Path,
516
+ artifact_path: str | Path | None = None,
386
517
  *,
518
+ answers: answer_payloads.FileAnswersInput | None = None,
387
519
  title: str | None = None,
388
520
  summary: str | None = None,
389
521
  fields: list[FieldDefinition] | None = None,
@@ -391,33 +523,87 @@ def annotate_file(
391
523
  missing_value_codes: dict[str, str] | None = None,
392
524
  acquisition_context: dict[str, Any] | None = None,
393
525
  generation_context: dict[str, Any] | None = None,
394
- artifact_kind: ArtifactKind = "other",
526
+ artifact_kind: ArtifactKind | None = None,
395
527
  artifact_sha256: str | None = None,
396
528
  params: dict[str, Any] | None = None,
397
529
  inputs: Sequence[str | Path] | None = None,
398
530
  function: Callable[..., Any] | None = None,
399
531
  write_readme: bool = True,
400
- write_schema: bool | None = None,
401
532
  annotation_suffix: str = ".annotation.json",
402
533
  readme_suffix: str = ".README.md",
403
534
  checksum_policy: ChecksumPolicy = "auto",
404
535
  max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
405
536
  checksum_overrides: Mapping[str | Path, str] | None = None,
406
537
  ) -> FileAnnotationResult:
407
- document = _build_file_annotation_document(
538
+ file_answers = (
539
+ answer_payloads.load_file_answers(answers) if answers is not None else None
540
+ )
541
+ selected_artifact_path = _resolve_answer_target(
408
542
  artifact_path,
409
- title=title,
410
- summary=summary,
411
- fields=fields,
412
- primary_key=primary_key,
413
- missing_value_codes=missing_value_codes,
543
+ file_answers.target if file_answers is not None else None,
544
+ label="artifact_path",
545
+ )
546
+ title_value = (
547
+ title if title is not None else (file_answers.title if file_answers else None)
548
+ )
549
+ summary_value = (
550
+ summary
551
+ if summary is not None
552
+ else (file_answers.summary if file_answers else None)
553
+ )
554
+ fields_value = (
555
+ fields
556
+ if fields is not None
557
+ else (list(file_answers.fields) if file_answers else None)
558
+ )
559
+ primary_key_value = (
560
+ primary_key
561
+ if primary_key is not None
562
+ else (list(file_answers.primary_key) if file_answers else None)
563
+ )
564
+ missing_value_codes_value = (
565
+ missing_value_codes
566
+ if missing_value_codes is not None
567
+ else (dict(file_answers.missing_value_codes) if file_answers else None)
568
+ )
569
+ artifact_kind_value = (
570
+ artifact_kind
571
+ if artifact_kind is not None
572
+ else (file_answers.kind if file_answers is not None else "other")
573
+ )
574
+ artifact_sha256_value = (
575
+ artifact_sha256
576
+ if artifact_sha256 is not None
577
+ else (file_answers.sha256 if file_answers else None)
578
+ )
579
+ params_value = (
580
+ params
581
+ if params is not None
582
+ else (dict(file_answers.params) if file_answers else None)
583
+ )
584
+ inputs_value = (
585
+ inputs
586
+ if inputs is not None
587
+ else (list(file_answers.inputs) if file_answers else None)
588
+ )
589
+ document = _build_file_annotation_document(
590
+ selected_artifact_path,
591
+ title=title_value,
592
+ summary=summary_value,
593
+ fields=fields_value,
594
+ primary_key=primary_key_value,
595
+ missing_value_codes=missing_value_codes_value,
414
596
  acquisition_context=acquisition_context,
415
597
  generation_context=generation_context,
416
- artifact_kind=artifact_kind,
417
- artifact_sha256=artifact_sha256,
418
- params=params,
419
- inputs=inputs,
598
+ artifact_kind=artifact_kind_value,
599
+ artifact_sha256=artifact_sha256_value,
600
+ params=params_value,
601
+ inputs=inputs_value,
420
602
  function=function,
603
+ provenance_overrides=_provenance_overrides_from_answers(
604
+ file_answers,
605
+ function=function,
606
+ ),
421
607
  checksum_policy=checksum_policy,
422
608
  max_checksum_bytes=max_checksum_bytes,
423
609
  checksum_overrides=checksum_overrides,
@@ -430,7 +616,7 @@ def annotate_file(
430
616
 
431
617
  readme_path: Path | None = None
432
618
  if write_readme:
433
- _validated_file_readme_fields(title=title, summary=summary)
619
+ _validated_file_readme_fields(title=title_value, summary=summary_value)
434
620
  readme_path = write_file_readme(
435
621
  Path(str(artifact_path) + readme_suffix),
436
622
  artifact_path=document.subject.path,
@@ -446,9 +632,10 @@ def annotate_file(
446
632
 
447
633
 
448
634
  def annotate_directory(
449
- output_dir: str | Path,
635
+ output_dir: str | Path | None = None,
450
636
  *,
451
- artifacts: list[DocumentedArtifact],
637
+ answers: answer_payloads.DirectoryAnswersInput | None = None,
638
+ artifacts: list[DocumentedArtifact] | None = None,
452
639
  artifact_groups: list[DocumentedArtifactGroup] | None = None,
453
640
  child_bundles: list[ChildBundle] | None = None,
454
641
  title: str | None = None,
@@ -459,28 +646,97 @@ def annotate_directory(
459
646
  inputs: Sequence[str | Path] | None = None,
460
647
  function: Callable[..., Any] | None = None,
461
648
  write_readme: bool = True,
462
- write_schema: bool | None = None,
463
649
  annotation_filename: str = "data-annotations.json",
464
650
  readme_filename: str = "README.md",
465
651
  checksum_policy: ChecksumPolicy = "auto",
466
652
  max_checksum_bytes: int = provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES,
467
653
  checksum_overrides: Mapping[str | Path, str] | None = None,
468
654
  ) -> DirectoryAnnotationResult:
469
- document = _build_directory_annotation_document(
655
+ directory_answers = (
656
+ answer_payloads.load_directory_answers(answers) if answers is not None else None
657
+ )
658
+ selected_output_dir = _resolve_answer_target(
470
659
  output_dir,
471
- artifacts=artifacts,
472
- artifact_groups=artifact_groups,
473
- child_bundles=child_bundles,
474
- title=title,
475
- summary=summary,
660
+ directory_answers.target if directory_answers is not None else None,
661
+ label="output_dir",
662
+ )
663
+ artifacts_value = (
664
+ artifacts
665
+ if artifacts is not None
666
+ else (
667
+ _documented_artifacts_from_answers(directory_answers.artifacts)
668
+ if directory_answers
669
+ else None
670
+ )
671
+ )
672
+ if artifacts_value is None:
673
+ raise ValueError("artifacts is required unless answers supplies artifacts")
674
+ artifact_groups_value = (
675
+ artifact_groups
676
+ if artifact_groups is not None
677
+ else (
678
+ _documented_artifact_groups_from_answers(directory_answers.artifact_groups)
679
+ if directory_answers
680
+ else None
681
+ )
682
+ )
683
+ child_bundles_value = (
684
+ child_bundles
685
+ if child_bundles is not None
686
+ else (
687
+ _child_bundles_from_answers(directory_answers.child_bundles)
688
+ if directory_answers
689
+ else None
690
+ )
691
+ )
692
+ title_value = (
693
+ title
694
+ if title is not None
695
+ else (directory_answers.title if directory_answers else None)
696
+ )
697
+ summary_value = (
698
+ summary
699
+ if summary is not None
700
+ else (directory_answers.summary if directory_answers else None)
701
+ )
702
+ params_value = (
703
+ params
704
+ if params is not None
705
+ else (dict(directory_answers.params) if directory_answers else None)
706
+ )
707
+ inputs_value = (
708
+ inputs
709
+ if inputs is not None
710
+ else (list(directory_answers.inputs) if directory_answers else None)
711
+ )
712
+ checksum_overrides_value: dict[str | Path, str] | None
713
+ if directory_answers is not None and directory_answers.checksums:
714
+ checksum_overrides_value = dict(directory_answers.checksums)
715
+ if checksum_overrides is not None:
716
+ checksum_overrides_value.update(checksum_overrides)
717
+ else:
718
+ checksum_overrides_value = (
719
+ dict(checksum_overrides) if checksum_overrides is not None else None
720
+ )
721
+ document = _build_directory_annotation_document(
722
+ selected_output_dir,
723
+ artifacts=artifacts_value,
724
+ artifact_groups=artifact_groups_value,
725
+ child_bundles=child_bundles_value,
726
+ title=title_value,
727
+ summary=summary_value,
476
728
  acquisition_context=acquisition_context,
477
729
  generation_context=generation_context,
478
- params=params,
479
- inputs=inputs,
730
+ params=params_value,
731
+ inputs=inputs_value,
480
732
  function=function,
733
+ provenance_overrides=_provenance_overrides_from_answers(
734
+ directory_answers,
735
+ function=function,
736
+ ),
481
737
  checksum_policy=checksum_policy,
482
738
  max_checksum_bytes=max_checksum_bytes,
483
- checksum_overrides=checksum_overrides,
739
+ checksum_overrides=checksum_overrides_value,
484
740
  )
485
741
  output_dir = Path(document.subject.path)
486
742
  annotation_path = _write_annotation_document(
@@ -490,7 +746,7 @@ def annotate_directory(
490
746
 
491
747
  readme_path: Path | None = None
492
748
  if write_readme:
493
- _validated_directory_readme_fields(title=title, summary=summary)
749
+ _validated_directory_readme_fields(title=title_value, summary=summary_value)
494
750
  readme_path = write_directory_readme(
495
751
  output_dir / readme_filename,
496
752
  output_dir=document.subject.path,
@@ -278,6 +278,11 @@ def _selected_source_code(
278
278
  source_sha256,
279
279
  ]
280
280
  if not any(value is not None for value in source_values):
281
+ if (
282
+ answers is not None
283
+ and "source_code" in answers.provenance.runtime_inference_fields()
284
+ ):
285
+ return None
281
286
  return answers.provenance.source_code if answers is not None else None
282
287
 
283
288
  if source_kind is None:
@@ -297,6 +302,34 @@ def _selected_source_code(
297
302
  )
298
303
 
299
304
 
305
def _runtime_inferred_fields(answers: answer_files.BaseAnswers | None) -> set[str]:
    """Return the provenance field names reported as runtime-inferred.

    Args:
        answers: Loaded answers, or ``None`` when no answers were supplied.

    Returns:
        An empty set when ``answers`` is ``None``; otherwise whatever
        ``answers.provenance.runtime_inference_fields()`` returns.
    """
    return (
        set()
        if answers is None
        else answers.provenance.runtime_inference_fields()
    )
309
+
310
+
311
def _filter_runtime_inferred_overrides(
    overrides: dict[str, Any],
    *,
    answers: answer_files.BaseAnswers | None,
    explicit_fields: set[str],
) -> dict[str, Any]:
    """Drop override entries for fields the answers mark as runtime-inferred.

    Args:
        overrides: Candidate provenance overrides keyed by field name.
        answers: Loaded answers, or ``None``; the set of runtime-inferred
            field names is obtained from ``_runtime_inferred_fields``.
        explicit_fields: Field names that must be kept even when they are
            marked as runtime-inferred.

    Returns:
        ``overrides`` itself (the same object) when no field is
        runtime-inferred; otherwise a new dict that omits every
        runtime-inferred key not listed in ``explicit_fields``.
    """
    suppressed = _runtime_inferred_fields(answers)
    if not suppressed:
        return overrides

    # When "source_code" is runtime-inferred, these three fields are treated
    # as runtime-inferred as well. A new set is built so the set returned by
    # the answers object is left unmodified.
    if "source_code" in suppressed:
        suppressed = suppressed.union(
            ("git_remote_url", "git_sha", "script_repo_path")
        )

    kept = {}
    for name, value in overrides.items():
        if name in explicit_fields or name not in suppressed:
            kept[name] = value
    return kept
331
+
332
+
300
333
  def _validate_source_code_git_conflicts(
301
334
  source_code: SourceCodeReference | None,
302
335
  *,
@@ -374,6 +407,32 @@ def _collect_post_hoc_provenance_from_sources(
374
407
  selected_inputs = _selected_inputs(input_values, answers)
375
408
  selected_params = _selected_params(param_values, answers)
376
409
  answer_command_tokens = _command_tokens_from_answers(answers)
410
+ source_code_cli_values = {
411
+ source_kind,
412
+ source_uri,
413
+ source_download_uri,
414
+ source_path,
415
+ source_revision,
416
+ source_sha256,
417
+ }
418
+ explicit_override_fields = {
419
+ field_name
420
+ for field_name, is_explicit in {
421
+ "script": script is not None,
422
+ "script_repo_path": script_repo_path is not None,
423
+ "command": command is not None,
424
+ "function": function is not None,
425
+ "git_sha": git_sha is not None,
426
+ "git_branch": git_branch is not None,
427
+ "git_remote_name": git_remote_name is not None,
428
+ "git_remote_url": git_remote_url is not None,
429
+ "git_tags": git_tags is not None,
430
+ "git_describe": git_describe is not None,
431
+ "git_dirty": git_dirty is not None,
432
+ "source_code": any(value is not None for value in source_code_cli_values),
433
+ }.items()
434
+ if is_explicit
435
+ }
377
436
  selected_command = (
378
437
  command
379
438
  if command is not None
@@ -435,7 +494,15 @@ def _collect_post_hoc_provenance_from_sources(
435
494
  )
436
495
  if selected_source_code is not None:
437
496
  overrides["source_code"] = selected_source_code
438
- return inputs, params, overrides
497
+ return (
498
+ inputs,
499
+ params,
500
+ _filter_runtime_inferred_overrides(
501
+ overrides,
502
+ answers=answers,
503
+ explicit_fields=explicit_override_fields,
504
+ ),
505
+ )
439
506
 
440
507
  command_tokens = (
441
508
  _parse_command_string(command)
@@ -475,7 +542,15 @@ def _collect_post_hoc_provenance_from_sources(
475
542
  "git_dirty": _provenance_value(git_dirty, answers, "git_dirty"),
476
543
  "source_code": selected_source_code,
477
544
  }
478
- return selected_inputs or [], selected_params or {}, overrides
545
+ return (
546
+ selected_inputs or [],
547
+ selected_params or {},
548
+ _filter_runtime_inferred_overrides(
549
+ overrides,
550
+ answers=answers,
551
+ explicit_fields=explicit_override_fields,
552
+ ),
553
+ )
479
554
 
480
555
 
481
556
  def _documented_artifacts_from_answers(
@@ -0,0 +1,35 @@
1
"""Re-export the answers API from ``data_annotations.annotations.answers``.

This module defines nothing of its own: every name listed in ``__all__`` is
imported from ``data_annotations.annotations.answers``.

NOTE(review): presumably kept so that existing imports of
``data_annotations.cli_app.answers`` keep working after the implementation
moved to the ``annotations`` package; confirm against the release notes.
"""

from data_annotations.annotations.answers import (
    AnswersError,
    BaseAnswers,
    ChildBundleAnswers,
    DirectoryAnswers,
    DirectoryAnswersInput,
    DirectoryArtifactAnswers,
    DirectoryArtifactGroupAnswers,
    FileAnswers,
    FileAnswersInput,
    ProvenanceAnswers,
    check_answers,
    load_directory_answers,
    load_file_answers,
    require_complete_directory_answers,
    require_complete_file_answers,
)

# Public surface of this module: exactly the names imported above.
__all__ = [
    "AnswersError",
    "BaseAnswers",
    "ChildBundleAnswers",
    "DirectoryAnswers",
    "DirectoryAnswersInput",
    "DirectoryArtifactAnswers",
    "DirectoryArtifactGroupAnswers",
    "FileAnswers",
    "FileAnswersInput",
    "ProvenanceAnswers",
    "check_answers",
    "load_directory_answers",
    "load_file_answers",
    "require_complete_directory_answers",
    "require_complete_file_answers",
]