data-annotations 2.6.0__tar.gz → 2.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {data_annotations-2.6.0 → data_annotations-2.8.0}/PKG-INFO +84 -12
  2. {data_annotations-2.6.0 → data_annotations-2.8.0}/README.md +82 -10
  3. {data_annotations-2.6.0 → data_annotations-2.8.0}/pyproject.toml +3 -3
  4. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/annotations/__init__.py +12 -0
  5. {data_annotations-2.6.0/src/data_annotations/cli_app → data_annotations-2.8.0/src/data_annotations/annotations}/answers.py +137 -10
  6. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/annotations/decorators.py +88 -9
  7. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/annotations/writers.py +287 -31
  8. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/annotate/__init__.py +20 -4
  9. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/annotate/helpers.py +102 -21
  10. data_annotations-2.8.0/src/data_annotations/cli_app/answers.py +35 -0
  11. {data_annotations-2.6.0 → data_annotations-2.8.0}/LICENSE +0 -0
  12. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/__init__.py +0 -0
  13. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/_decorators.py +0 -0
  14. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/annotations/models.py +0 -0
  15. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli.py +0 -0
  16. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/__init__.py +0 -0
  17. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/common.py +0 -0
  18. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/prompts.py +0 -0
  19. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/provenance_commands.py +0 -0
  20. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/cli_app/publish.py +0 -0
  21. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/description/__init__.py +0 -0
  22. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/description/decorators.py +0 -0
  23. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/description/models.py +0 -0
  24. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/description/writers.py +0 -0
  25. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/__init__.py +0 -0
  26. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/decorators.py +0 -0
  27. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/git.py +0 -0
  28. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/models.py +0 -0
  29. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/__init__.py +0 -0
  30. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/chain.py +0 -0
  31. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/manifest.py +0 -0
  32. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/matching.py +0 -0
  33. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/sources.py +0 -0
  34. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/recovery/types.py +0 -0
  35. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/runtime.py +0 -0
  36. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/provenance/writers.py +0 -0
  37. {data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/publish.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-annotations
3
- Version: 2.6.0
3
+ Version: 2.8.0
4
4
  Summary: Annotate data artifacts with provenance and descriptions
5
5
  Keywords: annotations,data,metadata,provenance,reproducibility
6
6
  Author: Rodrigo C. G. Pena
@@ -18,7 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
18
18
  Classifier: Topic :: Scientific/Engineering
19
19
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
20
  Requires-Dist: pydantic>=2.13.1
21
- Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
21
+ Requires-Dist: pyyaml>=6.0.2
22
22
  Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
23
23
  Requires-Dist: typer>=0.16.0 ; extra == 'cli'
24
24
  Requires-Python: >=3.12
@@ -71,8 +71,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
71
71
  uv add data-annotations
72
72
  ```
73
73
 
74
- The command-line interface uses optional dependencies. Install the package with
75
- CLI support when you want to run `data-annotations` commands:
74
+ The command-line interface uses optional dependencies for prompting and command
75
+ parsing. Install the package with CLI support when you want to run
76
+ `data-annotations` commands:
76
77
 
77
78
  ```bash
78
79
  pip install "data-annotations[cli]"
@@ -125,13 +126,13 @@ Git tags and `git_describe` are human-friendly hints only. For Git sources,
125
126
  The recommended way to annotate your data artifacts is to decorate pipeline
126
127
  functions that consume some inputs and parameters, then write those artifacts.
127
128
  This keeps the artifact-writing logic explicit while letting `data-annotations` capture
128
- provenance and emit sidecars automatically.
129
+ provenance and emit the annotation JSON sidecar automatically.
129
130
 
130
131
  For example, here is a complete file-level annotation workflow using the
131
132
  `record_file_annotation(...)` decorator. Once `write_participants` is called, it
132
- automatically generates sidecars `participants.csv.annotation.json` and `participants.csv.README.md`.
133
- The JSON sidecar will contain provenance and description metadata, and the Markdown sidecar
134
- will have a human-friendly rendering of the description provided in the decorator.
133
+ automatically generates `participants.csv.annotation.json`. Set
134
+ `write_readme=True` when you also want `participants.csv.README.md`, a
135
+ human-friendly Markdown rendering of the description provided in the decorator.
135
136
 
136
137
  ```python
137
138
  from pathlib import Path
@@ -164,6 +165,7 @@ from data_annotations.description import AllowedValue, FieldDefinition
164
165
  artifact_kind="dataset",
165
166
  acquisition_context={"source": "Study A registry export"},
166
167
  generation_context={"pipeline": "baseline-v1"},
168
+ write_readme=True,
167
169
  )
168
170
  def write_participants(
169
171
  artifact_path: Path,
@@ -284,6 +286,7 @@ from data_annotations.provenance import ProducedFile
284
286
  summary="Directory-level documentation for the validation run outputs.",
285
287
  acquisition_context={"source": "Study A registry export"},
286
288
  generation_context={"pipeline": "baseline-v1"},
289
+ write_readme=True,
287
290
  )
288
291
  def build_outputs(
289
292
  output_dir: Path,
@@ -378,6 +381,43 @@ metadata to vary per call instead of staying fixed at decoration time, use
378
381
  `write_directory_annotation(...)` directly instead. See the example gallery in
379
382
  `examples/` for runnable examples of all approaches.
380
383
 
384
+ The Python API can also load the same YAML answers payloads used by the
385
+ CLI:
386
+
387
+ ```python
388
+ from data_annotations.annotations import (
389
+ annotate_directory,
390
+ annotate_file,
391
+ record_file_annotation,
392
+ )
393
+
394
+ annotate_file(answers="participants.yaml")
395
+ annotate_directory(answers="run-001.yaml")
396
+
397
+ annotate_file(
398
+ "outputs/summary.txt",
399
+ answers={"title": "Run Summary", "summary": "Validation run summary."},
400
+ )
401
+
402
+ # Add write_readme=True when you also want Markdown README sidecars.
403
+ annotate_file(answers="participants.yaml", write_readme=True)
404
+ annotate_directory(answers="run-001.yaml", write_readme=True)
405
+
406
+ @record_file_annotation(answers="participants.yaml", write_readme=True)
407
+ def write_participants(artifact_path, input_path):
408
+ ...
409
+ ```
410
+
411
+ If an answers payload includes `target`, the positional artifact path or directory
412
+ may be omitted. When both are provided, they must resolve to the same path.
413
+ Explicit Python keyword arguments override values from `answers`. Environment
414
+ variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
415
+ values in both YAML files and mapping payloads.
416
+
417
+ For directory decorators, the wrapped function still provides the produced output
418
+ inventory. Matching `answers.artifacts` entries can supply titles, summaries,
419
+ kinds, fields, primary keys, and missing-value codes for those returned paths.
420
+
381
421
  ### When To Use Decorators Vs Direct Functions
382
422
 
383
423
  If a function is only a final serializer for already-prepared data, prefer the
@@ -673,9 +713,14 @@ data-annotations annotate directory path/to/run-001 \
673
713
  --group-kind plot
674
714
  ```
675
715
 
676
- These commands prompt for missing details, write `*.annotation.json` or `data-annotations.json`,
677
- and optionally derive README sidecars. Post-hoc records are marked with
678
- `capture_mode="post_hoc"`.
716
+ These commands prompt for missing details and write `*.annotation.json` or
717
+ `data-annotations.json`. Post-hoc records are marked with
718
+ `capture_mode="post_hoc"`. README sidecars are opt-in:
719
+
720
+ ```bash
721
+ data-annotations annotate file path/to/participants.csv --write-readme
722
+ data-annotations annotate directory path/to/run-001 --write-readme
723
+ ```
679
724
 
680
725
  For shell workflows, you can move the prompt answers into a YAML file and run
681
726
  the command non-interactively:
@@ -772,6 +817,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
772
817
  `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
773
818
  and `provenance.params`.
774
819
 
820
+ Answers can request selected provenance fields from the current runtime instead
821
+ of taking them from the payload:
822
+
823
+ ```yaml
824
+ target: path/to/run-001
825
+ title: Processing outputs
826
+ summary: Files produced by the shell processing workflow.
827
+
828
+ provenance:
829
+ command: bash generate_some_data_artifact.sh
830
+ script: generate_some_data_artifact.sh
831
+ infer_from_runtime:
832
+ - runtime
833
+ - git
834
+ - source_code
835
+ ```
836
+
837
+ `runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
838
+ covers Git commit, branch, dirty state, remote, tags, and `git describe`.
839
+ `source_code` leaves the source-code reference derived from runtime Git metadata.
840
+ This is especially useful for timestamps, host/user and SLURM context, Git state,
841
+ and derived `source_code`. Provide generation `command` and `script` explicitly
842
+ in CLI answers files, because the runtime command and script would describe the
843
+ `data-annotations annotate ...` invocation rather than the script that generated
844
+ the artifact.
845
+
775
846
  For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
776
847
  `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
777
848
  sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
@@ -984,4 +1055,5 @@ uv run python examples/publish_cli.py
984
1055
  ```
985
1056
 
986
1057
  Each example writes its outputs to a fresh temporary directory and prints the
987
- location so you can inspect the generated annotation documents and README sidecars.
1058
+ location so you can inspect the generated annotation documents and any requested
1059
+ README sidecars.
@@ -41,8 +41,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
41
41
  uv add data-annotations
42
42
  ```
43
43
 
44
- The command-line interface uses optional dependencies. Install the package with
45
- CLI support when you want to run `data-annotations` commands:
44
+ The command-line interface uses optional dependencies for prompting and command
45
+ parsing. Install the package with CLI support when you want to run
46
+ `data-annotations` commands:
46
47
 
47
48
  ```bash
48
49
  pip install "data-annotations[cli]"
@@ -95,13 +96,13 @@ Git tags and `git_describe` are human-friendly hints only. For Git sources,
95
96
  The recommended way to annotate your data artifacts is to decorate pipeline
96
97
  functions that consume some inputs and parameters, then write those artifacts.
97
98
  This keeps the artifact-writing logic explicit while letting `data-annotations` capture
98
- provenance and emit sidecars automatically.
99
+ provenance and emit the annotation JSON sidecar automatically.
99
100
 
100
101
  For example, here is a complete file-level annotation workflow using the
101
102
  `record_file_annotation(...)` decorator. Once `write_participants` is called, it
102
- automatically generates sidecars `participants.csv.annotation.json` and `participants.csv.README.md`.
103
- The JSON sidecar will contain provenance and description metadata, and the Markdown sidecar
104
- will have a human-friendly rendering of the description provided in the decorator.
103
+ automatically generates `participants.csv.annotation.json`. Set
104
+ `write_readme=True` when you also want `participants.csv.README.md`, a
105
+ human-friendly Markdown rendering of the description provided in the decorator.
105
106
 
106
107
  ```python
107
108
  from pathlib import Path
@@ -134,6 +135,7 @@ from data_annotations.description import AllowedValue, FieldDefinition
134
135
  artifact_kind="dataset",
135
136
  acquisition_context={"source": "Study A registry export"},
136
137
  generation_context={"pipeline": "baseline-v1"},
138
+ write_readme=True,
137
139
  )
138
140
  def write_participants(
139
141
  artifact_path: Path,
@@ -254,6 +256,7 @@ from data_annotations.provenance import ProducedFile
254
256
  summary="Directory-level documentation for the validation run outputs.",
255
257
  acquisition_context={"source": "Study A registry export"},
256
258
  generation_context={"pipeline": "baseline-v1"},
259
+ write_readme=True,
257
260
  )
258
261
  def build_outputs(
259
262
  output_dir: Path,
@@ -348,6 +351,43 @@ metadata to vary per call instead of staying fixed at decoration time, use
348
351
  `write_directory_annotation(...)` directly instead. See the example gallery in
349
352
  `examples/` for runnable examples of all approaches.
350
353
 
354
+ The Python API can also load the same YAML answers payloads used by the
355
+ CLI:
356
+
357
+ ```python
358
+ from data_annotations.annotations import (
359
+ annotate_directory,
360
+ annotate_file,
361
+ record_file_annotation,
362
+ )
363
+
364
+ annotate_file(answers="participants.yaml")
365
+ annotate_directory(answers="run-001.yaml")
366
+
367
+ annotate_file(
368
+ "outputs/summary.txt",
369
+ answers={"title": "Run Summary", "summary": "Validation run summary."},
370
+ )
371
+
372
+ # Add write_readme=True when you also want Markdown README sidecars.
373
+ annotate_file(answers="participants.yaml", write_readme=True)
374
+ annotate_directory(answers="run-001.yaml", write_readme=True)
375
+
376
+ @record_file_annotation(answers="participants.yaml", write_readme=True)
377
+ def write_participants(artifact_path, input_path):
378
+ ...
379
+ ```
380
+
381
+ If an answers payload includes `target`, the positional artifact path or directory
382
+ may be omitted. When both are provided, they must resolve to the same path.
383
+ Explicit Python keyword arguments override values from `answers`. Environment
384
+ variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
385
+ values in both YAML files and mapping payloads.
386
+
387
+ For directory decorators, the wrapped function still provides the produced output
388
+ inventory. Matching `answers.artifacts` entries can supply titles, summaries,
389
+ kinds, fields, primary keys, and missing-value codes for those returned paths.
390
+
351
391
  ### When To Use Decorators Vs Direct Functions
352
392
 
353
393
  If a function is only a final serializer for already-prepared data, prefer the
@@ -643,9 +683,14 @@ data-annotations annotate directory path/to/run-001 \
643
683
  --group-kind plot
644
684
  ```
645
685
 
646
- These commands prompt for missing details, write `*.annotation.json` or `data-annotations.json`,
647
- and optionally derive README sidecars. Post-hoc records are marked with
648
- `capture_mode="post_hoc"`.
686
+ These commands prompt for missing details and write `*.annotation.json` or
687
+ `data-annotations.json`. Post-hoc records are marked with
688
+ `capture_mode="post_hoc"`. README sidecars are opt-in:
689
+
690
+ ```bash
691
+ data-annotations annotate file path/to/participants.csv --write-readme
692
+ data-annotations annotate directory path/to/run-001 --write-readme
693
+ ```
649
694
 
650
695
  For shell workflows, you can move the prompt answers into a YAML file and run
651
696
  the command non-interactively:
@@ -742,6 +787,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
742
787
  `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
743
788
  and `provenance.params`.
744
789
 
790
+ Answers can request selected provenance fields from the current runtime instead
791
+ of taking them from the payload:
792
+
793
+ ```yaml
794
+ target: path/to/run-001
795
+ title: Processing outputs
796
+ summary: Files produced by the shell processing workflow.
797
+
798
+ provenance:
799
+ command: bash generate_some_data_artifact.sh
800
+ script: generate_some_data_artifact.sh
801
+ infer_from_runtime:
802
+ - runtime
803
+ - git
804
+ - source_code
805
+ ```
806
+
807
+ `runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
808
+ covers Git commit, branch, dirty state, remote, tags, and `git describe`.
809
+ `source_code` lets the source-code reference be derived from runtime Git metadata.
810
+ This is especially useful for timestamps, host/user and SLURM context, Git state,
811
+ and derived `source_code`. Provide generation `command` and `script` explicitly
812
+ in CLI answers files, because the runtime command and script would describe the
813
+ `data-annotations annotate ...` invocation rather than the script that generated
814
+ the artifact.
815
+
745
816
  For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
746
817
  `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
747
818
  sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
@@ -954,4 +1025,5 @@ uv run python examples/publish_cli.py
954
1025
  ```
955
1026
 
956
1027
  Each example writes its outputs to a fresh temporary directory and prints the
957
- location so you can inspect the generated annotation documents and README sidecars.
1028
+ location so you can inspect the generated annotation documents and any requested
1029
+ README sidecars.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data-annotations"
3
- version = "2.6.0"
3
+ version = "2.8.0"
4
4
  description = "Annotate data artifacts with provenance and descriptions"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -9,7 +9,7 @@ authors = [
9
9
  license = "BSD-3-Clause"
10
10
  license-files = ["LICENSE"]
11
11
  requires-python = ">=3.12"
12
- dependencies = ["pydantic>=2.13.1"]
12
+ dependencies = ["pydantic>=2.13.1", "PyYAML>=6.0.2"]
13
13
  keywords = ["annotations", "data", "metadata", "provenance", "reproducibility"]
14
14
  classifiers = [
15
15
  "Development Status :: 4 - Beta",
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
30
30
  Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
31
31
 
32
32
  [project.optional-dependencies]
33
- cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
33
+ cli = ["questionary>=2.1.1", "typer>=0.16.0"]
34
34
 
35
35
  [project.scripts]
36
36
  data-annotations = "data_annotations.cli:main"
@@ -1,3 +1,10 @@
1
+ from .answers import (
2
+ AnswersError,
3
+ DirectoryAnswers,
4
+ FileAnswers,
5
+ load_directory_answers,
6
+ load_file_answers,
7
+ )
1
8
  from .models import (
2
9
  DirectoryAnnotationDocument,
3
10
  DirectoryAnnotationResult,
@@ -17,13 +24,18 @@ from .writers import (
17
24
  __all__ = [
18
25
  "annotate_directory",
19
26
  "annotate_file",
27
+ "load_directory_answers",
28
+ "load_file_answers",
20
29
  "record_directory_annotation",
21
30
  "record_file_annotation",
22
31
  "write_directory_annotation",
23
32
  "write_file_annotation",
33
+ "AnswersError",
34
+ "DirectoryAnswers",
24
35
  "DirectoryAnnotationDocument",
25
36
  "DirectoryAnnotationResult",
26
37
  "DirectoryArtifactSubject",
38
+ "FileAnswers",
27
39
  "FileAnnotationDocument",
28
40
  "FileAnnotationResult",
29
41
  "FileArtifactSubject",
@@ -1,18 +1,20 @@
1
1
  import os
2
2
  import re
3
3
  import shlex
4
+ from collections.abc import Mapping
4
5
  from pathlib import Path
5
- from typing import Any, Literal
6
+ from typing import Any, Literal, TypeAlias
6
7
 
7
8
  import yaml
8
9
  from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator
10
+ from pydantic import model_validator
9
11
 
10
12
  from data_annotations.description import FieldDefinition
11
13
  from data_annotations.provenance.models import ArtifactKind, SourceCodeReference
12
14
 
13
15
 
14
16
  class AnswersError(ValueError):
15
- """Raised when a CLI answers file cannot be used."""
17
+ """Raised when an annotation answers payload cannot be used."""
16
18
 
17
19
 
18
20
  _ENV_VAR_PATTERN = re.compile(
@@ -79,6 +81,46 @@ _PROVENANCE_KEYS = {
79
81
  "git_tags",
80
82
  "git_describe",
81
83
  "source_code",
84
+ "infer_from_runtime",
85
+ }
86
+
87
+ _RUNTIME_CONTEXT_FIELDS = {
88
+ "created_at",
89
+ "hostname",
90
+ "username",
91
+ "slurm_job_id",
92
+ }
93
+ _GIT_RUNTIME_FIELDS = {
94
+ "git_sha",
95
+ "git_branch",
96
+ "git_dirty",
97
+ "git_remote_name",
98
+ "git_remote_url",
99
+ "git_tags",
100
+ "git_describe",
101
+ }
102
+ _RUNTIME_INFERENCE_GROUPS = {
103
+ "runtime": _RUNTIME_CONTEXT_FIELDS,
104
+ "git": _GIT_RUNTIME_FIELDS,
105
+ }
106
+ _RUNTIME_INFERENCE_FIELDS = (
107
+ _RUNTIME_CONTEXT_FIELDS
108
+ | _GIT_RUNTIME_FIELDS
109
+ | set(_RUNTIME_INFERENCE_GROUPS)
110
+ | {"source_code"}
111
+ )
112
+ _UNSUPPORTED_RUNTIME_INFERENCE_FIELDS = {
113
+ "command",
114
+ "script",
115
+ "script_repo_path",
116
+ }
117
+ _EXPLICIT_PROVENANCE_OVERRIDE_FIELDS = {
118
+ "command",
119
+ "script",
120
+ "script_repo_path",
121
+ "function",
122
+ *_GIT_RUNTIME_FIELDS,
123
+ "source_code",
82
124
  }
83
125
 
84
126
 
@@ -97,6 +139,50 @@ class ProvenanceAnswers(BaseModel):
97
139
  git_tags: list[str] | None = None
98
140
  git_describe: str | None = None
99
141
  source_code: SourceCodeReference | None = None
142
+ infer_from_runtime: list[str] = Field(default_factory=list)
143
+
144
+ @field_validator("infer_from_runtime", mode="before")
145
+ @classmethod
146
+ def _coerce_runtime_inference_fields(cls, value: Any) -> Any:
147
+ if value is None:
148
+ return []
149
+ if isinstance(value, str):
150
+ return [value]
151
+ return value
152
+
153
+ @field_validator("infer_from_runtime")
154
+ @classmethod
155
+ def _validate_runtime_inference_fields(cls, values: list[str]) -> list[str]:
156
+ normalized: list[str] = []
157
+ for value in values:
158
+ if value in _UNSUPPORTED_RUNTIME_INFERENCE_FIELDS:
159
+ raise ValueError(
160
+ "runtime inference is not supported for "
161
+ f"provenance.{value}; provide it explicitly"
162
+ )
163
+ if value not in _RUNTIME_INFERENCE_FIELDS:
164
+ allowed = sorted(_RUNTIME_INFERENCE_FIELDS)
165
+ raise ValueError(
166
+ f"unknown runtime inference field {value!r}; "
167
+ "expected one of: " + ", ".join(allowed)
168
+ )
169
+ if value not in normalized:
170
+ normalized.append(value)
171
+ return normalized
172
+
173
+ @model_validator(mode="after")
174
+ def _validate_runtime_inference_conflicts(self) -> "ProvenanceAnswers":
175
+ inferred = self.runtime_inference_fields()
176
+ conflicts = sorted(
177
+ field
178
+ for field in inferred & _EXPLICIT_PROVENANCE_OVERRIDE_FIELDS
179
+ if field in self.model_fields_set
180
+ )
181
+ if conflicts:
182
+ raise ValueError(
183
+ "cannot both set and infer provenance field(s): " + ", ".join(conflicts)
184
+ )
185
+ return self
100
186
 
101
187
  def command_tokens(self) -> list[str] | None:
102
188
  if self.command is None:
@@ -108,6 +194,12 @@ class ProvenanceAnswers(BaseModel):
108
194
  except ValueError as exc:
109
195
  raise AnswersError(f"invalid provenance.command: {exc}") from exc
110
196
 
197
+ def runtime_inference_fields(self) -> set[str]:
198
+ fields: set[str] = set()
199
+ for field in self.infer_from_runtime:
200
+ fields.update(_RUNTIME_INFERENCE_GROUPS.get(field, {field}))
201
+ return fields
202
+
111
203
 
112
204
  class BaseAnswers(BaseModel):
113
205
  model_config = ConfigDict(extra="forbid")
@@ -177,12 +269,16 @@ class DirectoryAnswers(BaseAnswers):
177
269
  checksums: dict[str, str] = Field(default_factory=dict)
178
270
 
179
271
 
180
- def load_file_answers(path: str | Path) -> FileAnswers:
181
- return _validate_answers(path, mode="file")
272
+ FileAnswersInput: TypeAlias = str | Path | Mapping[str, Any] | FileAnswers
273
+ DirectoryAnswersInput: TypeAlias = str | Path | Mapping[str, Any] | DirectoryAnswers
274
+
275
+
276
+ def load_file_answers(source: FileAnswersInput) -> FileAnswers:
277
+ return _validate_answers(source, mode="file")
182
278
 
183
279
 
184
- def load_directory_answers(path: str | Path) -> DirectoryAnswers:
185
- return _validate_answers(path, mode="directory")
280
+ def load_directory_answers(source: DirectoryAnswersInput) -> DirectoryAnswers:
281
+ return _validate_answers(source, mode="directory")
186
282
 
187
283
 
188
284
  def check_answers(path: str | Path) -> tuple[Literal["file", "directory"], Path]:
@@ -230,17 +326,29 @@ def require_complete_directory_answers(
230
326
 
231
327
 
232
328
  def _validate_answers(
233
- path: str | Path,
329
+ source: FileAnswersInput | DirectoryAnswersInput,
234
330
  *,
235
331
  mode: Literal["file", "directory"],
236
332
  ) -> Any:
237
- normalized = _normalize_answers(_load_raw_answers(path))
333
+ if mode == "file" and isinstance(source, FileAnswers):
334
+ return source
335
+ if mode == "directory" and isinstance(source, DirectoryAnswers):
336
+ return source
337
+
338
+ normalized = _normalize_answers(_load_raw_answers(source))
238
339
  model = FileAnswers if mode == "file" else DirectoryAnswers
239
340
  return _model_validate(model, normalized)
240
341
 
241
342
 
242
- def _load_raw_answers(path: str | Path) -> dict[str, Any]:
243
- answers_path = Path(path).expanduser()
343
+ def _load_raw_answers(
344
+ source: str | Path | Mapping[str, Any] | BaseAnswers,
345
+ ) -> dict[str, Any]:
346
+ if isinstance(source, BaseAnswers):
347
+ return source.model_dump()
348
+ if isinstance(source, Mapping):
349
+ return _expand_env_vars(dict(source), path="$")
350
+
351
+ answers_path = Path(source).expanduser()
244
352
  if not answers_path.is_file():
245
353
  raise AnswersError(f"answers file not found: {answers_path}")
246
354
  try:
@@ -416,3 +524,22 @@ def _missing_required_common_fields(
416
524
 
417
525
  def _has_text(value: Any) -> bool:
418
526
  return isinstance(value, str) and bool(value.strip())
527
+
528
+
529
+ __all__ = [
530
+ "AnswersError",
531
+ "BaseAnswers",
532
+ "ChildBundleAnswers",
533
+ "DirectoryAnswers",
534
+ "DirectoryAnswersInput",
535
+ "DirectoryArtifactAnswers",
536
+ "DirectoryArtifactGroupAnswers",
537
+ "FileAnswers",
538
+ "FileAnswersInput",
539
+ "ProvenanceAnswers",
540
+ "check_answers",
541
+ "load_directory_answers",
542
+ "load_file_answers",
543
+ "require_complete_directory_answers",
544
+ "require_complete_file_answers",
545
+ ]