PyPI - data-annotations - Versions diffs - 2.6.0__tar.gz → 2.8.0__tar.gz - Mend

data-annotations 2.6.0__tar.gz → 2.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

{data_annotations-2.6.0 → data_annotations-2.8.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-annotations
-Version: 2.6.0
+Version: 2.8.0
 Summary: Annotate data artifacts with provenance and descriptions
 Keywords: annotations,data,metadata,provenance,reproducibility
 Author: Rodrigo C.  G.  Pena
@@ -18,7 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Scientific/Engineering
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Dist: pydantic>=2.13.1
-Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
+Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
 Requires-Dist: typer>=0.16.0 ; extra == 'cli'
 Requires-Python: >=3.12
@@ -71,8 +71,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
 uv add data-annotations
 ```
-The command-line interface uses optional dependencies. Install the package with
-CLI support when you want to run `data-annotations` commands:
+The command-line interface uses optional dependencies for prompting and command
+parsing. Install the package with CLI support when you want to run
+`data-annotations` commands:
 ```bash
 pip install "data-annotations[cli]"
@@ -125,13 +126,13 @@ Git tags and `git_describe` are human-friendly hints only. For Git sources,
 The recommended way to annotate your data artifacts is to decorate pipeline
 functions that consume some inputs and parameters, then write those artifacts.
 This keeps the artifact-writing logic explicit while letting `data-annotations` capture
-provenance and emit sidecars automatically.
+provenance and emit the annotation JSON sidecar automatically.
 For example, here is a complete file-level annotation workflow using the
 `record_file_annotation(...)` decorator. Once `write_participants` is called, it
-automatically generates sidecars `participants.csv.annotation.json` and `participants.csv.README.md`.
-The JSON sidecar will contain provenance and description metadata, and the Markdown sidecar
-will have a human-friendly rendering of the description provided in the decorator.
+automatically generates `participants.csv.annotation.json`. Set
+`write_readme=True` when you also want `participants.csv.README.md`, a
+human-friendly Markdown rendering of the description provided in the decorator.
 ```python
 from pathlib import Path
@@ -164,6 +165,7 @@ from data_annotations.description import AllowedValue, FieldDefinition
     artifact_kind="dataset",
     acquisition_context={"source": "Study A registry export"},
     generation_context={"pipeline": "baseline-v1"},
+    write_readme=True,
 )
 def write_participants(
     artifact_path: Path,
@@ -284,6 +286,7 @@ from data_annotations.provenance import ProducedFile
     summary="Directory-level documentation for the validation run outputs.",
     acquisition_context={"source": "Study A registry export"},
     generation_context={"pipeline": "baseline-v1"},
+    write_readme=True,
 )
 def build_outputs(
     output_dir: Path,
@@ -378,6 +381,43 @@ metadata to vary per call instead of staying fixed at decoration time, use
 `write_directory_annotation(...)` directly instead. See the example gallery in
 `examples/` for runnable examples of all approaches.
+The Python API can also load the same YAML answers payloads used by the
+CLI:
+```python
+from data_annotations.annotations import (
+    annotate_directory,
+    annotate_file,
+    record_file_annotation,
+)
+annotate_file(answers="participants.yaml")
+annotate_directory(answers="run-001.yaml")
+annotate_file(
+    "outputs/summary.txt",
+    answers={"title": "Run Summary", "summary": "Validation run summary."},
+)
+# Add write_readme=True when you also want Markdown README sidecars.
+annotate_file(answers="participants.yaml", write_readme=True)
+annotate_directory(answers="run-001.yaml", write_readme=True)
+@record_file_annotation(answers="participants.yaml", write_readme=True)
+def write_participants(artifact_path, input_path):
+    ...
+```
+If an answers payload includes `target`, the positional artifact path or directory
+may be omitted. When both are provided, they must resolve to the same path.
+Explicit Python keyword arguments override values from `answers`. Environment
+variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
+values in both YAML files and mapping payloads.
+For directory decorators, the wrapped function still provides the produced output
+inventory. Matching `answers.artifacts` entries can supply titles, summaries,
+kinds, fields, primary keys, and missing-value codes for those returned paths.
 ### When To Use Decorators Vs Direct Functions
 If a function is only a final serializer for already-prepared data, prefer the
@@ -673,9 +713,14 @@ data-annotations annotate directory path/to/run-001 \
   --group-kind plot
 ```
-These commands prompt for missing details, write `*.annotation.json` or `data-annotations.json`,
-and optionally derive README sidecars. Post-hoc records are marked with
-`capture_mode="post_hoc"`.
+These commands prompt for missing details and write `*.annotation.json` or
+`data-annotations.json`. Post-hoc records are marked with
+`capture_mode="post_hoc"`. README sidecars are opt-in:
+```bash
+data-annotations annotate file path/to/participants.csv --write-readme
+data-annotations annotate directory path/to/run-001 --write-readme
+```
 For shell workflows, you can move the prompt answers into a YAML file and run
 the command non-interactively:
@@ -772,6 +817,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
 `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
 and `provenance.params`.
+Answers can request selected provenance fields from the current runtime instead
+of taking them from the payload:
+```yaml
+target: path/to/run-001
+title: Processing outputs
+summary: Files produced by the shell processing workflow.
+provenance:
+    command: bash generate_some_data_artifact.sh
+    script: generate_some_data_artifact.sh
+    infer_from_runtime:
+        - runtime
+        - git
+        - source_code
+```
+`runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
+covers Git commit, branch, dirty state, remote, tags, and `git describe`.
+`source_code` leaves the source-code reference derived from runtime Git metadata.
+This is especially useful for timestamps, host/user and SLURM context, Git state,
+and derived `source_code`. Provide generation `command` and `script` explicitly
+in CLI answers files, because the runtime command and script would describe the
+`data-annotations annotate ...` invocation rather than the script that generated
+the artifact.
 For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
 `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
 sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
@@ -984,4 +1055,5 @@ uv run python examples/publish_cli.py
 ```
 Each example writes its outputs to a fresh temporary directory and prints the
-location so you can inspect the generated annotation documents and README sidecars.
+location so you can inspect the generated annotation documents and any requested
+README sidecars.

{data_annotations-2.6.0 → data_annotations-2.8.0}/README.md RENAMED Viewed

@@ -41,8 +41,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
 uv add data-annotations
 ```
-The command-line interface uses optional dependencies. Install the package with
-CLI support when you want to run `data-annotations` commands:
+The command-line interface uses optional dependencies for prompting and command
+parsing. Install the package with CLI support when you want to run
+`data-annotations` commands:
 ```bash
 pip install "data-annotations[cli]"
@@ -95,13 +96,13 @@ Git tags and `git_describe` are human-friendly hints only. For Git sources,
 The recommended way to annotate your data artifacts is to decorate pipeline
 functions that consume some inputs and parameters, then write those artifacts.
 This keeps the artifact-writing logic explicit while letting `data-annotations` capture
-provenance and emit sidecars automatically.
+provenance and emit the annotation JSON sidecar automatically.
 For example, here is a complete file-level annotation workflow using the
 `record_file_annotation(...)` decorator. Once `write_participants` is called, it
-automatically generates sidecars `participants.csv.annotation.json` and `participants.csv.README.md`.
-The JSON sidecar will contain provenance and description metadata, and the Markdown sidecar
-will have a human-friendly rendering of the description provided in the decorator.
+automatically generates `participants.csv.annotation.json`. Set
+`write_readme=True` when you also want `participants.csv.README.md`, a
+human-friendly Markdown rendering of the description provided in the decorator.
 ```python
 from pathlib import Path
@@ -134,6 +135,7 @@ from data_annotations.description import AllowedValue, FieldDefinition
     artifact_kind="dataset",
     acquisition_context={"source": "Study A registry export"},
     generation_context={"pipeline": "baseline-v1"},
+    write_readme=True,
 )
 def write_participants(
     artifact_path: Path,
@@ -254,6 +256,7 @@ from data_annotations.provenance import ProducedFile
     summary="Directory-level documentation for the validation run outputs.",
     acquisition_context={"source": "Study A registry export"},
     generation_context={"pipeline": "baseline-v1"},
+    write_readme=True,
 )
 def build_outputs(
     output_dir: Path,
@@ -348,6 +351,43 @@ metadata to vary per call instead of staying fixed at decoration time, use
 `write_directory_annotation(...)` directly instead. See the example gallery in
 `examples/` for runnable examples of all approaches.
+The Python API can also load the same YAML answers payloads used by the
+CLI:
+```python
+from data_annotations.annotations import (
+    annotate_directory,
+    annotate_file,
+    record_file_annotation,
+)
+annotate_file(answers="participants.yaml")
+annotate_directory(answers="run-001.yaml")
+annotate_file(
+    "outputs/summary.txt",
+    answers={"title": "Run Summary", "summary": "Validation run summary."},
+)
+# Add write_readme=True when you also want Markdown README sidecars.
+annotate_file(answers="participants.yaml", write_readme=True)
+annotate_directory(answers="run-001.yaml", write_readme=True)
+@record_file_annotation(answers="participants.yaml", write_readme=True)
+def write_participants(artifact_path, input_path):
+    ...
+```
+If an answers payload includes `target`, the positional artifact path or directory
+may be omitted. When both are provided, they must resolve to the same path.
+Explicit Python keyword arguments override values from `answers`. Environment
+variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
+values in both YAML files and mapping payloads.
+For directory decorators, the wrapped function still provides the produced output
+inventory. Matching `answers.artifacts` entries can supply titles, summaries,
+kinds, fields, primary keys, and missing-value codes for those returned paths.
 ### When To Use Decorators Vs Direct Functions
 If a function is only a final serializer for already-prepared data, prefer the
@@ -643,9 +683,14 @@ data-annotations annotate directory path/to/run-001 \
   --group-kind plot
 ```
-These commands prompt for missing details, write `*.annotation.json` or `data-annotations.json`,
-and optionally derive README sidecars. Post-hoc records are marked with
-`capture_mode="post_hoc"`.
+These commands prompt for missing details and write `*.annotation.json` or
+`data-annotations.json`. Post-hoc records are marked with
+`capture_mode="post_hoc"`. README sidecars are opt-in:
+```bash
+data-annotations annotate file path/to/participants.csv --write-readme
+data-annotations annotate directory path/to/run-001 --write-readme
+```
 For shell workflows, you can move the prompt answers into a YAML file and run
 the command non-interactively:
@@ -742,6 +787,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
 `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
 and `provenance.params`.
+Answers can request selected provenance fields from the current runtime instead
+of taking them from the payload:
+```yaml
+target: path/to/run-001
+title: Processing outputs
+summary: Files produced by the shell processing workflow.
+provenance:
+    command: bash generate_some_data_artifact.sh
+    script: generate_some_data_artifact.sh
+    infer_from_runtime:
+        - runtime
+        - git
+        - source_code
+```
+`runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
+covers Git commit, branch, dirty state, remote, tags, and `git describe`.
+`source_code` leaves the source-code reference derived from runtime Git metadata.
+This is especially useful for timestamps, host/user and SLURM context, Git state,
+and derived `source_code`. Provide generation `command` and `script` explicitly
+in CLI answers files, because the runtime command and script would describe the
+`data-annotations annotate ...` invocation rather than the script that generated
+the artifact.
 For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
 `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
 sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
@@ -954,4 +1025,5 @@ uv run python examples/publish_cli.py
 ```
 Each example writes its outputs to a fresh temporary directory and prints the
-location so you can inspect the generated annotation documents and README sidecars.
+location so you can inspect the generated annotation documents and any requested
+README sidecars.

{data_annotations-2.6.0 → data_annotations-2.8.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "data-annotations"
-version = "2.6.0"
+version = "2.8.0"
 description = "Annotate data artifacts with provenance and descriptions"
 readme = "README.md"
 authors = [
@@ -9,7 +9,7 @@ authors = [
 license = "BSD-3-Clause"
 license-files = ["LICENSE"]
 requires-python = ">=3.12"
-dependencies = ["pydantic>=2.13.1"]
+dependencies = ["pydantic>=2.13.1", "PyYAML>=6.0.2"]
 keywords = ["annotations", "data", "metadata", "provenance", "reproducibility"]
 classifiers = [
   "Development Status :: 4 - Beta",
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
 Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
 [project.optional-dependencies]
-cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
+cli = ["questionary>=2.1.1", "typer>=0.16.0"]
 [project.scripts]
 data-annotations = "data_annotations.cli:main"

{data_annotations-2.6.0 → data_annotations-2.8.0}/src/data_annotations/annotations/__init__.py RENAMED Viewed

@@ -1,3 +1,10 @@
+from .answers import (
+    AnswersError,
+    DirectoryAnswers,
+    FileAnswers,
+    load_directory_answers,
+    load_file_answers,
+)
 from .models import (
     DirectoryAnnotationDocument,
     DirectoryAnnotationResult,
@@ -17,13 +24,18 @@ from .writers import (
 __all__ = [
     "annotate_directory",
     "annotate_file",
+    "load_directory_answers",
+    "load_file_answers",
     "record_directory_annotation",
     "record_file_annotation",
     "write_directory_annotation",
     "write_file_annotation",
+    "AnswersError",
+    "DirectoryAnswers",
     "DirectoryAnnotationDocument",
     "DirectoryAnnotationResult",
     "DirectoryArtifactSubject",
+    "FileAnswers",
     "FileAnnotationDocument",
     "FileAnnotationResult",
     "FileArtifactSubject",

{data_annotations-2.6.0/src/data_annotations/cli_app → data_annotations-2.8.0/src/data_annotations/annotations}/answers.py RENAMED Viewed

@@ -1,18 +1,20 @@
 import os
 import re
 import shlex
+from collections.abc import Mapping
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, TypeAlias
 import yaml
 from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator
+from pydantic import model_validator
 from data_annotations.description import FieldDefinition
 from data_annotations.provenance.models import ArtifactKind, SourceCodeReference
 class AnswersError(ValueError):
-    """Raised when a CLI answers file cannot be used."""
+    """Raised when an annotation answers payload cannot be used."""
 _ENV_VAR_PATTERN = re.compile(
@@ -79,6 +81,46 @@ _PROVENANCE_KEYS = {
     "git_tags",
     "git_describe",
     "source_code",
+    "infer_from_runtime",
+}
+_RUNTIME_CONTEXT_FIELDS = {
+    "created_at",
+    "hostname",
+    "username",
+    "slurm_job_id",
+}
+_GIT_RUNTIME_FIELDS = {
+    "git_sha",
+    "git_branch",
+    "git_dirty",
+    "git_remote_name",
+    "git_remote_url",
+    "git_tags",
+    "git_describe",
+}
+_RUNTIME_INFERENCE_GROUPS = {
+    "runtime": _RUNTIME_CONTEXT_FIELDS,
+    "git": _GIT_RUNTIME_FIELDS,
+}
+_RUNTIME_INFERENCE_FIELDS = (
+    _RUNTIME_CONTEXT_FIELDS
+    | _GIT_RUNTIME_FIELDS
+    | set(_RUNTIME_INFERENCE_GROUPS)
+    | {"source_code"}
+)
+_UNSUPPORTED_RUNTIME_INFERENCE_FIELDS = {
+    "command",
+    "script",
+    "script_repo_path",
+}
+_EXPLICIT_PROVENANCE_OVERRIDE_FIELDS = {
+    "command",
+    "script",
+    "script_repo_path",
+    "function",
+    *_GIT_RUNTIME_FIELDS,
+    "source_code",
 }
@@ -97,6 +139,50 @@ class ProvenanceAnswers(BaseModel):
     git_tags: list[str] | None = None
     git_describe: str | None = None
     source_code: SourceCodeReference | None = None
+    infer_from_runtime: list[str] = Field(default_factory=list)
+    @field_validator("infer_from_runtime", mode="before")
+    @classmethod
+    def _coerce_runtime_inference_fields(cls, value: Any) -> Any:
+        if value is None:
+            return []
+        if isinstance(value, str):
+            return [value]
+        return value
+    @field_validator("infer_from_runtime")
+    @classmethod
+    def _validate_runtime_inference_fields(cls, values: list[str]) -> list[str]:
+        normalized: list[str] = []
+        for value in values:
+            if value in _UNSUPPORTED_RUNTIME_INFERENCE_FIELDS:
+                raise ValueError(
+                    "runtime inference is not supported for "
+                    f"provenance.{value}; provide it explicitly"
+                )
+            if value not in _RUNTIME_INFERENCE_FIELDS:
+                allowed = sorted(_RUNTIME_INFERENCE_FIELDS)
+                raise ValueError(
+                    f"unknown runtime inference field {value!r}; "
+                    "expected one of: " + ", ".join(allowed)
+                )
+            if value not in normalized:
+                normalized.append(value)
+        return normalized
+    @model_validator(mode="after")
+    def _validate_runtime_inference_conflicts(self) -> "ProvenanceAnswers":
+        inferred = self.runtime_inference_fields()
+        conflicts = sorted(
+            field
+            for field in inferred & _EXPLICIT_PROVENANCE_OVERRIDE_FIELDS
+            if field in self.model_fields_set
+        )
+        if conflicts:
+            raise ValueError(
+                "cannot both set and infer provenance field(s): " + ", ".join(conflicts)
+            )
+        return self
     def command_tokens(self) -> list[str] | None:
         if self.command is None:
@@ -108,6 +194,12 @@ class ProvenanceAnswers(BaseModel):
         except ValueError as exc:
             raise AnswersError(f"invalid provenance.command: {exc}") from exc
+    def runtime_inference_fields(self) -> set[str]:
+        fields: set[str] = set()
+        for field in self.infer_from_runtime:
+            fields.update(_RUNTIME_INFERENCE_GROUPS.get(field, {field}))
+        return fields
 class BaseAnswers(BaseModel):
     model_config = ConfigDict(extra="forbid")
@@ -177,12 +269,16 @@ class DirectoryAnswers(BaseAnswers):
     checksums: dict[str, str] = Field(default_factory=dict)
-def load_file_answers(path: str | Path) -> FileAnswers:
-    return _validate_answers(path, mode="file")
+FileAnswersInput: TypeAlias = str | Path | Mapping[str, Any] | FileAnswers
+DirectoryAnswersInput: TypeAlias = str | Path | Mapping[str, Any] | DirectoryAnswers
+def load_file_answers(source: FileAnswersInput) -> FileAnswers:
+    return _validate_answers(source, mode="file")
-def load_directory_answers(path: str | Path) -> DirectoryAnswers:
-    return _validate_answers(path, mode="directory")
+def load_directory_answers(source: DirectoryAnswersInput) -> DirectoryAnswers:
+    return _validate_answers(source, mode="directory")
 def check_answers(path: str | Path) -> tuple[Literal["file", "directory"], Path]:
@@ -230,17 +326,29 @@ def require_complete_directory_answers(
 def _validate_answers(
-    path: str | Path,
+    source: FileAnswersInput | DirectoryAnswersInput,
     *,
     mode: Literal["file", "directory"],
 ) -> Any:
-    normalized = _normalize_answers(_load_raw_answers(path))
+    if mode == "file" and isinstance(source, FileAnswers):
+        return source
+    if mode == "directory" and isinstance(source, DirectoryAnswers):
+        return source
+    normalized = _normalize_answers(_load_raw_answers(source))
     model = FileAnswers if mode == "file" else DirectoryAnswers
     return _model_validate(model, normalized)
-def _load_raw_answers(path: str | Path) -> dict[str, Any]:
-    answers_path = Path(path).expanduser()
+def _load_raw_answers(
+    source: str | Path | Mapping[str, Any] | BaseAnswers,
+) -> dict[str, Any]:
+    if isinstance(source, BaseAnswers):
+        return source.model_dump()
+    if isinstance(source, Mapping):
+        return _expand_env_vars(dict(source), path="$")
+    answers_path = Path(source).expanduser()
     if not answers_path.is_file():
         raise AnswersError(f"answers file not found: {answers_path}")
     try:
@@ -416,3 +524,22 @@ def _missing_required_common_fields(
 def _has_text(value: Any) -> bool:
     return isinstance(value, str) and bool(value.strip())
+__all__ = [
+    "AnswersError",
+    "BaseAnswers",
+    "ChildBundleAnswers",
+    "DirectoryAnswers",
+    "DirectoryAnswersInput",
+    "DirectoryArtifactAnswers",
+    "DirectoryArtifactGroupAnswers",
+    "FileAnswers",
+    "FileAnswersInput",
+    "ProvenanceAnswers",
+    "check_answers",
+    "load_directory_answers",
+    "load_file_answers",
+    "require_complete_directory_answers",
+    "require_complete_file_answers",
+]

data-annotations 2.6.0__tar.gz → 2.8.0__tar.gz

data-annotations 2.6.0__tar.gz → 2.8.0__tar.gz