PyPI - data-annotations - Versions diffs - 2.5.0__tar.gz → 2.7.0__tar.gz - Mend

data-annotations 2.5.0__tar.gz → 2.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

{data_annotations-2.5.0 → data_annotations-2.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-annotations
-Version: 2.5.0
+Version: 2.7.0
 Summary: Annotate data artifacts with provenance and descriptions
 Keywords: annotations,data,metadata,provenance,reproducibility
 Author: Rodrigo C.  G.  Pena
@@ -18,7 +18,7 @@ Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Scientific/Engineering
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Dist: pydantic>=2.13.1
-Requires-Dist: pyyaml>=6.0.2 ; extra == 'cli'
+Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: questionary>=2.1.1 ; extra == 'cli'
 Requires-Dist: typer>=0.16.0 ; extra == 'cli'
 Requires-Python: >=3.12
@@ -71,8 +71,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
 uv add data-annotations
 ```
-The command-line interface uses optional dependencies. Install the package with
-CLI support when you want to run `data-annotations` commands:
+The command-line interface uses optional dependencies for prompting and command
+parsing. Install the package with CLI support when you want to run
+`data-annotations` commands:
 ```bash
 pip install "data-annotations[cli]"
@@ -109,6 +110,11 @@ Every annotation document includes provenance with:
   directory content digests, and upstream annotation sidecar references when
   present
+Local file hashing defaults to checksum policy `auto`: existing files are hashed
+only up to `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
+their `sha256` or directory `content_digest` is left unset unless you provide a
+precomputed checksum yourself.
 You can also attach your own parameters, input file paths, and function names.
 Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
 such as `s3://...` or `https://...` are preserved as provided.
@@ -373,6 +379,39 @@ metadata to vary per call instead of staying fixed at decoration time, use
 `write_directory_annotation(...)` directly instead. See the example gallery in
 `examples/` for runnable examples of all approaches.
+The Python API can also load the same YAML answers payloads used by the
+CLI:
+```python
+from data_annotations.annotations import (
+    annotate_directory,
+    annotate_file,
+    record_file_annotation,
+)
+annotate_file(answers="participants.yaml")
+annotate_directory(answers="run-001.yaml")
+annotate_file(
+    "outputs/summary.txt",
+    answers={"title": "Run Summary", "summary": "Validation run summary."},
+)
+@record_file_annotation(answers="participants.yaml")
+def write_participants(artifact_path, input_path):
+    ...
+```
+If an answers payload includes `target`, the positional artifact path or directory
+may be omitted. When both are provided, they must resolve to the same path.
+Explicit Python keyword arguments override values from `answers`. Environment
+variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
+values in both YAML files and mapping payloads.
+For directory decorators, the wrapped function still provides the produced output
+inventory. Matching `answers.artifacts` entries can supply titles, summaries,
+kinds, fields, primary keys, and missing-value codes for those returned paths.
 ### When To Use Decorators Vs Direct Functions
 If a function is only a final serializer for already-prepared data, prefer the
@@ -502,6 +541,75 @@ README.
 If you want the direct writer approach instead, use `write_file_manifest(...)` and
 `write_directory_manifest(...)` (see `examples/`).
+## Checksum Policy
+All provenance and annotation entry points that hash local files support the same
+policy controls:
+- `checksum_policy="auto"`: hash existing local files only when they are at or
+  below `max_checksum_bytes`. This is the default, and
+  `max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
+- `checksum_policy="always"`: hash existing local files regardless of size.
+- `checksum_policy="never"`: never hash local files automatically. Checksums are
+  recorded only when you supply them explicitly.
+When a checksum is skipped, JSON sidecars keep the same schema and simply store
+`sha256: null`. Directory `content_digest` is also left unset when any tracked
+member file lacks a checksum.
+You can change the policy from Python:
+```python
+from data_annotations.annotations import annotate_file
+from data_annotations.provenance import write_file_manifest
+write_file_manifest(
+    "outputs/summary.txt",
+    checksum_policy="always",
+)
+annotate_file(
+    "outputs/summary.txt",
+    title="Run Summary",
+    summary="Post-hoc summary.",
+    artifact_sha256="precomputed-sha256",
+    checksum_policy="never",
+)
+```
+You can also inject precomputed checksums directly:
+- File APIs: pass `artifact_sha256=...`.
+- File or directory APIs: pass `checksum_overrides={path: sha256}`. For
+  directory outputs, keys can be relative to the output directory or absolute
+  paths.
+- Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
+  `record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
+  same checksum-policy arguments.
+From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
+repeatable `--checksum PATH=SHA256`:
+```bash
+data-annotations annotate file path/to/summary.txt \
+  --title "Run Summary" \
+  --summary "Post-hoc summary." \
+  --kind report \
+  --checksum-policy never \
+  --sha256 0123456789abcdef...
+data-annotations annotate directory path/to/run-001 \
+  --title "Processing outputs" \
+  --summary "Directory-level outputs." \
+  --checksum-policy never \
+  --checksum processed.csv=0123456789abcdef...
+data-annotations provenance chain path/to/run-001 \
+  --checksum-policy always
+```
+For a complete runnable workflow, see `examples/checksum_policy.py`.
 ## Description Layer
 The `data_annotations.description` sub-package provides the structured description
@@ -628,6 +736,7 @@ target: path/to/participants.csv
 title: Participant Cohort
 summary: Participant-level cohort assignments.
 kind: dataset
+sha256: 0123456789abcdef...
 inputs:
     - ${DATA_ROOT}/raw/participants.csv
@@ -670,6 +779,9 @@ provenance:
     command: bash process_from_instrument.sh
     script: process_from_instrument.sh
+checksums:
+    processed.csv: 0123456789abcdef...
 artifacts:
     - path: processed.csv
       kind: dataset
@@ -694,6 +806,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
 `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
 and `provenance.params`.
+Answers can request selected provenance fields from the current runtime instead
+of taking them from the payload:
+```yaml
+target: path/to/run-001
+title: Processing outputs
+summary: Files produced by the shell processing workflow.
+provenance:
+    command: bash generate_some_data_artifact.sh
+    script: generate_some_data_artifact.sh
+    infer_from_runtime:
+        - runtime
+        - git
+        - source_code
+```
+`runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
+covers Git commit, branch, dirty state, remote, tags, and `git describe`.
+`source_code` leaves the source-code reference derived from runtime Git metadata.
+This is especially useful for timestamps, host/user and SLURM context, Git state,
+and derived `source_code`. Provide generation `command` and `script` explicitly
+in CLI answers files, because the runtime command and script would describe the
+`data-annotations annotate ...` invocation rather than the script that generated
+the artifact.
 For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
 `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
 sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
@@ -765,6 +903,11 @@ resolving an older installed command. From a source checkout, use
 `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
 updated source before using the bare `data-annotations` command.
+Both `match` and `chain` also accept `--checksum-policy` and
+`--max-checksum-bytes`. Use `--checksum-policy always` when you want full
+verification of large local files, and leave the default `auto` when you prefer
+to avoid long checksum passes on very large artifacts.
 ### Run With `uvx`
 ```bash
@@ -886,6 +1029,7 @@ uv run python examples/record_file_description.py
 uv run python examples/record_directory_description.py
 uv run python examples/annotate_file.py
 uv run python examples/annotate_directory.py
+uv run python examples/checksum_policy.py
 uv run python examples/annotate_file_answers_cli.py
 uv run python examples/write_file_manifest.py
 uv run python examples/write_directory_manifest.py

{data_annotations-2.5.0 → data_annotations-2.7.0}/README.md RENAMED Viewed

@@ -41,8 +41,9 @@ Or add it to a project with [uv](https://astral.sh/uv/):
 uv add data-annotations
 ```
-The command-line interface uses optional dependencies. Install the package with
-CLI support when you want to run `data-annotations` commands:
+The command-line interface uses optional dependencies for prompting and command
+parsing. Install the package with CLI support when you want to run
+`data-annotations` commands:
 ```bash
 pip install "data-annotations[cli]"
@@ -79,6 +80,11 @@ Every annotation document includes provenance with:
   directory content digests, and upstream annotation sidecar references when
   present
+Local file hashing defaults to checksum policy `auto`: existing files are hashed
+only up to `10 * 1024**3` bytes (10 GiB). Larger files are still recorded, but
+their `sha256` or directory `content_digest` is left unset unless you provide a
+precomputed checksum yourself.
 You can also attach your own parameters, input file paths, and function names.
 Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
 such as `s3://...` or `https://...` are preserved as provided.
@@ -343,6 +349,39 @@ metadata to vary per call instead of staying fixed at decoration time, use
 `write_directory_annotation(...)` directly instead. See the example gallery in
 `examples/` for runnable examples of all approaches.
+The Python API can also load the same YAML answers payloads used by the
+CLI:
+```python
+from data_annotations.annotations import (
+    annotate_directory,
+    annotate_file,
+    record_file_annotation,
+)
+annotate_file(answers="participants.yaml")
+annotate_directory(answers="run-001.yaml")
+annotate_file(
+    "outputs/summary.txt",
+    answers={"title": "Run Summary", "summary": "Validation run summary."},
+)
+@record_file_annotation(answers="participants.yaml")
+def write_participants(artifact_path, input_path):
+    ...
+```
+If an answers payload includes `target`, the positional artifact path or directory
+may be omitted. When both are provided, they must resolve to the same path.
+Explicit Python keyword arguments override values from `answers`. Environment
+variables such as `$DATA_ROOT` and `${DATA_ROOT}` are expanded inside string
+values in both YAML files and mapping payloads.
+For directory decorators, the wrapped function still provides the produced output
+inventory. Matching `answers.artifacts` entries can supply titles, summaries,
+kinds, fields, primary keys, and missing-value codes for those returned paths.
 ### When To Use Decorators Vs Direct Functions
 If a function is only a final serializer for already-prepared data, prefer the
@@ -472,6 +511,75 @@ README.
 If you want the direct writer approach instead, use `write_file_manifest(...)` and
 `write_directory_manifest(...)` (see `examples/`).
+## Checksum Policy
+All provenance and annotation entry points that hash local files support the same
+policy controls:
+- `checksum_policy="auto"`: hash existing local files only when they are at or
+  below `max_checksum_bytes`. This is the default, and
+  `max_checksum_bytes` defaults to `10 * 1024**3` bytes (10 GiB).
+- `checksum_policy="always"`: hash existing local files regardless of size.
+- `checksum_policy="never"`: never hash local files automatically. Checksums are
+  recorded only when you supply them explicitly.
+When a checksum is skipped, JSON sidecars keep the same schema and simply store
+`sha256: null`. Directory `content_digest` is also left unset when any tracked
+member file lacks a checksum.
+You can change the policy from Python:
+```python
+from data_annotations.annotations import annotate_file
+from data_annotations.provenance import write_file_manifest
+write_file_manifest(
+    "outputs/summary.txt",
+    checksum_policy="always",
+)
+annotate_file(
+    "outputs/summary.txt",
+    title="Run Summary",
+    summary="Post-hoc summary.",
+    artifact_sha256="precomputed-sha256",
+    checksum_policy="never",
+)
+```
+You can also inject precomputed checksums directly:
+- File APIs: pass `artifact_sha256=...`.
+- File or directory APIs: pass `checksum_overrides={path: sha256}`. For
+  directory outputs, keys can be relative to the output directory or absolute
+  paths.
+- Decorators such as `record_file_manifest(...)`, `record_directory_manifest(...)`,
+  `record_file_annotation(...)`, and `record_directory_annotation(...)` accept the
+  same checksum-policy arguments.
+From the CLI, use `--checksum-policy`, `--max-checksum-bytes`, `--sha256`, and
+repeatable `--checksum PATH=SHA256`:
+```bash
+data-annotations annotate file path/to/summary.txt \
+  --title "Run Summary" \
+  --summary "Post-hoc summary." \
+  --kind report \
+  --checksum-policy never \
+  --sha256 0123456789abcdef...
+data-annotations annotate directory path/to/run-001 \
+  --title "Processing outputs" \
+  --summary "Directory-level outputs." \
+  --checksum-policy never \
+  --checksum processed.csv=0123456789abcdef...
+data-annotations provenance chain path/to/run-001 \
+  --checksum-policy always
+```
+For a complete runnable workflow, see `examples/checksum_policy.py`.
 ## Description Layer
 The `data_annotations.description` sub-package provides the structured description
@@ -598,6 +706,7 @@ target: path/to/participants.csv
 title: Participant Cohort
 summary: Participant-level cohort assignments.
 kind: dataset
+sha256: 0123456789abcdef...
 inputs:
     - ${DATA_ROOT}/raw/participants.csv
@@ -640,6 +749,9 @@ provenance:
     command: bash process_from_instrument.sh
     script: process_from_instrument.sh
+checksums:
+    processed.csv: 0123456789abcdef...
 artifacts:
     - path: processed.csv
       kind: dataset
@@ -664,6 +776,32 @@ Answers files may also use schema-style aliases such as `subject.path`,
 `description.artifacts`, `description.artifact_groups`, `provenance.inputs`,
 and `provenance.params`.
+Answers can request selected provenance fields from the current runtime instead
+of taking them from the payload:
+```yaml
+target: path/to/run-001
+title: Processing outputs
+summary: Files produced by the shell processing workflow.
+provenance:
+    command: bash generate_some_data_artifact.sh
+    script: generate_some_data_artifact.sh
+    infer_from_runtime:
+        - runtime
+        - git
+        - source_code
+```
+`runtime` covers `created_at`, `hostname`, `username`, and `slurm_job_id`. `git`
+covers Git commit, branch, dirty state, remote, tags, and `git describe`.
+`source_code` leaves the source-code reference derived from runtime Git metadata.
+This is especially useful for timestamps, host/user and SLURM context, Git state,
+and derived `source_code`. Provide generation `command` and `script` explicitly
+in CLI answers files, because the runtime command and script would describe the
+`data-annotations annotate ...` invocation rather than the script that generated
+the artifact.
 For source-code recovery, `provenance.source_code.kind` may be `git`, `archive`,
 `file`, or `uri`. Git sources use `uri` plus `revision`; archive and file
 sources use `uri` or `download_uri` plus an optional `sha256`; `path` points to
@@ -735,6 +873,11 @@ resolving an older installed command. From a source checkout, use
 `uv run data-annotations provenance chain ...`, or reinstall the CLI from the
 updated source before using the bare `data-annotations` command.
+Both `match` and `chain` also accept `--checksum-policy` and
+`--max-checksum-bytes`. Use `--checksum-policy always` when you want full
+verification of large local files, and leave the default `auto` when you prefer
+to avoid long checksum passes on very large artifacts.
 ### Run With `uvx`
 ```bash
@@ -856,6 +999,7 @@ uv run python examples/record_file_description.py
 uv run python examples/record_directory_description.py
 uv run python examples/annotate_file.py
 uv run python examples/annotate_directory.py
+uv run python examples/checksum_policy.py
 uv run python examples/annotate_file_answers_cli.py
 uv run python examples/write_file_manifest.py
 uv run python examples/write_directory_manifest.py

{data_annotations-2.5.0 → data_annotations-2.7.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "data-annotations"
-version = "2.5.0"
+version = "2.7.0"
 description = "Annotate data artifacts with provenance and descriptions"
 readme = "README.md"
 authors = [
@@ -9,7 +9,7 @@ authors = [
 license = "BSD-3-Clause"
 license-files = ["LICENSE"]
 requires-python = ">=3.12"
-dependencies = ["pydantic>=2.13.1"]
+dependencies = ["pydantic>=2.13.1", "PyYAML>=6.0.2"]
 keywords = ["annotations", "data", "metadata", "provenance", "reproducibility"]
 classifiers = [
   "Development Status :: 4 - Beta",
@@ -30,7 +30,7 @@ Changelog = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/blob/main/C
 Issues = "https://gitlab.com/ceda-unibas/tools/data-annotations/-/issues"
 [project.optional-dependencies]
-cli = ["PyYAML>=6.0.2", "questionary>=2.1.1", "typer>=0.16.0"]
+cli = ["questionary>=2.1.1", "typer>=0.16.0"]
 [project.scripts]
 data-annotations = "data_annotations.cli:main"

{data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/_decorators.py RENAMED Viewed

@@ -9,6 +9,7 @@ if TYPE_CHECKING:
         DocumentedArtifactGroup,
     )
     from data_annotations.provenance.models import ChildBundle, ProducedFile
+    from data_annotations.provenance.models import ChecksumPolicy
 DEFAULT_INPUT_ARGS = ("input_path", "input_paths")
@@ -78,6 +79,8 @@ def coerce_produced_file(
     item: Any,
     *,
     normalize_paths: bool = True,
+    checksum_policy: "ChecksumPolicy" = "auto",
+    max_checksum_bytes: int | None = None,
 ) -> "ProducedFile":
     from data_annotations.description.models import DocumentedArtifact
     from data_annotations.provenance import writers as provenance_writers
@@ -89,7 +92,15 @@ def coerce_produced_file(
             path=str(path),
             kind=item.kind,
             sha256=(
-                provenance_writers.sha256_file(path)
+                provenance_writers._resolve_file_sha256(
+                    path,
+                    checksum_policy=checksum_policy,
+                    max_checksum_bytes=(
+                        max_checksum_bytes
+                        if max_checksum_bytes is not None
+                        else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
+                    ),
+                )
                 if normalize_paths and path.exists()
                 else None
             ),
@@ -106,7 +117,15 @@ def coerce_produced_file(
             path=str(normalized),
             kind=kind,
             sha256=(
-                provenance_writers.sha256_file(normalized)
+                provenance_writers._resolve_file_sha256(
+                    normalized,
+                    checksum_policy=checksum_policy,
+                    max_checksum_bytes=(
+                        max_checksum_bytes
+                        if max_checksum_bytes is not None
+                        else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
+                    ),
+                )
                 if normalize_paths and normalized.exists()
                 else None
             ),
@@ -117,7 +136,15 @@ def coerce_produced_file(
         path=str(path),
         kind="other",
         sha256=(
-            provenance_writers.sha256_file(path)
+            provenance_writers._resolve_file_sha256(
+                path,
+                checksum_policy=checksum_policy,
+                max_checksum_bytes=(
+                    max_checksum_bytes
+                    if max_checksum_bytes is not None
+                    else provenance_writers.DEFAULT_MAX_CHECKSUM_BYTES
+                ),
+            )
             if normalize_paths and path.exists()
             else None
         ),
@@ -128,9 +155,17 @@ def coerce_produced_files(
     items: Iterable[Any],
     *,
     normalize_paths: bool = True,
+    checksum_policy: "ChecksumPolicy" = "auto",
+    max_checksum_bytes: int | None = None,
 ) -> list["ProducedFile"]:
     return [
-        coerce_produced_file(item, normalize_paths=normalize_paths) for item in items
+        coerce_produced_file(
+            item,
+            normalize_paths=normalize_paths,
+            checksum_policy=checksum_policy,
+            max_checksum_bytes=max_checksum_bytes,
+        )
+        for item in items
     ]

{data_annotations-2.5.0 → data_annotations-2.7.0}/src/data_annotations/annotations/__init__.py RENAMED Viewed

@@ -1,3 +1,10 @@
+from .answers import (
+    AnswersError,
+    DirectoryAnswers,
+    FileAnswers,
+    load_directory_answers,
+    load_file_answers,
+)
 from .models import (
     DirectoryAnnotationDocument,
     DirectoryAnnotationResult,
@@ -17,13 +24,18 @@ from .writers import (
 __all__ = [
     "annotate_directory",
     "annotate_file",
+    "load_directory_answers",
+    "load_file_answers",
     "record_directory_annotation",
     "record_file_annotation",
     "write_directory_annotation",
     "write_file_annotation",
+    "AnswersError",
+    "DirectoryAnswers",
     "DirectoryAnnotationDocument",
     "DirectoryAnnotationResult",
     "DirectoryArtifactSubject",
+    "FileAnswers",
     "FileAnnotationDocument",
     "FileAnnotationResult",
     "FileArtifactSubject",

data-annotations 2.5.0__tar.gz → 2.7.0__tar.gz

data-annotations 2.5.0__tar.gz → 2.7.0__tar.gz