PyPI - data-annotations - Versions diffs - 2.1.2__py3-none-any.whl - Mend

data-annotations 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

data_annotations/__init__.py +2 -0
data_annotations/_decorators.py +140 -0
data_annotations/annotations/__init__.py +30 -0
data_annotations/annotations/decorators.py +147 -0
data_annotations/annotations/models.py +45 -0
data_annotations/annotations/writers.py +368 -0
data_annotations/cli.py +37 -0
data_annotations/cli_app/__init__.py +1 -0
data_annotations/cli_app/annotate.py +307 -0
data_annotations/cli_app/common.py +276 -0
data_annotations/cli_app/prompts.py +534 -0
data_annotations/cli_app/provenance_commands.py +107 -0
data_annotations/description/__init__.py +37 -0
data_annotations/description/decorators.py +145 -0
data_annotations/description/models.py +63 -0
data_annotations/description/writers.py +321 -0
data_annotations/provenance/__init__.py +37 -0
data_annotations/provenance/decorators.py +111 -0
data_annotations/provenance/git.py +121 -0
data_annotations/provenance/models.py +50 -0
data_annotations/provenance/recovery.py +473 -0
data_annotations/provenance/runtime.py +248 -0
data_annotations/provenance/writers.py +206 -0
data_annotations-2.1.2.dist-info/METADATA +616 -0
data_annotations-2.1.2.dist-info/RECORD +28 -0
data_annotations-2.1.2.dist-info/WHEEL +4 -0
data_annotations-2.1.2.dist-info/entry_points.txt +3 -0
data_annotations-2.1.2.dist-info/licenses/LICENSE +28 -0

data_annotations/cli_app/prompts.py ADDED Viewed

@@ -0,0 +1,534 @@
+import re
+import shlex
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Any, Callable
+import questionary
+import typer
+from questionary import Choice
+from data_annotations.provenance.models import ArtifactKind
+from data_annotations.description import FieldDefinition
+from data_annotations.provenance import git as provenance_git
+from data_annotations.provenance import runtime as provenance_runtime
+from .common import (
+    _ARTIFACT_KINDS,
+    _UNKNOWN_SELECTION,
+    _collect_params,
+    _is_current_annotation_command,
+    _normalize_remote_url,
+    _parse_command_string,
+    _parse_param_entry,
+    _validate_artifact_kind,
+)
+def _ask_question(result: Any) -> Any:
+    if result is None:
+        raise typer.Abort()
+    return result
+def _questionary_text(
+    message: str,
+    *,
+    default: str = "",
+    instruction: str | None = None,
+) -> str:
+    return _ask_question(
+        questionary.text(
+            message,
+            default=default,
+            instruction=instruction,
+        ).ask()
+    )
+def _questionary_select(
+    message: str,
+    choices: Sequence[str | Choice],
+    *,
+    default: str | Choice | None = None,
+    instruction: str | None = None,
+) -> Any:
+    return _ask_question(
+        questionary.select(
+            message,
+            choices=choices,
+            default=default,
+            instruction=instruction,
+            use_search_filter=False,
+        ).ask()
+    )
+def _questionary_confirm(
+    message: str,
+    *,
+    default: bool = True,
+    instruction: str | None = None,
+) -> bool:
+    return bool(
+        _ask_question(
+            questionary.confirm(
+                message,
+                default=default,
+                instruction=instruction,
+            ).ask()
+        )
+    )
+def _questionary_checkbox(
+    message: str,
+    choices: list[str | Choice],
+    *,
+    instruction: str | None = None,
+) -> list[Any]:
+    return list(
+        _ask_question(
+            questionary.checkbox(
+                message,
+                choices=choices,
+                instruction=instruction,
+            ).ask()
+        )
+    )
+def _prompt_required_text(label: str, *, default: str | None = None) -> str:
+    while True:
+        value = _questionary_text(label, default=default or "").strip()
+        if value:
+            return value
+        typer.secho("Please enter a value.", err=True, fg=typer.colors.RED)
+        default = None
+def _prompt_optional_text(label: str, *, default: str | None = None) -> str | None:
+    value = _questionary_text(label, default=default or "").strip()
+    return value or None
+def _prompt_optional_text_no_default(label: str) -> str | None:
+    value = _questionary_text(label).strip()
+    return value or None
+def _prompt_artifact_kind(
+    label: str,
+    *,
+    default: ArtifactKind = "other",
+) -> ArtifactKind:
+    return _validate_artifact_kind(
+        str(
+            _questionary_select(
+                label,
+                _ARTIFACT_KINDS,
+                default=default,
+                instruction="Choose the artifact type.",
+            )
+        )
+    )
+def _prompt_optional_bool(label: str) -> bool | None:
+    value = _questionary_select(
+        label,
+        [
+            Choice("Unknown", value=_UNKNOWN_SELECTION),
+            Choice("Yes", value=True),
+            Choice("No", value=False),
+        ],
+        default=_UNKNOWN_SELECTION,
+        instruction="Choose a value or leave it unknown.",
+    )
+    return None if value == _UNKNOWN_SELECTION else value
+def _echo_provenance_prompt_intro() -> None:
+    typer.echo(
+        "\n"
+        "Provenance questions: optional fields may be skipped.\n"
+        "Where a detected value is available, you can reuse it or enter a custom one."
+    )
+def _prompt_override_text(label: str, *, current: str | None) -> str | None:
+    if not current:
+        return _prompt_optional_text_no_default(label)
+    action = _questionary_select(
+        label,
+        [
+            Choice("Leave unknown", value="unknown"),
+            Choice(f"Use detected value: {current}", value="detected"),
+            Choice("Enter custom value", value="custom"),
+        ],
+        default="unknown",
+        instruction="Select how to fill this field.",
+    )
+    if action == "unknown":
+        return None
+    if action == "detected":
+        return current
+    return _prompt_optional_text_no_default(f"{label} (custom value)")
+def _prompt_override_bool(label: str, *, current: bool | None) -> bool | None:
+    choices: list[Choice] = [Choice("Leave unknown", value=_UNKNOWN_SELECTION)]
+    if current is not None:
+        detected_label = "yes" if current else "no"
+        choices.append(
+            Choice(f"Use detected value: {detected_label}", value="detected")
+        )
+    choices.extend(
+        [
+            Choice("Yes", value=True),
+            Choice("No", value=False),
+        ]
+    )
+    value = _questionary_select(
+        label,
+        choices,
+        default=_UNKNOWN_SELECTION,
+        instruction="Select how to fill this field.",
+    )
+    if value == "detected":
+        return current
+    if value == _UNKNOWN_SELECTION:
+        return None
+    return value
+def _looks_like_script_path(token: str) -> bool:
+    if not token:
+        return False
+    if "/" in token or "\\" in token:
+        return True
+    if token.startswith("."):
+        return True
+    return Path(token).suffix.lower() in {
+        ".py",
+        ".ipynb",
+        ".sh",
+        ".bash",
+        ".zsh",
+        ".r",
+        ".rmd",
+    }
+def _unwrap_invocation_script_token(command: list[str]) -> str | None:
+    tokens = list(command)
+    if not tokens:
+        return None
+    if len(tokens) == 1:
+        return tokens[0]
+    if tokens[:2] == ["uv", "run"]:
+        tokens = tokens[2:]
+        if not tokens:
+            return None
+    runner = Path(tokens[0]).name
+    if re.fullmatch(r"python(\d+(\.\d+)?)?", runner):
+        index = 1
+        while index < len(tokens):
+            token = tokens[index]
+            if token == "-m":
+                return None
+            if token == "-c":
+                return None
+            if token.startswith("-"):
+                index += 1
+                continue
+            return token if _looks_like_script_path(token) else None
+        return None
+    if runner in {"bash", "sh", "zsh", "Rscript"}:
+        index = 1
+        while index < len(tokens):
+            token = tokens[index]
+            if token.startswith("-"):
+                index += 1
+                continue
+            return token if _looks_like_script_path(token) else None
+        return None
+    return tokens[0] if _looks_like_script_path(tokens[0]) else None
+def _infer_invocation_provenance(
+    value: str | None,
+    *,
+    infer_script_repo_path_fn: Callable[[str | Path | None], str | None] | None = None,
+) -> tuple[list[str], str | None, str | None]:
+    if value is None:
+        return [], None, None
+    stripped = value.strip()
+    if not stripped:
+        return [], None, None
+    command = _parse_command_string(stripped)
+    if not command:
+        return [], None, None
+    script = _unwrap_invocation_script_token(command)
+    infer_script_repo_path_fn = (
+        provenance_runtime.infer_script_repo_path
+        if infer_script_repo_path_fn is None
+        else infer_script_repo_path_fn
+    )
+    script_repo_path = infer_script_repo_path_fn(script) if script is not None else None
+    return command, script, script_repo_path
+def _prompt_invocation_override(
+    *,
+    current_command: list[str],
+    current_script: str | None,
+    current_script_repo_path: str | None,
+) -> tuple[list[str], str | None, str | None]:
+    current_command_text = shlex.join(current_command) if current_command else None
+    if current_command_text is None:
+        return _infer_invocation_provenance(
+            _prompt_optional_text_no_default(
+                "Original generating command or script path"
+            )
+        )
+    action = _questionary_select(
+        "Original generating command or script path",
+        [
+            Choice("Leave unknown", value="unknown"),
+            Choice(f"Use detected value: {current_command_text}", value="detected"),
+            Choice("Enter custom value", value="custom"),
+        ],
+        default="unknown",
+        instruction="Select how to fill this field.",
+    )
+    if action == "unknown":
+        return [], None, None
+    if action == "detected":
+        return current_command, current_script, current_script_repo_path
+    return _infer_invocation_provenance(
+        _prompt_optional_text_no_default(
+            "Original generating command or script path (custom value)"
+        )
+    )
+def _prompt_directory_file_selection(candidates: list[Path]) -> list[Path]:
+    if not candidates:
+        return []
+    selected = _questionary_checkbox(
+        "Select files to include in this directory annotation",
+        [
+            Choice(candidate.name, value=str(candidate), checked=True)
+            for candidate in candidates
+        ],
+        instruction="Use space to toggle files, then press Enter to continue.",
+    )
+    selected_paths = {str(Path(value).resolve()) for value in selected}
+    return [
+        candidate
+        for candidate in candidates
+        if str(candidate.resolve()) in selected_paths
+    ]
+def _prompt_inputs() -> list[str]:
+    inputs: list[str] = []
+    while _questionary_confirm(
+        "Were there input files or directories used to generate this artifact?",
+        default=False,
+        instruction="(y/N) Inputs can be local paths or URIs. ",
+    ):
+        inputs.append(_prompt_required_text("Input path or URI"))
+    return inputs
+def _prompt_params() -> dict[str, Any]:
+    params: dict[str, Any] = {}
+    while _questionary_confirm(
+        "Were there function parameters used to generate this artifact? ",
+        default=False,
+        instruction="(y/N) Parameters should be entered as KEY=VALUE. ",
+    ):
+        key, parsed_value = _parse_param_entry(
+            _prompt_required_text("Generation parameter (KEY=VALUE)")
+        )
+        params[key] = parsed_value
+    return params
+def _prompt_primary_key(field_names: list[str]) -> list[str]:
+    if not field_names:
+        return []
+    return [
+        str(value)
+        for value in _questionary_checkbox(
+            "Select primary key fields",
+            [Choice(name, value=name) for name in field_names],
+            instruction="Leave everything unchecked if there is no primary key.",
+        )
+    ]
+def _prompt_missing_value_codes() -> dict[str, str]:
+    codes: dict[str, str] = {}
+    while _questionary_confirm(
+        "Add a missing value code?",
+        default=False,
+    ):
+        entry = _prompt_required_text("Missing value code (CODE=MEANING)")
+        if "=" not in entry:
+            typer.secho(
+                "Please enter missing value codes as CODE=MEANING.",
+                err=True,
+                fg=typer.colors.RED,
+            )
+            continue
+        code, meaning = entry.split("=", 1)
+        code = code.strip()
+        meaning = meaning.strip()
+        if not code or not meaning:
+            typer.secho(
+                "Both the missing value code and its meaning are required.",
+                err=True,
+                fg=typer.colors.RED,
+            )
+            continue
+        codes[code] = meaning
+    return codes
+def _prompt_field_definitions() -> list[FieldDefinition]:
+    fields: list[FieldDefinition] = []
+    while _questionary_confirm(
+        "Add a field definition?",
+        default=False,
+    ):
+        field_name = _prompt_required_text("Field name")
+        fields.append(
+            FieldDefinition(
+                name=field_name,
+                summary=_prompt_required_text(f"Summary for {field_name}"),
+                data_type=_prompt_optional_text(f"Data type for {field_name}"),
+                required=_prompt_optional_bool(f"Required for {field_name}"),
+                nullable=_prompt_optional_bool(f"Nullable for {field_name}"),
+            )
+        )
+    return fields
+def _prompt_schema_details() -> tuple[list[FieldDefinition], list[str], dict[str, str]]:
+    if not _questionary_confirm("Add field-level schema details?", default=False):
+        return [], [], {}
+    fields = _prompt_field_definitions()
+    primary_key = _prompt_primary_key([field.name for field in fields])
+    missing_value_codes = _prompt_missing_value_codes()
+    return fields, primary_key, missing_value_codes
+def _collect_post_hoc_provenance(
+    *,
+    input_values: list[str] | None,
+    param_values: list[str] | None,
+    script: str | None,
+    script_repo_path: str | None,
+    command: str | None,
+    function: str | None,
+    git_sha: str | None,
+    git_branch: str | None,
+    git_remote_name: str | None,
+    git_remote_url: str | None,
+    git_dirty: bool | None,
+) -> tuple[list[str], dict[str, Any], dict[str, Any]]:
+    """Assemble provenance from explicit values, prompting for what is missing.
+
+    Each argument that is supplied is used directly; each one left as ``None``
+    (or empty, for ``input_values`` / ``param_values``) is asked interactively,
+    offering values detected from the current runtime and git state where the
+    code passes one.
+
+    Returns:
+        ``(inputs, params, overrides)`` where ``overrides`` holds the keys
+        ``command``, ``script``, ``script_repo_path``, ``function``,
+        ``git_sha``, ``git_branch``, ``git_remote_name``, ``git_remote_url``
+        and ``git_dirty``.
+    """
+    runtime_info = provenance_runtime.capture_runtime_info()
+    git_info = provenance_git.capture_git_info()
+    _echo_provenance_prompt_intro()
+    overrides: dict[str, Any] = {}
+    current_command = runtime_info.get("command") or []
+    # Do not offer the annotation CLI's own invocation as the detected command.
+    if _is_current_annotation_command(current_command):
+        current_command = []
+    # Any explicit invocation argument disables the invocation prompt entirely;
+    # the three values are then taken from the arguments as given.
+    if script is not None or script_repo_path is not None or command is not None:
+        overrides["script"] = script
+        overrides["script_repo_path"] = script_repo_path
+        overrides["command"] = (
+            _parse_command_string(command) if command is not None else []
+        )
+    else:
+        (
+            overrides["command"],
+            overrides["script"],
+            overrides["script_repo_path"],
+        ) = _prompt_invocation_override(
+            current_command=current_command,
+            current_script=runtime_info.get("script"),
+            current_script_repo_path=runtime_info.get("script_repo_path"),
+        )
+    # No detected value is passed for the function, so this is a plain optional
+    # text prompt.
+    function_value: str | None = (
+        function
+        if function is not None
+        else _prompt_override_text(
+            "Generating function within the script", current=None
+        )
+    )
+    overrides["function"] = function_value
+    # An empty list counts as "not supplied" and triggers the prompts.
+    inputs = list(input_values) if input_values else _prompt_inputs()
+    params = _collect_params(param_values) if param_values else _prompt_params()
+    git_sha_value: str | None = (
+        git_sha
+        if git_sha is not None
+        else _prompt_override_text("Git commit SHA", current=git_info.get("git_sha"))
+    )
+    overrides["git_sha"] = git_sha_value
+    git_branch_value: str | None = (
+        git_branch
+        if git_branch is not None
+        else _prompt_override_text("Git branch", current=git_info.get("git_branch"))
+    )
+    overrides["git_branch"] = git_branch_value
+    git_remote_name_value: str | None = (
+        git_remote_name
+        if git_remote_name is not None
+        else _prompt_override_text(
+            "Git remote name",
+            current=git_info.get("git_remote_name"),
+        )
+    )
+    overrides["git_remote_name"] = git_remote_name_value
+    # The remote URL is passed through _normalize_remote_url whether it came
+    # from the argument or from the prompt.
+    git_remote_url_value: str | None
+    if git_remote_url is not None:
+        git_remote_url_value = _normalize_remote_url(git_remote_url)
+    else:
+        git_remote_url_value = _prompt_override_text(
+            "Git remote URL",
+            current=git_info.get("git_remote_url"),
+        )
+        if git_remote_url_value is not None:
+            git_remote_url_value = _normalize_remote_url(str(git_remote_url_value))
+    overrides["git_remote_url"] = git_remote_url_value
+    git_dirty_value: bool | None = (
+        git_dirty
+        if git_dirty is not None
+        else _prompt_override_bool("Git dirty state", current=git_info.get("git_dirty"))
+    )
+    overrides["git_dirty"] = git_dirty_value
+    return inputs, params, overrides

data_annotations/cli_app/provenance_commands.py ADDED Viewed

@@ -0,0 +1,107 @@
+import subprocess
+from pathlib import Path
+import typer
+from data_annotations.provenance import checkout_manifest_source
+from data_annotations.provenance import recovery as provenance_recovery
+from .common import (
+    _checkout_hint,
+    _echo_entries,
+    _error,
+    _match_target_path,
+    _missing_checkout_fields,
+    _resolve_manifest_path,
+    _resolved_path,
+)
+# Typer sub-application for the provenance commands registered on it in this
+# module ("match" and "checkout"). Shows help when invoked without arguments.
+provenance_app = typer.Typer(
+    no_args_is_help=True,
+    help="Inspect provenance recorded in annotation documents.",
+)
+@provenance_app.command("match")
+def match_command(
+    target: Path = typer.Argument(
+        ..., help="Artifact, directory, or annotation document path."
+    ),
+    manifest: Path | None = typer.Option(
+        None,
+        "--manifest",
+        help="Explicit annotation document path to use instead of auto-discovery.",
+    ),
+) -> None:
+    manifest_path = _resolve_manifest_path(target, manifest)
+    candidate_path = _match_target_path(target, manifest)
+    loaded_manifest = provenance_recovery._load_manifest(manifest_path)
+    match = provenance_recovery._analyze_artifact_match(candidate_path, loaded_manifest)
+    typer.echo(f"Target: {candidate_path}")
+    typer.echo(f"Manifest: {manifest_path}")
+    typer.echo(f"Result: {match.status.replace('_', ' ').upper()}")
+    _echo_entries("Verified entries", match.verified_entries)
+    _echo_entries("Missing tracked entries", match.missing_tracked_entries)
+    _echo_entries("Mismatched tracked entries", match.mismatched_tracked_entries)
+    _echo_entries("Extra entries", match.extra_entries)
+    _echo_entries("Unverifiable tracked entries", match.unverifiable_tracked_entries)
+    if match.status in {"match", "partial_match"}:
+        missing_checkout_fields = _missing_checkout_fields(loaded_manifest)
+        if missing_checkout_fields:
+            typer.echo(
+                "Checkout unavailable: manifest is missing "
+                + ", ".join(missing_checkout_fields)
+            )
+        else:
+            typer.echo("Next step:")
+            typer.echo(
+                "  "
+                + _checkout_hint(
+                    str(_resolved_path(target)),
+                    str(_resolved_path(manifest)) if manifest is not None else None,
+                )
+            )
+        return
+    raise typer.Exit(code=1)
+@provenance_app.command("checkout")
+def checkout_command(
+    target: Path = typer.Argument(
+        ..., help="Artifact, directory, or annotation document path."
+    ),
+    manifest: Path | None = typer.Option(
+        None,
+        "--manifest",
+        help="Explicit annotation document path to use instead of auto-discovery.",
+    ),
+    dest: Path | None = typer.Option(
+        None,
+        "--dest",
+        help="Optional checkout destination. Defaults to a stable user cache.",
+    ),
+) -> None:
+    manifest_path = _resolve_manifest_path(target, manifest)
+    try:
+        recovered = checkout_manifest_source(
+            manifest_path,
+            destination_dir=dest,
+        )
+    except ValueError as exc:
+        _error(str(exc), code=1)
+    except subprocess.CalledProcessError:
+        _error("failed to clone or checkout the recorded repository state", code=1)
+    typer.echo(f"Manifest: {manifest_path}")
+    typer.echo(f"Checkout path: {recovered.checkout_path}")
+    if recovered.script_path is not None:
+        typer.echo(f"Recovered script: {recovered.script_path}")
+    else:
+        typer.echo(
+            "Recovered repository checkout, but the generating script could not be resolved."
+        )

data_annotations/description/__init__.py ADDED Viewed

@@ -0,0 +1,37 @@
+from data_annotations.provenance.models import ArtifactKind
+from .models import (
+    AllowedValue,
+    ArtifactDescription,
+    DirectoryDescription,
+    DocumentedArtifact,
+    FieldDefinition,
+    FileDescription,
+)
+from .decorators import record_directory_description, record_file_description
+from .writers import (
+    render_directory_readme,
+    render_file_readme,
+    write_directory_description,
+    write_directory_readme,
+    write_file_description,
+    write_file_readme,
+)
+# Names exported by `from data_annotations.description import *`: the
+# ArtifactKind type re-exported from the provenance models, the description
+# models, the recording decorators, and the README/description writers.
+__all__ = [
+    "ArtifactKind",
+    "AllowedValue",
+    "ArtifactDescription",
+    "DirectoryDescription",
+    "DocumentedArtifact",
+    "FieldDefinition",
+    "FileDescription",
+    "record_directory_description",
+    "record_file_description",
+    "render_directory_readme",
+    "render_file_readme",
+    "write_directory_description",
+    "write_directory_readme",
+    "write_file_description",
+    "write_file_readme",
+]