ml-datarefinery 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. datarefinery/__init__.py +9 -0
  2. datarefinery/__main__.py +8 -0
  3. datarefinery/cache/__init__.py +2 -0
  4. datarefinery/cache/atomic.py +89 -0
  5. datarefinery/cache/cleaner.py +140 -0
  6. datarefinery/cache/identity.py +54 -0
  7. datarefinery/cache/layout.py +83 -0
  8. datarefinery/cli/__init__.py +2 -0
  9. datarefinery/cli/_exit_codes.py +56 -0
  10. datarefinery/cli/app.py +191 -0
  11. datarefinery/cli/commands/__init__.py +8 -0
  12. datarefinery/cli/commands/check_cmd.py +93 -0
  13. datarefinery/cli/commands/clean_cmd.py +147 -0
  14. datarefinery/cli/commands/init_cmd.py +80 -0
  15. datarefinery/cli/commands/inspect_cmd.py +173 -0
  16. datarefinery/cli/commands/materialize_cmd.py +166 -0
  17. datarefinery/cli/commands/report_cmd.py +73 -0
  18. datarefinery/cli/commands/status_cmd.py +167 -0
  19. datarefinery/cli/commands/validate_cmd.py +92 -0
  20. datarefinery/core/__init__.py +2 -0
  21. datarefinery/core/check.py +181 -0
  22. datarefinery/core/config.py +107 -0
  23. datarefinery/core/datarefinery.py +311 -0
  24. datarefinery/core/errors.py +41 -0
  25. datarefinery/core/inspect.py +160 -0
  26. datarefinery/core/instance.py +101 -0
  27. datarefinery/core/status.py +92 -0
  28. datarefinery/logging.py +106 -0
  29. datarefinery/pipeline/__init__.py +2 -0
  30. datarefinery/pipeline/contracts.py +345 -0
  31. datarefinery/pipeline/fitted_stats.py +135 -0
  32. datarefinery/pipeline/inputs.py +439 -0
  33. datarefinery/pipeline/manifest.py +72 -0
  34. datarefinery/pipeline/runner.py +490 -0
  35. datarefinery/pipeline/stages/__init__.py +2 -0
  36. datarefinery/pipeline/stages/augmentations.py +98 -0
  37. datarefinery/pipeline/stages/featurizations.py +157 -0
  38. datarefinery/pipeline/stages/filters.py +159 -0
  39. datarefinery/pipeline/stages/generation.py +130 -0
  40. datarefinery/pipeline/stages/splits.py +303 -0
  41. datarefinery/pipeline/stages/transformations.py +145 -0
  42. datarefinery/pipeline/stages/visualizations.py +112 -0
  43. datarefinery/pipeline/workers.py +114 -0
  44. datarefinery/plugins/__init__.py +2 -0
  45. datarefinery/plugins/base.py +55 -0
  46. datarefinery/plugins/discovery.py +95 -0
  47. datarefinery/plugins/image_classification/__init__.py +9 -0
  48. datarefinery/plugins/image_classification/operations/__init__.py +2 -0
  49. datarefinery/plugins/image_classification/operations/featurizations.py +137 -0
  50. datarefinery/plugins/image_classification/operations/filters.py +79 -0
  51. datarefinery/plugins/image_classification/operations/generation.py +58 -0
  52. datarefinery/plugins/image_classification/operations/transformations.py +222 -0
  53. datarefinery/plugins/image_classification/operations/visualizations.py +231 -0
  54. datarefinery/plugins/image_classification/plugin.py +227 -0
  55. datarefinery/plugins/tabular/__init__.py +6 -0
  56. datarefinery/plugins/tabular/plugin.py +142 -0
  57. datarefinery/plugins/text/__init__.py +6 -0
  58. datarefinery/plugins/text/plugin.py +147 -0
  59. datarefinery/py.typed +0 -0
  60. datarefinery/recipe/__init__.py +2 -0
  61. datarefinery/recipe/canonical.py +37 -0
  62. datarefinery/recipe/loader.py +111 -0
  63. datarefinery/recipe/models.py +214 -0
  64. datarefinery/recipe/validator.py +1036 -0
  65. datarefinery/recipe/variants.py +49 -0
  66. datarefinery/reporting/__init__.py +2 -0
  67. datarefinery/reporting/drift.py +122 -0
  68. datarefinery/reporting/report.py +250 -0
  69. datarefinery/reporting/visualizations.py +51 -0
  70. datarefinery/scaffolder/__init__.py +2 -0
  71. datarefinery/scaffolder/init.py +239 -0
  72. datarefinery/scaffolder/llm.py +77 -0
  73. ml_datarefinery-0.9.4.dist-info/METADATA +498 -0
  74. ml_datarefinery-0.9.4.dist-info/RECORD +77 -0
  75. ml_datarefinery-0.9.4.dist-info/WHEEL +4 -0
  76. ml_datarefinery-0.9.4.dist-info/entry_points.txt +7 -0
  77. ml_datarefinery-0.9.4.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,9 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ __version__ = "0.9.4"
5
+
6
+ from datarefinery.core.datarefinery import DataRefinery, materialize
7
+ from datarefinery.core.instance import Instance
8
+
9
+ __all__ = ["DataRefinery", "Instance", "__version__", "materialize"]
@@ -0,0 +1,8 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Entry point for ``python -m datarefinery``."""
4
+
5
+ from datarefinery.cli.app import main_entry
6
+
7
+ if __name__ == "__main__":
8
+ main_entry()
@@ -0,0 +1,2 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,89 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """FR-5 atomic temp-then-promote and `FAILED` marker.
4
+
5
+ `atomic_promote(temp, final)` uses `os.replace` to swap a fully populated
6
+ temp directory into its final location atomically. A cross-device
7
+ mismatch is caught up-front so the EXDEV failure surfaces with a
8
+ "same-filesystem" message rather than deep inside the runner. On
9
+ failure, `mark_failed(temp, exc, stage)` writes a JSON `FAILED` marker
10
+ into the temp dir capturing the stage, exception type, message, and
11
+ traceback for diagnostic recovery.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ import traceback
19
+ from datetime import UTC, datetime
20
+ from pathlib import Path
21
+
22
+ from datarefinery.core.errors import MaterializeError
23
+
24
+ FAILED_MARKER = "FAILED"
25
+
26
+
27
def _device_id(path: Path) -> int:
    """Look up the `st_dev` device number reported by `os.stat` for `path`.

    Kept as its own tiny function so tests can monkey-patch the
    cross-device guard.
    """
    stat_result = os.stat(path)
    return stat_result.st_dev
30
+
31
+
32
def atomic_promote(temp_dir: Path, final_dir: Path) -> None:
    """Swap a fully populated `temp_dir` into place at `final_dir` via `os.replace`.

    `final_dir.parent` is created (with any missing ancestors) before the
    rename. The device ids of `temp_dir.parent` and `final_dir.parent` are
    compared first so that a cross-filesystem promote is reported with a
    "same-filesystem" message instead of a raw `EXDEV` from the rename.

    Raises:
        MaterializeError: `temp_dir` is not a directory, the two parent
            directories report different `st_dev` values, or `os.replace`
            itself raises `OSError`.
    """
    if not temp_dir.is_dir():
        raise MaterializeError(f"temp dir does not exist: {temp_dir}")

    destination_parent = final_dir.parent
    destination_parent.mkdir(parents=True, exist_ok=True)

    # Compare devices up front; os.replace cannot cross filesystems.
    source_device = _device_id(temp_dir.parent)
    destination_device = _device_id(destination_parent)
    if source_device != destination_device:
        message = (
            f"cannot atomically promote across filesystems: "
            f"temp_dir={temp_dir} (st_dev={source_device}), "
            f"final_dir={final_dir} (st_dev={destination_device}). "
            f"DataRefinery requires the cache root and the temp dir to "
            f"share a filesystem; configure --cache-root accordingly."
        )
        raise MaterializeError(message)

    try:
        os.replace(temp_dir, final_dir)
    except OSError as rename_error:
        raise MaterializeError(
            f"atomic promote failed for {temp_dir} -> {final_dir}: {rename_error}"
        ) from rename_error
64
+
65
+
66
+ def mark_failed(temp_dir: Path, exc: BaseException, stage: str) -> None:
67
+ """Write a `FAILED` JSON marker into `temp_dir` capturing the failure context.
68
+
69
+ No-op when `temp_dir` does not exist (e.g., it was already promoted
70
+ or deleted before the runner caught the failure). The marker is a
71
+ diagnostic artifact; it never blocks failure propagation in the
72
+ runner.
73
+ """
74
+ if not temp_dir.is_dir():
75
+ return
76
+
77
+ payload = {
78
+ "stage": stage,
79
+ "exc_type": type(exc).__name__,
80
+ "message": str(exc),
81
+ "traceback": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
82
+ "marked_at": (datetime.now(UTC).isoformat().replace("+00:00", "Z")),
83
+ }
84
+
85
+ marker = temp_dir / FAILED_MARKER
86
+ marker.write_text(
87
+ json.dumps(payload, indent=2, sort_keys=True),
88
+ encoding="utf-8",
89
+ )
@@ -0,0 +1,140 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """FR-21 cache cleaner: library API only.
4
+
5
+ The CLI verb wraps this in Phase D. The library here exposes
6
+ `CleanSelector` plus `clean(cache_root, selector, *, force=False)`. The
7
+ selector is intersection-style across the `by_*` filters; `orphans` adds
8
+ old temp dirs to the target set; `all=True` requires `force=True` and
9
+ clears every direct child of `<cache-root>/instances/`.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import shutil
15
+ import time
16
+ from collections.abc import Iterable
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+
20
+ from datarefinery.cache.layout import (
21
+ TMP_DIR_NAME,
22
+ instances_root,
23
+ )
24
+ from datarefinery.core.errors import CacheError
25
+
26
+
27
+ @dataclass(frozen=True, slots=True)
28
+ class CleanSelector:
29
+ """Declarative selector for `clean(...)`."""
30
+
31
+ by_recipe_hash: str | None = None
32
+ by_input_hash: str | None = None
33
+ by_seed: int | None = None
34
+ by_age_days: float | None = None
35
+ orphans: bool = False
36
+ orphan_age_days: float = 1.0
37
+ all: bool = False
38
+
39
+
40
+ @dataclass(frozen=True, slots=True)
41
+ class CleanReport:
42
+ removed: tuple[Path, ...]
43
+ skipped: tuple[tuple[Path, str], ...]
44
+
45
+
46
def clean(
    cache_root: Path,
    selector: CleanSelector,
    *,
    force: bool = False,
) -> CleanReport:
    """Delete the cache entries that `selector` picks out.

    With `selector.all=True` every direct child of
    `<cache-root>/instances/` is removed (the `.tmp/` orphans dir
    included); this mode refuses to run unless `force=True`. Otherwise
    the `by_*` filters are applied one after another, each shrinking the
    surviving candidate list, and `orphans=True` separately adds temp
    dirs older than `orphan_age_days` to the removal list.

    Raises:
        CacheError: `selector.all` is set without `force=True`.
    """
    if selector.all:
        if force:
            return _clean_everything(cache_root)
        raise CacheError("clean(all=True) requires force=True")

    root = instances_root(cache_root)
    if not root.is_dir():
        return CleanReport(removed=(), skipped=())

    doomed: list[Path] = []

    if selector.orphans:
        doomed.extend(_orphan_temp_dirs(cache_root, selector.orphan_age_days))

    requested_filters = (
        selector.by_recipe_hash,
        selector.by_input_hash,
        selector.by_seed,
        selector.by_age_days,
    )
    if any(value is not None for value in requested_filters):
        matches = list(_iter_instance_dirs(root))
        if selector.by_recipe_hash is not None:
            # Instance dirs are <recipe16>/<input16>/<seed>; recipe is two levels up.
            recipe_shard = selector.by_recipe_hash[:16]
            matches = [d for d in matches if d.parent.parent.name == recipe_shard]
        if selector.by_input_hash is not None:
            input_shard = selector.by_input_hash[:16]
            matches = [d for d in matches if d.parent.name == input_shard]
        if selector.by_seed is not None:
            matches = [d for d in matches if d.name == str(selector.by_seed)]
        if selector.by_age_days is not None:
            oldest_allowed = time.time() - selector.by_age_days * 86400
            matches = [d for d in matches if d.stat().st_mtime < oldest_allowed]
        doomed.extend(matches)

    return _remove_paths(doomed)
96
+
97
+
98
def _iter_instance_dirs(instances: Path) -> Iterable[Path]:
    """Lazily walk `instances` and yield each `<recipe>/<input>/<seed>/` directory.

    Top-level entries whose name starts with a dot (such as `.tmp/`) are
    ignored, as is anything that is not a directory at any of the three levels.
    """
    recipe_shards = (
        entry
        for entry in instances.iterdir()
        if not entry.name.startswith(".") and entry.is_dir()
    )
    for recipe_shard in recipe_shards:
        for input_shard in recipe_shard.iterdir():
            if input_shard.is_dir():
                yield from (seed for seed in input_shard.iterdir() if seed.is_dir())
109
+
110
+
111
def _orphan_temp_dirs(cache_root: Path, age_days: float) -> Iterable[Path]:
    """Yield temp run directories whose mtime is more than `age_days` days old.

    Yields nothing when the temp root under the instances directory is absent.
    """
    tmp_root = instances_root(cache_root) / TMP_DIR_NAME
    if tmp_root.is_dir():
        threshold = time.time() - age_days * 86400
        for child in tmp_root.iterdir():
            if not child.is_dir():
                continue
            if child.stat().st_mtime < threshold:
                yield child
119
+
120
+
121
def _clean_everything(cache_root: Path) -> CleanReport:
    """Remove every direct child of the instances root and report the result.

    Returns an empty report when the instances root is not a directory.
    """
    root = instances_root(cache_root)
    if root.is_dir():
        children = list(root.iterdir())
        return _remove_paths(children)
    return CleanReport(removed=(), skipped=())
126
+
127
+
128
def _remove_paths(paths: list[Path]) -> CleanReport:
    """Delete each path and collect successes and failures into a `CleanReport`.

    Real directories are removed recursively with `shutil.rmtree`; regular
    files and symlinks are unlinked. A symlink that points at a directory is
    unlinked rather than handed to `shutil.rmtree` (which refuses symlinks),
    so only the link disappears and its target is left untouched.

    Any `OSError` raised while removing a path is recorded as
    `(path, str(error))` in `skipped`; processing continues with the
    remaining paths.
    """
    removed: list[Path] = []
    skipped: list[tuple[Path, str]] = []
    for path in paths:
        try:
            # `is_dir()` follows symlinks, so rule out links before rmtree.
            if path.is_symlink() or not path.is_dir():
                path.unlink()
            else:
                shutil.rmtree(path)
        except OSError as exc:
            skipped.append((path, str(exc)))
        else:
            removed.append(path)
    return CleanReport(removed=tuple(removed), skipped=tuple(skipped))
@@ -0,0 +1,54 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """FR-4 cache identity: `CacheKey` + `compute_cache_key`.
4
+
5
+ The cache key is the triple (recipe_hash, input_hash, seed). Cache
6
+ directory paths use only the first 16 hex characters of `recipe_hash`
7
+ and `input_hash` (`.short` exposes the recipe shard); the full hash is recorded in
8
+ `manifest.json`. See `project-essentials.md` "Cache identity is the
9
+ reproducibility contract - invalidations are ceremonious."
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import hashlib
15
+ from collections.abc import Mapping
16
+ from dataclasses import dataclass
17
+
18
+ from datarefinery.recipe.canonical import to_canonical_bytes
19
+ from datarefinery.recipe.models import Recipe
20
+
21
+
22
+ @dataclass(frozen=True, slots=True)
23
+ class CacheKey:
24
+ """Identity tuple for a materialized instance."""
25
+
26
+ recipe_hash: str
27
+ input_hash: str
28
+ seed: int
29
+
30
+ @property
31
+ def short(self) -> str:
32
+ """First 16 hex characters of `recipe_hash` (cache directory shard)."""
33
+ return self.recipe_hash[:16]
34
+
35
+
36
def compute_cache_key(
    recipe: Recipe,
    raw_input_hashes: Mapping[str, str],
    seed: int,
) -> CacheKey:
    """Derive the `CacheKey` for a (recipe, inputs, seed) triple.

    `recipe_hash` is the SHA-256 hex digest of the recipe's canonical
    bytes. `input_hash` is the SHA-256 hex digest of the UTF-8 encoded
    `name=hash;` fragments taken from `raw_input_hashes`, visited in
    sorted source-name order so the mapping's own ordering does not
    affect the result.
    """
    recipe_digest = hashlib.sha256(to_canonical_bytes(recipe)).hexdigest()

    # Feed the fragments incrementally; the digest equals hashing their concatenation.
    inputs_hasher = hashlib.sha256()
    for source_name in sorted(raw_input_hashes):
        fragment = f"{source_name}={raw_input_hashes[source_name]};"
        inputs_hasher.update(fragment.encode("utf-8"))

    return CacheKey(
        recipe_hash=recipe_digest,
        input_hash=inputs_hasher.hexdigest(),
        seed=seed,
    )
@@ -0,0 +1,83 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Cache directory layout helpers under `<cache-root>/instances/...`.
4
+
5
+ Layout (per tech-spec):
6
+
7
+ <cache-root>/
8
+ └── instances/
9
+ ├── .tmp/<run-id>/ # in-flight runs; promoted via os.replace
10
+ └── <recipe-hash16>/<input-hash16>/<seed>/
11
+ ├── manifest.json
12
+ ├── dataset/
13
+ ├── fitted_statistics/
14
+ └── report/
15
+ ├── report.md
16
+ ├── drift.json
17
+ └── visualizations/
18
+
19
+ The 16-char shards come from `CacheKey.short` (recipe) and the first 16
20
+ chars of `input_hash`; the full hashes are recorded in `manifest.json`.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import secrets
26
+ from datetime import UTC, datetime
27
+ from pathlib import Path
28
+
29
+ from datarefinery.cache.identity import CacheKey
30
+
31
# Directory and file names that make up the on-disk cache layout.
INSTANCES_DIR = "instances"
TMP_DIR_NAME = ".tmp"
MANIFEST_FILE = "manifest.json"
RECIPE_FILE = "recipe.json"
DATASET_SUBDIR = "dataset"
FITTED_STATS_SUBDIR = "fitted_statistics"
REPORT_SUBDIR = "report"


def instances_root(cache_root: Path) -> Path:
    """Directory under `cache_root` that holds every instance and temp dir."""
    return cache_root.joinpath(INSTANCES_DIR)


def instance_dir(cache_root: Path, key: CacheKey) -> Path:
    """Final location of the instance identified by `key`.

    The path is `<instances>/<recipe16>/<input16>/<seed>`, where the two
    shards are the first 16 characters of the respective hashes.
    """
    recipe_shard = key.recipe_hash[:16]
    input_shard = key.input_hash[:16]
    return instances_root(cache_root).joinpath(recipe_shard, input_shard, str(key.seed))


def tmp_dir(cache_root: Path, run_id: str) -> Path:
    """Scratch directory for the in-flight run `run_id`, later promoted via os.replace."""
    return instances_root(cache_root).joinpath(TMP_DIR_NAME, run_id)


def manifest_path(instance: Path) -> Path:
    """Path of the manifest file inside `instance`."""
    return instance.joinpath(MANIFEST_FILE)


def recipe_path(instance: Path) -> Path:
    """Path of the recipe file inside `instance`."""
    return instance.joinpath(RECIPE_FILE)


def dataset_dir(instance: Path) -> Path:
    """Path of the dataset subdirectory inside `instance`."""
    return instance.joinpath(DATASET_SUBDIR)


def fitted_stats_dir(instance: Path) -> Path:
    """Path of the fitted-statistics subdirectory inside `instance`."""
    return instance.joinpath(FITTED_STATS_SUBDIR)


def report_dir(instance: Path) -> Path:
    """Path of the report subdirectory inside `instance`."""
    return instance.joinpath(REPORT_SUBDIR)
73
+
74
+
75
+ def make_run_id() -> str:
76
+ """Return `<utc_iso_compact>-<8hex>`, e.g. `20260507T143022Z-deadbeef`.
77
+
78
+ Lexicographically sortable down to the second; the 8-hex random
79
+ suffix makes outputs unique under burst/concurrent calls within the
80
+ same second.
81
+ """
82
+ stamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
83
+ return f"{stamp}-{secrets.token_hex(4)}"
@@ -0,0 +1,2 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,56 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Exception-to-exit-code mapping for the DataRefinery CLI.
4
+
5
+ Per tech-spec:
6
+
7
+ | code | meaning |
8
+ |------|---------------------------------------------------------------|
9
+ | 0 | success |
10
+ | 1 | user/recipe error (Recipe, Validation, Contract, Materialize) |
11
+ | 2 | system error (Plugin, Cache, environment, uncaught) |
12
+ | 130 | SIGINT / Ctrl-C |
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from datarefinery.core.errors import (
18
+ CacheError,
19
+ ContractError,
20
+ DataRefineryError,
21
+ MaterializeError,
22
+ PluginError,
23
+ RecipeError,
24
+ ValidationError,
25
+ )
26
+
27
# Process exit codes used by the CLI (see the table in the module docstring).
EXIT_OK = 0
EXIT_USER = 1
EXIT_SYSTEM = 2
EXIT_INTERRUPT = 130

# Exception types that `exit_code_for` maps to EXIT_USER.
_USER_ERROR_TYPES: tuple[type[DataRefineryError], ...] = (
    RecipeError,
    ValidationError,
    ContractError,
    MaterializeError,
)
# Exception types that `exit_code_for` maps to EXIT_SYSTEM.
_SYSTEM_ERROR_TYPES: tuple[type[DataRefineryError], ...] = (
    PluginError,
    CacheError,
)
42
+
43
+
44
def exit_code_for(exc: BaseException) -> int:
    """Translate `exc` into the CLI exit code documented for it.

    Rules are tried in order and the first match wins: Ctrl-C, the
    explicit user-error types, the explicit system-error types, then any
    other `DataRefineryError`. Anything else is a system error.
    """
    ordered_rules = (
        (KeyboardInterrupt, EXIT_INTERRUPT),
        (_USER_ERROR_TYPES, EXIT_USER),
        (_SYSTEM_ERROR_TYPES, EXIT_SYSTEM),
        # Unknown future DataRefineryError subclass — treat as user-facing
        # by default; widen the explicit tuples when adding new types.
        (DataRefineryError, EXIT_USER),
    )
    for exc_types, code in ordered_rules:
        if isinstance(exc, exc_types):
            return code
    return EXIT_SYSTEM
@@ -0,0 +1,191 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Typer CLI entry point for DataRefinery."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import Annotated
10
+
11
+ import click
12
+ import typer
13
+ from rich.console import Console
14
+ from rich.panel import Panel
15
+
16
+ from datarefinery import __version__
17
+ from datarefinery.cli._exit_codes import (
18
+ EXIT_INTERRUPT,
19
+ EXIT_OK,
20
+ EXIT_SYSTEM,
21
+ exit_code_for,
22
+ )
23
+ from datarefinery.cli.commands.check_cmd import check as check_cmd
24
+ from datarefinery.cli.commands.clean_cmd import clean_command as clean_cmd
25
+ from datarefinery.cli.commands.init_cmd import init as init_cmd
26
+ from datarefinery.cli.commands.inspect_cmd import inspect as inspect_cmd
27
+ from datarefinery.cli.commands.materialize_cmd import materialize as materialize_cmd
28
+ from datarefinery.cli.commands.report_cmd import report as report_cmd
29
+ from datarefinery.cli.commands.status_cmd import status as status_cmd
30
+ from datarefinery.cli.commands.validate_cmd import validate as validate_cmd
31
+ from datarefinery.core.config import RuntimeConfig
32
+ from datarefinery.core.errors import DataRefineryError
33
+ from datarefinery.logging import get_logger
34
+
35
# Root Typer application; subcommands are registered on it further down this module.
app = typer.Typer(
    name="datarefinery",
    help="DataRefinery — recipe-driven data preparation and caching for ML.",
    # Running with no arguments invokes the callback instead of printing help.
    no_args_is_help=False,
    # Do not add the shell-completion install/show options.
    add_completion=False,
)
41
+
42
+
43
def _version_callback(value: bool) -> None:
    """Eager `--version` handler: echo the package version and stop via `typer.Exit`.

    Does nothing when the flag was not supplied.
    """
    if not value:
        return
    typer.echo(__version__)
    raise typer.Exit()
47
+
48
+
49
@app.callback(invoke_without_command=True)
def main(
    ctx: typer.Context,
    version: Annotated[
        bool,
        typer.Option(
            "--version",
            callback=_version_callback,
            is_eager=True,
            help="Show the package version and exit.",
        ),
    ] = False,
    cache_root: Annotated[
        Path | None,
        typer.Option(
            "--cache-root",
            help="Root directory for the cache (env: DATAREFINERY_CACHE_ROOT).",
        ),
    ] = None,
    log_level: Annotated[
        str | None,
        typer.Option(
            "--log-level",
            help="Log level (env: DATAREFINERY_LOG_LEVEL).",
        ),
    ] = None,
    log_target: Annotated[
        str | None,
        typer.Option(
            "--log-target",
            help="Log routing target; reserved no-op stub (env: DATAREFINERY_LOG_TARGET).",
        ),
    ] = None,
    plugin_path: Annotated[
        list[Path] | None,
        typer.Option(
            "--plugin-path",
            help="Extra plugin discovery path; repeatable "
            "(env: DATAREFINERY_PLUGIN_PATH, PATH-style).",
        ),
    ] = None,
    workers: Annotated[
        int | None,
        typer.Option(
            "--workers",
            help="Process pool worker count (env: DATAREFINERY_WORKERS).",
        ),
    ] = None,
    seed: Annotated[
        int | None,
        typer.Option(
            "--seed",
            help="Override the recipe-declared seed (changes cache identity).",
        ),
    ] = None,
    variant: Annotated[
        str | None,
        typer.Option("--variant", help="Recipe variant to apply before canonicalization."),
    ] = None,
    no_color: Annotated[
        bool,
        typer.Option("--no-color", help="Disable colored output."),
    ] = False,
    quiet: Annotated[
        bool,
        typer.Option("--quiet", "-q", help="Suppress non-essential output."),
    ] = False,
    verbose: Annotated[
        bool,
        typer.Option("--verbose", "-v", help="Verbose output."),
    ] = False,
) -> None:
    """DataRefinery — recipe-driven data preparation and caching for ML."""
    # Global-options callback: runs before any subcommand (and also when no
    # subcommand is given, because of invoke_without_command=True).
    # `version` is not read here; its eager callback handles the flag.
    config = RuntimeConfig.resolve(
        cache_root=cache_root,
        log_level=log_level,
        log_target=log_target,
        plugin_path=plugin_path,
        workers=workers,
    )
    # Stash the resolved config and the remaining global flags on the
    # context object (a dict) under fixed string keys.
    state = ctx.ensure_object(dict)
    state["config"] = config
    state["seed"] = seed
    state["variant"] = variant
    state["no_color"] = no_color
    state["quiet"] = quiet
    state["verbose"] = verbose

    # Return value is discarded; presumably called for its logger-setup
    # side effect — TODO confirm against datarefinery.logging.
    get_logger("cli")
    return None
139
+
140
+
141
# Register each subcommand on the root app. The verb implementations are
# imported from datarefinery.cli.commands.<verb>_cmd at the top of this module.
app.command("check", help="Report environment soundness (FR-18).")(check_cmd)
app.command("validate", help="Validate a recipe (FR-2).")(validate_cmd)
app.command("init", help="Scaffold a starter recipe from raw inputs (FR-17).")(init_cmd)
app.command(
    "materialize",
    help="Run the pipeline end-to-end against the recipe's inputs (FR-3).",
)(materialize_cmd)
app.command(
    "status",
    help="Summarize a materialized instance or resolve a recipe to one (FR-19).",
)(status_cmd)
app.command(
    "report",
    help="Re-render report.md, drift.json, and reporting visualizations (FR-15).",
)(report_cmd)
app.command(
    "inspect",
    help="Read-only views of a materialized instance (FR-20).",
)(inspect_cmd)
app.command(
    "clean",
    help="Remove cached instances and orphan temp directories (FR-21).",
)(clean_cmd)
164
+
165
+
166
def _render_error(message: str, *, title: str) -> None:
    """Print `message` to stderr inside a red-bordered, non-expanding panel titled `title`."""
    stderr_console = Console(stderr=True)
    panel = Panel(message, title=title, border_style="red", expand=False)
    stderr_console.print(panel)
168
+
169
+
170
def main_entry() -> None:
    """Console-script entry point with DataRefinery's exit-code mapping.

    Runs the Typer app with `standalone_mode=False` so exceptions reach
    this function, then always terminates the process via `sys.exit`:

    - usage errors are shown and exit with `EXIT_SYSTEM`;
    - other click exceptions are shown and exit with their own code;
    - abort / Ctrl-C exits with `EXIT_INTERRUPT`;
    - `DataRefineryError` exits with the code from `exit_code_for`;
    - any other exception is reported as an internal error (`EXIT_SYSTEM`).

    In non-standalone mode click does not re-raise an `Exit` raised inside
    a command; it returns the exit code from the app call instead. That
    returned integer is honored here so a command's non-zero `Exit` code
    is not replaced by `EXIT_OK`.
    """
    try:
        result = app(standalone_mode=False)
    except click.exceptions.Exit as exc:
        sys.exit(exc.exit_code)
    except click.exceptions.UsageError as exc:
        exc.show()
        sys.exit(EXIT_SYSTEM)
    except click.exceptions.ClickException as exc:
        exc.show()
        sys.exit(exc.exit_code)
    except (click.exceptions.Abort, KeyboardInterrupt):
        _render_error("Interrupted.", title="Aborted")
        sys.exit(EXIT_INTERRUPT)
    except DataRefineryError as exc:
        _render_error(str(exc) or type(exc).__name__, title=type(exc).__name__)
        sys.exit(exit_code_for(exc))
    except Exception as exc:
        _render_error(f"{type(exc).__name__}: {exc}", title="Internal Error")
        sys.exit(EXIT_SYSTEM)

    # An int result is the code of an `Exit` swallowed by click. bool is
    # excluded so a command returning True/False is not read as a status.
    if isinstance(result, int) and not isinstance(result, bool):
        sys.exit(result)
    sys.exit(EXIT_OK)
@@ -0,0 +1,8 @@
1
+ # Copyright (c) 2026 Pointmatic
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Subcommand modules for the ``datarefinery`` CLI.
4
+
5
+ Each verb lives in its own ``<verb>_cmd.py`` so the verb name does not
6
+ collide with Python keyword-adjacent identifiers and stays readable in
7
+ import paths (per tech-spec).
8
+ """