ml-datarefinery 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datarefinery/__init__.py +9 -0
- datarefinery/__main__.py +8 -0
- datarefinery/cache/__init__.py +2 -0
- datarefinery/cache/atomic.py +89 -0
- datarefinery/cache/cleaner.py +140 -0
- datarefinery/cache/identity.py +54 -0
- datarefinery/cache/layout.py +83 -0
- datarefinery/cli/__init__.py +2 -0
- datarefinery/cli/_exit_codes.py +56 -0
- datarefinery/cli/app.py +191 -0
- datarefinery/cli/commands/__init__.py +8 -0
- datarefinery/cli/commands/check_cmd.py +93 -0
- datarefinery/cli/commands/clean_cmd.py +147 -0
- datarefinery/cli/commands/init_cmd.py +80 -0
- datarefinery/cli/commands/inspect_cmd.py +173 -0
- datarefinery/cli/commands/materialize_cmd.py +166 -0
- datarefinery/cli/commands/report_cmd.py +73 -0
- datarefinery/cli/commands/status_cmd.py +167 -0
- datarefinery/cli/commands/validate_cmd.py +92 -0
- datarefinery/core/__init__.py +2 -0
- datarefinery/core/check.py +181 -0
- datarefinery/core/config.py +107 -0
- datarefinery/core/datarefinery.py +311 -0
- datarefinery/core/errors.py +41 -0
- datarefinery/core/inspect.py +160 -0
- datarefinery/core/instance.py +101 -0
- datarefinery/core/status.py +92 -0
- datarefinery/logging.py +106 -0
- datarefinery/pipeline/__init__.py +2 -0
- datarefinery/pipeline/contracts.py +345 -0
- datarefinery/pipeline/fitted_stats.py +135 -0
- datarefinery/pipeline/inputs.py +439 -0
- datarefinery/pipeline/manifest.py +72 -0
- datarefinery/pipeline/runner.py +490 -0
- datarefinery/pipeline/stages/__init__.py +2 -0
- datarefinery/pipeline/stages/augmentations.py +98 -0
- datarefinery/pipeline/stages/featurizations.py +157 -0
- datarefinery/pipeline/stages/filters.py +159 -0
- datarefinery/pipeline/stages/generation.py +130 -0
- datarefinery/pipeline/stages/splits.py +303 -0
- datarefinery/pipeline/stages/transformations.py +145 -0
- datarefinery/pipeline/stages/visualizations.py +112 -0
- datarefinery/pipeline/workers.py +114 -0
- datarefinery/plugins/__init__.py +2 -0
- datarefinery/plugins/base.py +55 -0
- datarefinery/plugins/discovery.py +95 -0
- datarefinery/plugins/image_classification/__init__.py +9 -0
- datarefinery/plugins/image_classification/operations/__init__.py +2 -0
- datarefinery/plugins/image_classification/operations/featurizations.py +137 -0
- datarefinery/plugins/image_classification/operations/filters.py +79 -0
- datarefinery/plugins/image_classification/operations/generation.py +58 -0
- datarefinery/plugins/image_classification/operations/transformations.py +222 -0
- datarefinery/plugins/image_classification/operations/visualizations.py +231 -0
- datarefinery/plugins/image_classification/plugin.py +227 -0
- datarefinery/plugins/tabular/__init__.py +6 -0
- datarefinery/plugins/tabular/plugin.py +142 -0
- datarefinery/plugins/text/__init__.py +6 -0
- datarefinery/plugins/text/plugin.py +147 -0
- datarefinery/py.typed +0 -0
- datarefinery/recipe/__init__.py +2 -0
- datarefinery/recipe/canonical.py +37 -0
- datarefinery/recipe/loader.py +111 -0
- datarefinery/recipe/models.py +214 -0
- datarefinery/recipe/validator.py +1036 -0
- datarefinery/recipe/variants.py +49 -0
- datarefinery/reporting/__init__.py +2 -0
- datarefinery/reporting/drift.py +122 -0
- datarefinery/reporting/report.py +250 -0
- datarefinery/reporting/visualizations.py +51 -0
- datarefinery/scaffolder/__init__.py +2 -0
- datarefinery/scaffolder/init.py +239 -0
- datarefinery/scaffolder/llm.py +77 -0
- ml_datarefinery-0.9.4.dist-info/METADATA +498 -0
- ml_datarefinery-0.9.4.dist-info/RECORD +77 -0
- ml_datarefinery-0.9.4.dist-info/WHEEL +4 -0
- ml_datarefinery-0.9.4.dist-info/entry_points.txt +7 -0
- ml_datarefinery-0.9.4.dist-info/licenses/LICENSE +201 -0
datarefinery/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Copyright (c) 2026 Pointmatic
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
__version__ = "0.9.4"
|
|
5
|
+
|
|
6
|
+
from datarefinery.core.datarefinery import DataRefinery, materialize
|
|
7
|
+
from datarefinery.core.instance import Instance
|
|
8
|
+
|
|
9
|
+
__all__ = ["DataRefinery", "Instance", "__version__", "materialize"]
|
datarefinery/__main__.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# Copyright (c) 2026 Pointmatic
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""FR-5 atomic temp-then-promote and `FAILED` marker.
|
|
4
|
+
|
|
5
|
+
`atomic_promote(temp, final)` uses `os.replace` to swap a fully populated
|
|
6
|
+
temp directory into its final location atomically. A cross-device
|
|
7
|
+
mismatch is caught up-front so the EXDEV failure surfaces with a
|
|
8
|
+
"same-filesystem" message rather than deep inside the runner. On
|
|
9
|
+
failure, `mark_failed(temp, exc, stage)` writes a JSON `FAILED` marker
|
|
10
|
+
into the temp dir capturing the stage, exception type, message, and
|
|
11
|
+
traceback for diagnostic recovery.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
import traceback
|
|
19
|
+
from datetime import UTC, datetime
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
from datarefinery.core.errors import MaterializeError
|
|
23
|
+
|
|
24
|
+
FAILED_MARKER = "FAILED"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _device_id(path: Path) -> int:
|
|
28
|
+
"""Return `st_dev` for `path`. Wrapped so tests can monkey-patch the cross-device guard."""
|
|
29
|
+
return os.stat(path).st_dev
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def atomic_promote(temp_dir: Path, final_dir: Path) -> None:
    """Move a fully populated `temp_dir` into place as `final_dir` via `os.replace`.

    The parent of `final_dir` is created when missing. Before renaming, the
    devices of `temp_dir.parent` and `final_dir.parent` are compared so a
    cross-filesystem promotion is rejected with an actionable message instead
    of surfacing as `EXDEV` from `os.replace`.

    Raises:
        MaterializeError: `temp_dir` is not a directory, the two parents
            report different `st_dev` values, or `os.replace` raised `OSError`.
    """
    if not temp_dir.is_dir():
        raise MaterializeError(f"temp dir does not exist: {temp_dir}")

    target_parent = final_dir.parent
    target_parent.mkdir(parents=True, exist_ok=True)

    # Compare the directories that contain the source and the destination.
    temp_dev, final_dev = _device_id(temp_dir.parent), _device_id(target_parent)
    if temp_dev != final_dev:
        message = (
            f"cannot atomically promote across filesystems: "
            f"temp_dir={temp_dir} (st_dev={temp_dev}), "
            f"final_dir={final_dir} (st_dev={final_dev}). "
            f"DataRefinery requires the cache root and the temp dir to "
            f"share a filesystem; configure --cache-root accordingly."
        )
        raise MaterializeError(message)

    try:
        os.replace(temp_dir, final_dir)
    except OSError as exc:
        detail = f"atomic promote failed for {temp_dir} -> {final_dir}: {exc}"
        raise MaterializeError(detail) from exc
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def mark_failed(temp_dir: Path, exc: BaseException, stage: str) -> None:
|
|
67
|
+
"""Write a `FAILED` JSON marker into `temp_dir` capturing the failure context.
|
|
68
|
+
|
|
69
|
+
No-op when `temp_dir` does not exist (e.g., it was already promoted
|
|
70
|
+
or deleted before the runner caught the failure). The marker is a
|
|
71
|
+
diagnostic artifact; it never blocks failure propagation in the
|
|
72
|
+
runner.
|
|
73
|
+
"""
|
|
74
|
+
if not temp_dir.is_dir():
|
|
75
|
+
return
|
|
76
|
+
|
|
77
|
+
payload = {
|
|
78
|
+
"stage": stage,
|
|
79
|
+
"exc_type": type(exc).__name__,
|
|
80
|
+
"message": str(exc),
|
|
81
|
+
"traceback": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
|
|
82
|
+
"marked_at": (datetime.now(UTC).isoformat().replace("+00:00", "Z")),
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
marker = temp_dir / FAILED_MARKER
|
|
86
|
+
marker.write_text(
|
|
87
|
+
json.dumps(payload, indent=2, sort_keys=True),
|
|
88
|
+
encoding="utf-8",
|
|
89
|
+
)
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# Copyright (c) 2026 Pointmatic
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""FR-21 cache cleaner: library API only.
|
|
4
|
+
|
|
5
|
+
The CLI verb wraps this in Phase D. The library here exposes
|
|
6
|
+
`CleanSelector` plus `clean(cache_root, selector, *, force=False)`. The
|
|
7
|
+
selector is intersection-style across the `by_*` filters; `orphans` adds
|
|
8
|
+
old temp dirs to the target set; `all=True` requires `force=True` and
|
|
9
|
+
clears every direct child of `<cache-root>/instances/`.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import shutil
|
|
15
|
+
import time
|
|
16
|
+
from collections.abc import Iterable
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
from datarefinery.cache.layout import (
|
|
21
|
+
TMP_DIR_NAME,
|
|
22
|
+
instances_root,
|
|
23
|
+
)
|
|
24
|
+
from datarefinery.core.errors import CacheError
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True, slots=True)
|
|
28
|
+
class CleanSelector:
|
|
29
|
+
"""Declarative selector for `clean(...)`."""
|
|
30
|
+
|
|
31
|
+
by_recipe_hash: str | None = None
|
|
32
|
+
by_input_hash: str | None = None
|
|
33
|
+
by_seed: int | None = None
|
|
34
|
+
by_age_days: float | None = None
|
|
35
|
+
orphans: bool = False
|
|
36
|
+
orphan_age_days: float = 1.0
|
|
37
|
+
all: bool = False
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass(frozen=True, slots=True)
|
|
41
|
+
class CleanReport:
|
|
42
|
+
removed: tuple[Path, ...]
|
|
43
|
+
skipped: tuple[tuple[Path, str], ...]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def clean(
    cache_root: Path,
    selector: CleanSelector,
    *,
    force: bool = False,
) -> CleanReport:
    """Delete the cache entries described by `selector` and report the outcome.

    With `selector.all` set, every direct child of `<cache-root>/instances/`
    is removed (the `.tmp/` dir included); this path insists on `force=True`
    and ignores the other selector fields. Otherwise the active `by_*`
    filters are combined intersection-style over the instance directories,
    and `orphans=True` separately adds temp dirs older than
    `orphan_age_days`. With nothing selected, nothing is removed.

    Raises:
        CacheError: `selector.all` is set without `force=True`.
    """
    if selector.all:
        if force:
            return _clean_everything(cache_root)
        raise CacheError("clean(all=True) requires force=True")

    root = instances_root(cache_root)
    if not root.is_dir():
        return CleanReport(removed=(), skipped=())

    doomed: list[Path] = []
    if selector.orphans:
        doomed.extend(_orphan_temp_dirs(cache_root, selector.orphan_age_days))

    no_instance_filter = (
        selector.by_recipe_hash is None
        and selector.by_input_hash is None
        and selector.by_seed is None
        and selector.by_age_days is None
    )
    if not no_instance_filter:
        # Snapshot the tree first; the filters then run over a stable list.
        seed_dirs = list(_iter_instance_dirs(root))

        recipe_shard = None if selector.by_recipe_hash is None else selector.by_recipe_hash[:16]
        input_shard = None if selector.by_input_hash is None else selector.by_input_hash[:16]
        seed_name = None if selector.by_seed is None else str(selector.by_seed)
        cutoff = None if selector.by_age_days is None else time.time() - selector.by_age_days * 86400

        def _selected(seed_dir: Path) -> bool:
            # Layout is <recipe-shard>/<input-shard>/<seed>/; cheap name checks
            # run before the stat-based age check.
            if recipe_shard is not None and seed_dir.parent.parent.name != recipe_shard:
                return False
            if input_shard is not None and seed_dir.parent.name != input_shard:
                return False
            if seed_name is not None and seed_dir.name != seed_name:
                return False
            return cutoff is None or seed_dir.stat().st_mtime < cutoff

        doomed.extend(seed_dir for seed_dir in seed_dirs if _selected(seed_dir))

    return _remove_paths(doomed)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _iter_instance_dirs(instances: Path) -> Iterable[Path]:
|
|
99
|
+
"""Yield every `<recipe>/<input>/<seed>/` directory, skipping `.tmp/`."""
|
|
100
|
+
for recipe_shard in instances.iterdir():
|
|
101
|
+
if recipe_shard.name.startswith(".") or not recipe_shard.is_dir():
|
|
102
|
+
continue
|
|
103
|
+
for input_shard in recipe_shard.iterdir():
|
|
104
|
+
if not input_shard.is_dir():
|
|
105
|
+
continue
|
|
106
|
+
for seed_dir in input_shard.iterdir():
|
|
107
|
+
if seed_dir.is_dir():
|
|
108
|
+
yield seed_dir
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _orphan_temp_dirs(cache_root: Path, age_days: float) -> Iterable[Path]:
    """Yield directories under the temp root last modified more than `age_days` days ago.

    Yields nothing when the temp root (`TMP_DIR_NAME` under the instances
    root) is not a directory.
    """
    tmp_root = instances_root(cache_root) / TMP_DIR_NAME
    if tmp_root.is_dir():
        oldest_allowed = time.time() - age_days * 86400
        for run_dir in tmp_root.iterdir():
            if not run_dir.is_dir():
                continue
            if run_dir.stat().st_mtime < oldest_allowed:
                yield run_dir
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _clean_everything(cache_root: Path) -> CleanReport:
    """Remove every direct child of the instances root and report the outcome.

    Returns an empty report when the instances root is not a directory.
    """
    root = instances_root(cache_root)
    if root.is_dir():
        # Materialize the listing before anything is deleted.
        children = [*root.iterdir()]
        return _remove_paths(children)
    return CleanReport(removed=(), skipped=())
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _remove_paths(paths: list[Path]) -> CleanReport:
    """Delete each entry of `paths`, collecting successes and failures.

    Real directories are removed recursively. Everything else — regular
    files and symlinks, including symlinks that point at a directory — is
    unlinked, so a symlink is removed without touching its target.
    `Path.is_dir()` follows symlinks while `shutil.rmtree` refuses to operate
    on one, hence the explicit `is_symlink()` check; without it a symlinked
    directory could never be cleaned.

    An `OSError` on one path does not abort the pass: that path is recorded
    in `skipped` together with the error text and processing continues.

    Returns:
        A `CleanReport` whose `removed` lists the deleted paths (input order)
        and whose `skipped` lists `(path, message)` pairs for failures.
    """
    removed: list[Path] = []
    skipped: list[tuple[Path, str]] = []
    for path in paths:
        try:
            if path.is_dir() and not path.is_symlink():
                shutil.rmtree(path)
            else:
                path.unlink()
        except OSError as exc:
            skipped.append((path, str(exc)))
        else:
            removed.append(path)
    return CleanReport(removed=tuple(removed), skipped=tuple(skipped))
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Copyright (c) 2026 Pointmatic
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""FR-4 cache identity: `CacheKey` + `compute_cache_key`.
|
|
4
|
+
|
|
5
|
+
The cache key is the triple (recipe_hash, input_hash, seed). Cache
|
|
6
|
+
directory paths use only the first 16 hex characters of `recipe_hash`
|
|
7
|
+
and `input_hash` (`.short`); the full hash is recorded in
|
|
8
|
+
`manifest.json`. See `project-essentials.md` "Cache identity is the
|
|
9
|
+
reproducibility contract - invalidations are ceremonious."
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import hashlib
|
|
15
|
+
from collections.abc import Mapping
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
|
|
18
|
+
from datarefinery.recipe.canonical import to_canonical_bytes
|
|
19
|
+
from datarefinery.recipe.models import Recipe
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True, slots=True)
|
|
23
|
+
class CacheKey:
|
|
24
|
+
"""Identity tuple for a materialized instance."""
|
|
25
|
+
|
|
26
|
+
recipe_hash: str
|
|
27
|
+
input_hash: str
|
|
28
|
+
seed: int
|
|
29
|
+
|
|
30
|
+
@property
|
|
31
|
+
def short(self) -> str:
|
|
32
|
+
"""First 16 hex characters of `recipe_hash` (cache directory shard)."""
|
|
33
|
+
return self.recipe_hash[:16]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def compute_cache_key(
    recipe: Recipe,
    raw_input_hashes: Mapping[str, str],
    seed: int,
) -> CacheKey:
    """Derive the `CacheKey` for a (recipe, inputs, seed) triple.

    `recipe_hash` is the SHA-256 hex digest of the recipe's canonical bytes.
    `input_hash` is the SHA-256 hex digest of the `name=hash;` entries of
    `raw_input_hashes`, taken in sorted source-name order so the mapping's
    iteration order does not matter. `seed` is passed through unchanged.
    """
    recipe_digest = hashlib.sha256(to_canonical_bytes(recipe))

    # Feeding the entries one by one hashes the same byte stream as hashing
    # their concatenation.
    input_digest = hashlib.sha256()
    for source_name in sorted(raw_input_hashes):
        entry = f"{source_name}={raw_input_hashes[source_name]};"
        input_digest.update(entry.encode("utf-8"))

    return CacheKey(
        recipe_hash=recipe_digest.hexdigest(),
        input_hash=input_digest.hexdigest(),
        seed=seed,
    )
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Copyright (c) 2026 Pointmatic
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""Cache directory layout helpers under `<cache-root>/instances/...`.
|
|
4
|
+
|
|
5
|
+
Layout (per tech-spec):
|
|
6
|
+
|
|
7
|
+
<cache-root>/
|
|
8
|
+
└── instances/
|
|
9
|
+
├── .tmp/<run-id>/ # in-flight runs; promoted via os.replace
|
|
10
|
+
└── <recipe-hash16>/<input-hash16>/<seed>/
|
|
11
|
+
├── manifest.json
|
|
12
|
+
├── dataset/
|
|
13
|
+
├── fitted_statistics/
|
|
14
|
+
└── report/
|
|
15
|
+
├── report.md
|
|
16
|
+
├── drift.json
|
|
17
|
+
└── visualizations/
|
|
18
|
+
|
|
19
|
+
The 16-char shards come from `CacheKey.short` (recipe) and the first 16
|
|
20
|
+
chars of `input_hash`; the full hashes are recorded in `manifest.json`.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import secrets
|
|
26
|
+
from datetime import UTC, datetime
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
from datarefinery.cache.identity import CacheKey
|
|
30
|
+
|
|
31
|
+
INSTANCES_DIR = "instances"
|
|
32
|
+
TMP_DIR_NAME = ".tmp"
|
|
33
|
+
MANIFEST_FILE = "manifest.json"
|
|
34
|
+
RECIPE_FILE = "recipe.json"
|
|
35
|
+
DATASET_SUBDIR = "dataset"
|
|
36
|
+
FITTED_STATS_SUBDIR = "fitted_statistics"
|
|
37
|
+
REPORT_SUBDIR = "report"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def instances_root(cache_root: Path) -> Path:
    """Return the `INSTANCES_DIR` directory under `cache_root` (instances and temp dirs live below it)."""
    return cache_root.joinpath(INSTANCES_DIR)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def instance_dir(cache_root: Path, key: CacheKey) -> Path:
    """Return the final directory of the instance identified by `key`.

    The path is `<instances-root>/<recipe-hash16>/<input-hash16>/<seed>`,
    where each shard is the first 16 characters of the corresponding hash.
    """
    root = instances_root(cache_root)
    return root.joinpath(key.recipe_hash[:16], key.input_hash[:16], str(key.seed))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def tmp_dir(cache_root: Path, run_id: str) -> Path:
    """Return the temp directory of the in-flight run `run_id` (later promoted via os.replace)."""
    root = instances_root(cache_root)
    return root.joinpath(TMP_DIR_NAME, run_id)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def manifest_path(instance: Path) -> Path:
    """Return the path of the manifest file (`MANIFEST_FILE`) inside `instance`."""
    return instance.joinpath(MANIFEST_FILE)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def recipe_path(instance: Path) -> Path:
    """Return the path of the recipe file (`RECIPE_FILE`) inside `instance`."""
    return instance.joinpath(RECIPE_FILE)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def dataset_dir(instance: Path) -> Path:
    """Return the dataset subdirectory (`DATASET_SUBDIR`) of `instance`."""
    return instance.joinpath(DATASET_SUBDIR)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def fitted_stats_dir(instance: Path) -> Path:
    """Return the fitted-statistics subdirectory (`FITTED_STATS_SUBDIR`) of `instance`."""
    return instance.joinpath(FITTED_STATS_SUBDIR)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def report_dir(instance: Path) -> Path:
    """Return the report subdirectory (`REPORT_SUBDIR`) of `instance`."""
    return instance.joinpath(REPORT_SUBDIR)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def make_run_id() -> str:
|
|
76
|
+
"""Return `<utc_iso_compact>-<8hex>`, e.g. `20260507T143022Z-deadbeef`.
|
|
77
|
+
|
|
78
|
+
Lexicographically sortable down to the second; the 8-hex random
|
|
79
|
+
suffix makes outputs unique under burst/concurrent calls within the
|
|
80
|
+
same second.
|
|
81
|
+
"""
|
|
82
|
+
stamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
|
|
83
|
+
return f"{stamp}-{secrets.token_hex(4)}"
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Copyright (c) 2026 Pointmatic
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""Exception-to-exit-code mapping for the DataRefinery CLI.
|
|
4
|
+
|
|
5
|
+
Per tech-spec:
|
|
6
|
+
|
|
7
|
+
| code | meaning |
|
|
8
|
+
|------|---------------------------------------------------------------|
|
|
9
|
+
| 0 | success |
|
|
10
|
+
| 1 | user/recipe error (Recipe, Validation, Contract, Materialize) |
|
|
11
|
+
| 2 | system error (Plugin, Cache, environment, uncaught) |
|
|
12
|
+
| 130 | SIGINT / Ctrl-C |
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from datarefinery.core.errors import (
|
|
18
|
+
CacheError,
|
|
19
|
+
ContractError,
|
|
20
|
+
DataRefineryError,
|
|
21
|
+
MaterializeError,
|
|
22
|
+
PluginError,
|
|
23
|
+
RecipeError,
|
|
24
|
+
ValidationError,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
EXIT_OK = 0
|
|
28
|
+
EXIT_USER = 1
|
|
29
|
+
EXIT_SYSTEM = 2
|
|
30
|
+
EXIT_INTERRUPT = 130
|
|
31
|
+
|
|
32
|
+
_USER_ERROR_TYPES: tuple[type[DataRefineryError], ...] = (
|
|
33
|
+
RecipeError,
|
|
34
|
+
ValidationError,
|
|
35
|
+
ContractError,
|
|
36
|
+
MaterializeError,
|
|
37
|
+
)
|
|
38
|
+
_SYSTEM_ERROR_TYPES: tuple[type[DataRefineryError], ...] = (
|
|
39
|
+
PluginError,
|
|
40
|
+
CacheError,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def exit_code_for(exc: BaseException) -> int:
    """Map `exc` to the CLI exit code documented for it.

    Rules are tried in order and the first match wins: `KeyboardInterrupt`
    gives `EXIT_INTERRUPT`, the user-error types give `EXIT_USER`, the
    system-error types give `EXIT_SYSTEM`, any other `DataRefineryError`
    gives `EXIT_USER`, and everything else gives `EXIT_SYSTEM`.
    """
    ordered_rules = (
        (KeyboardInterrupt, EXIT_INTERRUPT),
        (_USER_ERROR_TYPES, EXIT_USER),
        (_SYSTEM_ERROR_TYPES, EXIT_SYSTEM),
        # A DataRefineryError subclass listed in neither tuple is treated as
        # user-facing by default; widen the tuples when adding new types.
        (DataRefineryError, EXIT_USER),
    )
    for matched_types, code in ordered_rules:
        if isinstance(exc, matched_types):
            return code
    return EXIT_SYSTEM
|
datarefinery/cli/app.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# Copyright (c) 2026 Pointmatic
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""Typer CLI entry point for DataRefinery."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Annotated
|
|
10
|
+
|
|
11
|
+
import click
|
|
12
|
+
import typer
|
|
13
|
+
from rich.console import Console
|
|
14
|
+
from rich.panel import Panel
|
|
15
|
+
|
|
16
|
+
from datarefinery import __version__
|
|
17
|
+
from datarefinery.cli._exit_codes import (
|
|
18
|
+
EXIT_INTERRUPT,
|
|
19
|
+
EXIT_OK,
|
|
20
|
+
EXIT_SYSTEM,
|
|
21
|
+
exit_code_for,
|
|
22
|
+
)
|
|
23
|
+
from datarefinery.cli.commands.check_cmd import check as check_cmd
|
|
24
|
+
from datarefinery.cli.commands.clean_cmd import clean_command as clean_cmd
|
|
25
|
+
from datarefinery.cli.commands.init_cmd import init as init_cmd
|
|
26
|
+
from datarefinery.cli.commands.inspect_cmd import inspect as inspect_cmd
|
|
27
|
+
from datarefinery.cli.commands.materialize_cmd import materialize as materialize_cmd
|
|
28
|
+
from datarefinery.cli.commands.report_cmd import report as report_cmd
|
|
29
|
+
from datarefinery.cli.commands.status_cmd import status as status_cmd
|
|
30
|
+
from datarefinery.cli.commands.validate_cmd import validate as validate_cmd
|
|
31
|
+
from datarefinery.core.config import RuntimeConfig
|
|
32
|
+
from datarefinery.core.errors import DataRefineryError
|
|
33
|
+
from datarefinery.logging import get_logger
|
|
34
|
+
|
|
35
|
+
app = typer.Typer(
|
|
36
|
+
name="datarefinery",
|
|
37
|
+
help="DataRefinery — recipe-driven data preparation and caching for ML.",
|
|
38
|
+
no_args_is_help=False,
|
|
39
|
+
add_completion=False,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _version_callback(value: bool) -> None:
    """Option callback for `--version`: echo the package version and exit when the flag is set."""
    if not value:
        return
    typer.echo(__version__)
    raise typer.Exit()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@app.callback(invoke_without_command=True)
def main(
    ctx: typer.Context,
    version: Annotated[
        bool,
        typer.Option(
            "--version",
            callback=_version_callback,
            is_eager=True,
            help="Show the package version and exit.",
        ),
    ] = False,
    cache_root: Annotated[
        Path | None,
        typer.Option(
            "--cache-root",
            help="Root directory for the cache (env: DATAREFINERY_CACHE_ROOT).",
        ),
    ] = None,
    log_level: Annotated[
        str | None,
        typer.Option(
            "--log-level",
            help="Log level (env: DATAREFINERY_LOG_LEVEL).",
        ),
    ] = None,
    log_target: Annotated[
        str | None,
        typer.Option(
            "--log-target",
            help="Log routing target; reserved no-op stub (env: DATAREFINERY_LOG_TARGET).",
        ),
    ] = None,
    plugin_path: Annotated[
        list[Path] | None,
        typer.Option(
            "--plugin-path",
            help="Extra plugin discovery path; repeatable "
            "(env: DATAREFINERY_PLUGIN_PATH, PATH-style).",
        ),
    ] = None,
    workers: Annotated[
        int | None,
        typer.Option(
            "--workers",
            help="Process pool worker count (env: DATAREFINERY_WORKERS).",
        ),
    ] = None,
    seed: Annotated[
        int | None,
        typer.Option(
            "--seed",
            help="Override the recipe-declared seed (changes cache identity).",
        ),
    ] = None,
    variant: Annotated[
        str | None,
        typer.Option("--variant", help="Recipe variant to apply before canonicalization."),
    ] = None,
    no_color: Annotated[
        bool,
        typer.Option("--no-color", help="Disable colored output."),
    ] = False,
    quiet: Annotated[
        bool,
        typer.Option("--quiet", "-q", help="Suppress non-essential output."),
    ] = False,
    verbose: Annotated[
        bool,
        typer.Option("--verbose", "-v", help="Verbose output."),
    ] = False,
) -> None:
    """DataRefinery — recipe-driven data preparation and caching for ML."""
    # `version` is not read here; the eager option callback handles it.
    resolved_config = RuntimeConfig.resolve(
        cache_root=cache_root,
        log_level=log_level,
        log_target=log_target,
        plugin_path=plugin_path,
        workers=workers,
    )

    # Stash the global options on the context object for the subcommands.
    shared = ctx.ensure_object(dict)
    shared.update(
        config=resolved_config,
        seed=seed,
        variant=variant,
        no_color=no_color,
        quiet=quiet,
        verbose=verbose,
    )

    get_logger("cli")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
app.command("check", help="Report environment soundness (FR-18).")(check_cmd)
|
|
142
|
+
app.command("validate", help="Validate a recipe (FR-2).")(validate_cmd)
|
|
143
|
+
app.command("init", help="Scaffold a starter recipe from raw inputs (FR-17).")(init_cmd)
|
|
144
|
+
app.command(
|
|
145
|
+
"materialize",
|
|
146
|
+
help="Run the pipeline end-to-end against the recipe's inputs (FR-3).",
|
|
147
|
+
)(materialize_cmd)
|
|
148
|
+
app.command(
|
|
149
|
+
"status",
|
|
150
|
+
help="Summarize a materialized instance or resolve a recipe to one (FR-19).",
|
|
151
|
+
)(status_cmd)
|
|
152
|
+
app.command(
|
|
153
|
+
"report",
|
|
154
|
+
help="Re-render report.md, drift.json, and reporting visualizations (FR-15).",
|
|
155
|
+
)(report_cmd)
|
|
156
|
+
app.command(
|
|
157
|
+
"inspect",
|
|
158
|
+
help="Read-only views of a materialized instance (FR-20).",
|
|
159
|
+
)(inspect_cmd)
|
|
160
|
+
app.command(
|
|
161
|
+
"clean",
|
|
162
|
+
help="Remove cached instances and orphan temp directories (FR-21).",
|
|
163
|
+
)(clean_cmd)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _render_error(message: str, *, title: str) -> None:
    """Print `message` on stderr inside a red-bordered, non-expanding panel titled `title`."""
    stderr_console = Console(stderr=True)
    panel = Panel(message, title=title, border_style="red", expand=False)
    stderr_console.print(panel)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def main_entry() -> None:
    """Console-script entry point with DataRefinery's exit-code mapping.

    Runs the Typer app with `standalone_mode=False` so exceptions reach this
    function, then always terminates the process through `sys.exit`:

    * an exit requested by a command or option callback -> that exit code;
    * Click usage errors -> shown, then `EXIT_SYSTEM`;
    * other Click exceptions -> shown, then the exception's own `exit_code`;
    * abort / Ctrl-C -> `EXIT_INTERRUPT`;
    * `DataRefineryError` -> `exit_code_for(exc)`;
    * any other exception -> `EXIT_SYSTEM`;
    * otherwise -> `EXIT_OK`.
    """
    try:
        # In non-standalone mode Click does not re-raise an `Exit` raised
        # during invocation; it returns the exit code instead. Keep the
        # result, otherwise a command's non-zero exit would be reported as 0.
        result = app(standalone_mode=False)
    except click.exceptions.Exit as exc:
        sys.exit(exc.exit_code)
    except click.exceptions.UsageError as exc:
        exc.show()
        sys.exit(EXIT_SYSTEM)
    except click.exceptions.ClickException as exc:
        exc.show()
        sys.exit(exc.exit_code)
    except (click.exceptions.Abort, KeyboardInterrupt):
        _render_error("Interrupted.", title="Aborted")
        sys.exit(EXIT_INTERRUPT)
    except DataRefineryError as exc:
        _render_error(str(exc) or type(exc).__name__, title=type(exc).__name__)
        sys.exit(exit_code_for(exc))
    except Exception as exc:
        _render_error(f"{type(exc).__name__}: {exc}", title="Internal Error")
        sys.exit(EXIT_SYSTEM)

    # Only a genuine integer is treated as an exit code; `bool` is excluded so
    # a command that happens to return True/False is not misread as 1/0.
    if isinstance(result, int) and not isinstance(result, bool):
        sys.exit(result)
    sys.exit(EXIT_OK)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# Copyright (c) 2026 Pointmatic
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""Subcommand modules for the ``datarefinery`` CLI.
|
|
4
|
+
|
|
5
|
+
Each verb lives in its own ``<verb>_cmd.py`` so the verb name does not
|
|
6
|
+
collide with Python keyword-adjacent identifiers and stays readable in
|
|
7
|
+
import paths (per tech-spec).
|
|
8
|
+
"""
|