jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +5 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +54 -10
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +76 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.1.dist-info/METADATA +825 -0
- jerry_thomas-1.0.1.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/services/runs.py

```diff
@@ -0,0 +1,208 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, asdict
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Tuple
+
+import json
+import shutil
+
+
+@dataclass(frozen=True)
+class RunPaths:
+    """Resolved filesystem paths for a single run rooted at a serve directory.
+
+    The serve directory is typically the user-configured `directory` for the
+    filesystem transport (e.g. `data/processed/...`).
+
+    Layout:
+
+        serve_root/
+            runs/
+                <run_id>/
+                    dataset/         # main output for this run
+                    run.json         # metadata for this run
+            latest/                  # symlink or copy pointing at the current live run
+            current_run.json         # pointer to the run currently marked as "latest"
+    """
+
+    serve_root: Path
+    runs_root: Path
+    run_id: str
+    run_root: Path
+    dataset_dir: Path
+    metadata_path: Path
+
+
+@dataclass
+class RunMetadata:
+    """Metadata describing a single run."""
+
+    run_id: str
+    started_at: str
+    finished_at: str | None = None
+    status: str | None = None  # e.g. "running", "success", "failed"
+    notes: str | None = None
+    stage: int | None = None
+
+
+def _now_utc_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def make_run_id() -> str:
+    """Create a filesystem-safe, sortable run identifier."""
+    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")
+
+
+def get_serve_root(directory: str | Path) -> Path:
+    """Resolve the user-configured serve directory to an absolute path."""
+    return Path(directory).expanduser().resolve()
+
+
+def get_run_paths(serve_root: Path, run_id: str | None = None) -> RunPaths:
+    """Build RunPaths for a run rooted at the given serve directory."""
+    if run_id is None:
+        run_id = make_run_id()
+
+    runs_root = serve_root / "runs"
+    run_root = runs_root / run_id
+    dataset_dir = run_root / "dataset"
+    metadata_path = run_root / "run.json"
+
+    return RunPaths(
+        serve_root=serve_root,
+        runs_root=runs_root,
+        run_id=run_id,
+        run_root=run_root,
+        dataset_dir=dataset_dir,
+        metadata_path=metadata_path,
+    )
+
+
+def _write_run_metadata(meta: RunMetadata, path: Path) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        json.dump(asdict(meta), f, indent=2, sort_keys=True)
+
+
+def _load_run_metadata(path: Path) -> RunMetadata:
+    with path.open("r", encoding="utf-8") as f:
+        data: dict[str, Any] = json.load(f)
+    return RunMetadata(**data)
+
+
+def start_run_for_directory(
+    directory: str | Path,
+    run_id: str | None = None,
+    *,
+    stage: int | None = None,
+) -> Tuple[RunPaths, RunMetadata]:
+    """Initialise a new run rooted at the given directory.
+
+    This will create the run's dataset directory and an initial metadata file
+    with status set to "running".
+    """
+    serve_root = get_serve_root(directory)
+    paths = get_run_paths(serve_root, run_id)
+
+    # Ensure the run directories exist
+    paths.dataset_dir.mkdir(parents=True, exist_ok=True)
+
+    meta = RunMetadata(
+        run_id=paths.run_id,
+        started_at=_now_utc_iso(),
+        finished_at=None,
+        status="running",
+        notes=None,
+        stage=stage,
+    )
+    _write_run_metadata(meta, paths.metadata_path)
+    return paths, meta
+
+
+def finish_run(paths: RunPaths, status: str, notes: str | None = None) -> RunMetadata:
+    """Mark an existing run as finished with the given status."""
+    if paths.metadata_path.exists():
+        meta = _load_run_metadata(paths.metadata_path)
+    else:
+        # Fallback: create a minimal metadata record if none exists yet
+        meta = RunMetadata(
+            run_id=paths.run_id,
+            started_at=_now_utc_iso(),
+        )
+
+    meta.finished_at = _now_utc_iso()
+    meta.status = status
+    if notes is not None:
+        meta.notes = notes
+
+    _write_run_metadata(meta, paths.metadata_path)
+    return meta
+
+
+def finish_run_success(paths: RunPaths, notes: str | None = None) -> RunMetadata:
+    """Convenience wrapper to mark a run as successful."""
+    return finish_run(paths, status="success", notes=notes)
+
+
+def finish_run_failed(paths: RunPaths, notes: str | None = None) -> RunMetadata:
+    """Convenience wrapper to mark a run as failed."""
+    return finish_run(paths, status="failed", notes=notes)
+
+
+def set_latest_run(paths: RunPaths) -> None:
+    """Mark the given run as the latest/live run for its serve directory.
+
+    This updates two things under the serve root:
+
+    * `latest/` – a symlink (or copied directory as a fallback) pointing to
+      this run's root directory, so consumers can read from
+      `<directory>/latest/dataset`.
+
+    * `current_run.json` – a small pointer file recording which run is
+      currently live and when this pointer was updated.
+    """
+    serve_root = paths.serve_root
+    latest_root = serve_root / "latest"
+
+    # Ensure serve_root exists so that the layout is predictable
+    serve_root.mkdir(parents=True, exist_ok=True)
+
+    # Remove any existing "latest" pointer
+    if latest_root.is_symlink() or latest_root.is_file():
+        latest_root.unlink()
+    elif latest_root.is_dir():
+        shutil.rmtree(latest_root)
+
+    # Prefer a symlink for efficiency; fall back to copying if symlinks fail
+    try:
+        latest_root.symlink_to(paths.run_root, target_is_directory=True)
+    except OSError:
+        shutil.copytree(paths.run_root, latest_root)
+
+    # Write/update current_run.json with a simple pointer
+    current_meta_path = serve_root / "current_run.json"
+    current_data: dict[str, Any] = {
+        "run_id": paths.run_id,
+        "run_root": str(paths.run_root),
+        "dataset_dir": str(paths.dataset_dir),
+        "updated_at": _now_utc_iso(),
+    }
+    with current_meta_path.open("w", encoding="utf-8") as f:
+        json.dump(current_data, f, indent=2, sort_keys=True)
+
+
+__all__ = [
+    "RunPaths",
+    "RunMetadata",
+    "make_run_id",
+    "get_serve_root",
+    "get_run_paths",
+    "start_run_for_directory",
+    "finish_run",
+    "finish_run_success",
+    "finish_run_failed",
+    "set_latest_run",
+]
```
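The new runs module gives each serve directory a simple run lifecycle (start, finish, promote to latest). A minimal usage sketch based only on the functions shown above; the serve directory path is invented for the example:

```python
# Illustrative sketch of the run lifecycle in datapipeline/services/runs.py;
# "data/processed/example" is a made-up serve directory.
from datapipeline.services.runs import (
    start_run_for_directory,
    finish_run_success,
    finish_run_failed,
    set_latest_run,
)

paths, meta = start_run_for_directory("data/processed/example", stage=1)
try:
    # ... write dataset files into paths.dataset_dir ...
    finish_run_success(paths, notes="nightly rebuild")
    set_latest_run(paths)  # refresh latest/ and current_run.json under the serve root
except Exception as exc:
    finish_run_failed(paths, notes=str(exc))
    raise
```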
datapipeline/services/scaffold/domain.py

```diff
@@ -9,6 +9,7 @@ from ..paths import pkg_root, resolve_base_pkg_dir
 def create_domain(*, domain: str, root: Optional[Path]) -> None:
     root_dir, name, _ = pkg_root(root)
     base = resolve_base_pkg_dir(root_dir, name)
+    package_name = base.name
     pkg_dir = base / "domains" / domain
     pkg_dir.mkdir(parents=True, exist_ok=True)
     (pkg_dir / "__init__.py").touch(exist_ok=True)
@@ -16,10 +17,10 @@ def create_domain(*, domain: str, root: Optional[Path]) -> None:
     def write_missing(path: Path, tpl: str, **ctx):
         if not path.exists():
             path.write_text(render(tpl, **ctx))
-            print(f"[new]
+            print(f"[new] {path}")
 
     cls_ = "".join(w.capitalize() for w in domain.split("_"))
     parent = "TemporalRecord"
     write_missing(pkg_dir / "model.py", "record.py.j2",
-                  PACKAGE_NAME=
+                  PACKAGE_NAME=package_name, DOMAIN=domain, CLASS_NAME=f"{cls_}Record",
                   PARENT_CLASS=parent, time_aware=True)
```
datapipeline/services/scaffold/filter.py

```diff
@@ -9,6 +9,7 @@ from ..paths import pkg_root, resolve_base_pkg_dir
 def create_filter(*, name: str, root: Optional[Path]) -> None:
     root_dir, pkg_name, _ = pkg_root(root)
     base = resolve_base_pkg_dir(root_dir, pkg_name)
+    package_name = base.name
     filters_dir = base / FILTERS_GROUP
     filters_dir.mkdir(parents=True, exist_ok=True)
     (filters_dir / "__init__.py").touch(exist_ok=True)
@@ -18,7 +19,7 @@ def create_filter(*, name: str, root: Optional[Path]) -> None:
     path = filters_dir / f"{module_name}.py"
     if not path.exists():
         path.write_text(render("filter.py.j2", FUNCTION_NAME=name))
-        print(f"[new]
+        print(f"[new] {path}")
 
     # Register entry point under datapipeline.filters
     toml_path = root_dir / "pyproject.toml"
@@ -26,6 +27,6 @@ def create_filter(*, name: str, root: Optional[Path]) -> None:
         toml_path.read_text(),
         FILTERS_GROUP,
         name,
-        f"{
+        f"{package_name}.filters.{module_name}:{name}",
     )
     toml_path.write_text(toml)
```
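For context on what the injected entry point buys you: the target now resolves to `<package>.filters.<module>:<name>` under the `datapipeline.filters` group named in the code comment above. A rough lookup sketch; the filter name `drop_nulls` is invented for the example:

```python
# Hypothetical resolution of a scaffolded filter entry point; "drop_nulls" is an
# example name, and the group string mirrors the comment in create_filter above.
from importlib.metadata import entry_points

filter_eps = entry_points(group="datapipeline.filters")
drop_nulls = filter_eps["drop_nulls"].load()  # loads "<package>.filters.drop_nulls:drop_nulls"
```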
datapipeline/services/scaffold/mappers.py

```diff
@@ -16,8 +16,11 @@ def _slug(s: str) -> str:
 def attach_source_to_domain(*, domain: str, provider: str, dataset: str, root: Optional[Path]) -> None:
     root_dir, name, pyproject = pkg_root(root)
     base = resolve_base_pkg_dir(root_dir, name)
+    package_name = base.name
     mappers_root = base / MAPPERS_GROUP
-
+    _ = _slug(provider)
+    ds = _slug(dataset)
+    dom = _slug(domain)
 
     # Option B layout: mappers/{provider}/{dataset}/to_{domain}.py
     pkg_dir = mappers_root / provider / dataset
@@ -32,7 +35,7 @@ def attach_source_to_domain(*, domain: str, provider: str, dataset: str, root: O
     function_name = "map"
     path.write_text(render(
         "mapper.py.j2",
-        PACKAGE_NAME=
+        PACKAGE_NAME=package_name,
         ORIGIN=provider,
         DATASET=dataset,
         TARGET_DOMAIN=dom,
@@ -42,11 +45,11 @@ def attach_source_to_domain(*, domain: str, provider: str, dataset: str, root: O
         OriginDTO=f"{camel(provider)}{camel(dataset)}DTO",
         time_aware=True,
     ))
-    print(f"[new]
+    print(f"[new] {path}")
 
-    # Register the mapper EP as domain.
-    ep_key = f"{dom}.{
-    ep_target = f"{
+    # Register the mapper EP as domain.dataset
+    ep_key = f"{dom}.{ds}"
+    ep_target = f"{package_name}.mappers.{provider}.{dataset}.{module_name}:map"
     toml = (root_dir / "pyproject.toml").read_text()
     toml = inject_ep(toml, MAPPERS_GROUP, ep_key, ep_target)
     (root_dir / "pyproject.toml").write_text(toml)
```
datapipeline/services/scaffold/plugin.py

```diff
@@ -1,7 +1,15 @@
 from importlib.resources import as_file, files
 from pathlib import Path
+import logging
+import os
 
-
+import yaml
+
+from datapipeline.utils.load import load_yaml
+
+from ..constants import DEFAULT_IO_LOADER_EP
+
+logger = logging.getLogger(__name__)
 
 _RESERVED_PACKAGE_NAMES = {"datapipeline"}
 
@@ -9,15 +17,13 @@ _RESERVED_PACKAGE_NAMES = {"datapipeline"}
 def _normalized_package_name(dist_name: str) -> str:
     package_name = dist_name.replace("-", "_")
     if package_name in _RESERVED_PACKAGE_NAMES:
-
-            "
-            "Choose a different plugin name."
+        logger.error(
+            "`datapipeline` is reserved for the core package. Choose a different plugin name."
         )
         raise SystemExit(1)
     if not package_name.isidentifier():
-
-            "
-            "with underscores."
+        logger.error(
+            "Plugin names must be valid Python identifiers once hyphens are replaced with underscores."
         )
         raise SystemExit(1)
     return package_name
@@ -26,7 +32,7 @@ def _normalized_package_name(dist_name: str) -> str:
 def scaffold_plugin(name: str, outdir: Path) -> None:
     target = (outdir / name).absolute()
     if target.exists():
-
+        logger.error("`%s` already exists", target)
         raise SystemExit(1)
     import shutil
 
@@ -39,11 +45,49 @@ def scaffold_plugin(name: str, outdir: Path) -> None:
     replacements = {
         "{{PACKAGE_NAME}}": package_name,
         "{{DIST_NAME}}": name,
-        "{{
+        "{{DEFAULT_IO_LOADER_EP}}": DEFAULT_IO_LOADER_EP,
     }
     for p in (target / "pyproject.toml", target / "README.md"):
         text = p.read_text()
         for placeholder, value in replacements.items():
             text = text.replace(placeholder, value)
         p.write_text(text)
-
+
+    # Move jerry.yaml up to the workspace root (current working directory) so
+    # users can run the CLI from the workspace without cd'ing into the plugin.
+    # We adjust plugin_root and dataset paths to point at the plugin directory
+    # relative to the workspace. Do not overwrite an existing workspace
+    # jerry.yaml.
+    plugin_jerry = target / "jerry.yaml"
+    workspace_root = Path.cwd().resolve()
+    workspace_jerry = workspace_root / "jerry.yaml"
+    if plugin_jerry.exists() and not workspace_jerry.exists():
+        try:
+            plugin_root_rel = target.relative_to(workspace_root)
+        except ValueError:
+            # Fall back to a relative path between arbitrary directories; this
+            # may include ".." segments.
+            try:
+                plugin_root_rel = Path(os.path.relpath(target, workspace_root))
+            except Exception:
+                plugin_root_rel = target
+
+        data = load_yaml(plugin_jerry)
+        data["plugin_root"] = plugin_root_rel.as_posix()
+        datasets = data.get("datasets") or {}
+        updated_datasets = {}
+        for alias, path in datasets.items():
+            p = Path(path)
+            if p.is_absolute():
+                updated_datasets[alias] = p.as_posix()
+            else:
+                updated_datasets[alias] = (plugin_root_rel / p).as_posix()
+        data["datasets"] = updated_datasets
+
+        workspace_jerry.write_text(
+            yaml.safe_dump(data, sort_keys=False), encoding="utf-8"
+        )
+        plugin_jerry.unlink()
+        logger.info("workspace jerry.yaml created at %s", workspace_jerry)
+
+    logger.info("plugin skeleton created at %s", target)
```
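The net effect of the new block in `scaffold_plugin` is that the scaffolded `jerry.yaml` is rewritten at the workspace root with its paths re-based onto the plugin directory. A rough before/after sketch of the dict that gets dumped, assuming a plugin scaffolded into `./my-plugin` with a single dataset alias; only the `plugin_root` and `datasets` keys come from the code above, the alias and path values are illustrative:

```python
# Hypothetical shape of the jerry.yaml data handled in scaffold_plugin.
data_in_plugin = {
    "plugin_root": ".",                      # illustrative value inside the plugin skeleton
    "datasets": {"example": "example"},      # relative to the plugin directory
}

data_at_workspace = {
    "plugin_root": "my-plugin",                     # plugin_root_rel.as_posix()
    "datasets": {"example": "my-plugin/example"},   # relative paths re-based onto the plugin dir
}
```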
datapipeline/services/scaffold/source.py

```diff
@@ -4,12 +4,13 @@ from typing import Optional
 
 from datapipeline.services.scaffold.templates import camel, render
 
-from ..constants import
+from ..constants import DEFAULT_IO_LOADER_EP
 from ..entrypoints import inject_ep
 from ..paths import pkg_root, resolve_base_pkg_dir
 from datapipeline.services.project_paths import (
     sources_dir as resolve_sources_dir,
     ensure_project_scaffold,
+    resolve_project_yaml_path,
 )
 
 
@@ -19,14 +20,14 @@ def _class_prefix(provider: str, dataset: str) -> str:
 
 
 def _source_alias(provider: str, dataset: str) -> str:
-    return f"{provider}
+    return f"{provider}.{dataset}"
 
 
 def _write_if_missing(path: Path, text: str) -> None:
     """Write file only if it does not exist; echo a friendly message."""
     if not path.exists():
         path.write_text(text)
-        print(f"[new]
+        print(f"[new] {path}")
 
 
 def _render_loader_stub(transport: str, loader_class: str,
@@ -55,86 +56,120 @@ def _update_ep(toml_text: str, provider: str, dataset: str, pkg_name: str,
     return toml_text, ep_key
 
 
-def _loader_ep_and_args(transport: str, fmt: Optional[str], ep_key: str) -> tuple[str, dict]:
+def _loader_ep_and_args(transport: str, fmt: Optional[str], ep_key: Optional[str]) -> tuple[str, dict]:
     """Return (loader EP name, default args) for the YAML snippet."""
     if transport == "fs":
         args = {
             "transport": "fs",
-            "format": fmt or "<FORMAT (csv|json|json-lines)>",
+            "format": fmt or "<FORMAT (csv|json|json-lines|pickle)>",
             "path": "<PATH OR GLOB>",
             "glob": False,
             "encoding": "utf-8",
         }
         if fmt == "csv":
             args["delimiter"] = ","
-        return
+        return DEFAULT_IO_LOADER_EP, args
     if transport == "synthetic":
+        if ep_key is None:
+            raise ValueError("synthetic transport requires scaffolding a loader entrypoint")
         return ep_key, {"start": "<ISO8601>", "end": "<ISO8601>", "frequency": "1h"}
-    if transport == "
+    if transport == "http":
         args = {
-            "transport": "
+            "transport": "http",
             "format": fmt or "<FORMAT (json|json-lines|csv)>",
             "url": "<https://api.example.com/data.json>",
             "headers": {},
+            "params": {},
             "encoding": "utf-8",
         }
         if fmt == "csv":
             args["delimiter"] = ","
-        return
+        return DEFAULT_IO_LOADER_EP, args
+    if ep_key is None:
+        raise ValueError(f"unsupported transport '{transport}' for identity scaffold")
     return ep_key, {}
 
 
-def create_source(
-
+def create_source(
+    *,
+    provider: str,
+    dataset: str,
+    transport: str,
+    format: Optional[str],
+    root: Optional[Path],
+    identity: bool = False,
+) -> None:
     root_dir, name, _ = pkg_root(root)
     base = resolve_base_pkg_dir(root_dir, name)
-
-    src_pkg_dir.mkdir(parents=True, exist_ok=True)
-    (src_pkg_dir / "__init__.py").touch(exist_ok=True)
-
-    class_prefix = _class_prefix(provider, dataset)
-    dto_class = f"{class_prefix}DTO"
-    parser_class = f"{class_prefix}Parser"
-    loader_class = f"{class_prefix}DataLoader"
-
-    # DTO
-    dto_path = src_pkg_dir / "dto.py"
-    _write_if_missing(dto_path, render(
-        "dto.py.j2",
-        PACKAGE_NAME=name, ORIGIN=provider, DOMAIN=dataset,
-        CLASS_NAME=dto_class, time_aware=True
-    ))
-
-    # Parser
-    parser_path = src_pkg_dir / "parser.py"
-    _write_if_missing(parser_path, render(
-        "parser.py.j2",
-        PACKAGE_NAME=name, ORIGIN=provider, DOMAIN=dataset,
-        CLASS_NAME=parser_class, DTO_CLASS=dto_class, time_aware=True
-    ))
-
-    # Optional loader stub: synthetic (url uses composed loader by default)
-    if transport in {"synthetic"}:
-        loader_path = src_pkg_dir / "loader.py"
-        stub = _render_loader_stub(transport, loader_class, fmt=format)
-        if stub is not None:
-            _write_if_missing(loader_path, stub)
-
-    toml_path = root_dir / "pyproject.toml"
-    toml_text, ep_key = _update_ep(
-        toml_path.read_text(),
-        provider, dataset, name,
-        transport, parser_class, loader_class
-    )
-    toml_path.write_text(toml_text)
+    package_name = base.name
 
     alias = _source_alias(provider, dataset)
+    parser_ep: str
+    parser_args: dict
+    ep_key: Optional[str] = None
+
+    if identity:
+        if transport == "synthetic":
+            raise ValueError(
+                "identity parser scaffold is not supported for synthetic sources; "
+                "generate the standard parser instead."
+            )
+        parser_ep = "identity"
+        parser_args = {}
+    else:
+        src_pkg_dir = base / "sources" / provider / dataset
+        src_pkg_dir.mkdir(parents=True, exist_ok=True)
+        (src_pkg_dir / "__init__.py").touch(exist_ok=True)
+
+        class_prefix = _class_prefix(provider, dataset)
+        dto_class = f"{class_prefix}DTO"
+        parser_class = f"{class_prefix}Parser"
+        loader_class = f"{class_prefix}DataLoader"
+
+        # DTO
+        dto_path = src_pkg_dir / "dto.py"
+        _write_if_missing(dto_path, render(
+            "dto.py.j2",
+            PACKAGE_NAME=package_name, ORIGIN=provider, DOMAIN=dataset,
+            CLASS_NAME=dto_class, time_aware=True
+        ))
+
+        # Parser
+        parser_path = src_pkg_dir / "parser.py"
+        _write_if_missing(parser_path, render(
+            "parser.py.j2",
+            PACKAGE_NAME=package_name, ORIGIN=provider, DOMAIN=dataset,
+            CLASS_NAME=parser_class, DTO_CLASS=dto_class, time_aware=True
+        ))
+
+        # Optional loader stub: synthetic (http uses core IO loader by default)
+        if transport in {"synthetic"}:
+            loader_path = src_pkg_dir / "loader.py"
+            stub = _render_loader_stub(transport, loader_class, fmt=format)
+            if stub is not None:
+                _write_if_missing(loader_path, stub)
+
+        toml_path = root_dir / "pyproject.toml"
+        toml_text, ep_key = _update_ep(
+            toml_path.read_text(),
+            provider,
+            dataset,
+            package_name,
+            transport,
+            parser_class,
+            loader_class,
+        )
+        toml_path.write_text(toml_text)
+
+        parser_ep = ep_key
+        parser_args = {}
+
     loader_ep, loader_args = _loader_ep_and_args(transport, format, ep_key)
 
     # Resolve sources directory from a single dataset-scoped project config.
     # If not present or invalid, let the exception bubble up to prompt the user
     # to provide a valid project path.
-    proj_yaml = root_dir
+    proj_yaml = resolve_project_yaml_path(root_dir)
     # Best-effort: create a minimal project scaffold if missing
     ensure_project_scaffold(proj_yaml)
     sources_dir = resolve_sources_dir(proj_yaml).resolve()
@@ -143,11 +178,13 @@ def create_source(*, provider: str, dataset: str, transport: str,
     if not src_cfg_path.exists():
         src_cfg_path.write_text(render(
             "source.yaml.j2",
-
-            parser_ep=
-            parser_args=
+            id=alias,
+            parser_ep=parser_ep,
+            parser_args=parser_args,
             loader_ep=loader_ep,
             loader_args=loader_args,
-
+            default_io_loader_ep=DEFAULT_IO_LOADER_EP,
         ))
-        print(f"[new]
+        print(f"[new] {src_cfg_path.resolve()}")
+    elif identity:
+        print(f"[info] Source YAML already exists; skipped identity scaffold at {src_cfg_path.resolve()}")
```
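With this refactor, `create_source` takes keyword-only arguments and grows an `identity` mode that skips the DTO/parser stubs and points the generated source YAML at the built-in `identity` parser. A minimal call sketch against the new signature; the `acme`/`prices` provider and dataset names and the project root are invented for the example:

```python
# Illustrative calls against the new keyword-only signature of create_source;
# provider/dataset names and the root path are made up.
from pathlib import Path

from datapipeline.services.scaffold.source import create_source

# Standard scaffold: writes DTO/parser stubs and registers the parser entry point.
create_source(
    provider="acme",
    dataset="prices",
    transport="fs",
    format="csv",
    root=Path("."),
)

# Identity scaffold: no stubs are generated; the source YAML uses the "identity"
# parser (rejected for synthetic transports, per the ValueError above).
create_source(
    provider="acme",
    dataset="prices",
    transport="http",
    format="json",
    root=Path("."),
    identity=True,
)
```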
datapipeline/sources/data_loader.py (renamed from datapipeline/sources/composed_loader.py)

```diff
@@ -1,21 +1,21 @@
 from __future__ import annotations
 
 from typing import Iterator, Any, Optional
-from .models.loader import
-from .transports import
+from .models.loader import BaseDataLoader
+from .transports import Transport, HttpTransport
 from .decoders import Decoder
 
 
-class
-    """Compose a
+class DataLoader(BaseDataLoader):
+    """Compose a Transport with a row Decoder."""
 
-    def __init__(self,
-        self.
+    def __init__(self, transport: Transport, decoder: Decoder, *, allow_network_count: bool = False):
+        self.transport = transport
         self.decoder = decoder
         self._allow_net_count = bool(allow_network_count)
 
     def load(self) -> Iterator[Any]:
-        for stream in self.
+        for stream in self.transport.streams():
             for row in self.decoder.decode(stream):
                 yield row
 
@@ -23,11 +23,11 @@ class ComposedRawLoader(RawDataLoader):
         # Delegate counting to the decoder using the transport streams.
         # Avoid counting over network unless explicitly enabled.
         try:
-            if isinstance(self.
+            if isinstance(self.transport, HttpTransport) and not self._allow_net_count:
                 return None
             total = 0
             any_stream = False
-            for stream in self.
+            for stream in self.transport.streams():
                 any_stream = True
                 c = self.decoder.count(stream)
                 if c is None:
```