jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +1 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +3 -3
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +74 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.0.dist-info/METADATA +825 -0
- jerry_thomas-1.0.0.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/cli/commands/build.py
@@ -1,39 +1,263 @@
+import json
+import logging
 from pathlib import Path
+from typing import Callable, Optional
 
 from datapipeline.build.state import BuildState, load_build_state, save_build_state
-from datapipeline.build.tasks import
-
+from datapipeline.build.tasks import (
+    compute_config_hash,
+    materialize_scaler_statistics,
+    materialize_vector_schema,
+    materialize_metadata,
+)
+from datapipeline.cli.visuals import get_visuals_backend
+from datapipeline.cli.visuals.runner import run_job
+from datapipeline.cli.visuals.sections import sections_from_path
+from datapipeline.config.tasks import ArtifactTask, MetadataTask, ScalerTask, SchemaTask, artifact_tasks
+from datapipeline.config.context import resolve_build_settings
 from datapipeline.services.bootstrap import artifacts_root, bootstrap
-from datapipeline.services.
+from datapipeline.services.constants import (
+    SCALER_STATISTICS,
+    VECTOR_SCHEMA,
+    VECTOR_SCHEMA_METADATA,
+)
+from datapipeline.services.project_paths import tasks_dir
 
 
-
-
+logger = logging.getLogger(__name__)
+
+
+def _log_build_settings_debug(project_path: Path, settings) -> None:
+    if not logger.isEnabledFor(logging.DEBUG):
+        return
+    payload = {
+        "project": str(project_path),
+        "mode": settings.mode,
+        "force": settings.force,
+        "visuals": settings.visuals,
+        "progress": settings.progress,
+    }
+    logger.debug("Build settings:\n%s", json.dumps(
+        payload, indent=2, default=str))
+
+
+def _log_task_overview(tasks: list[ArtifactTask]) -> None:
+    if not logger.isEnabledFor(logging.DEBUG):
+        return
+    payload = [
+        {
+            "name": task.effective_name(),
+            "kind": task.kind,
+            "enabled": task.enabled,
+            "output": getattr(task, "output", None),
+        }
+        for task in tasks
+    ]
+    logger.debug("Artifact tasks:\n%s", json.dumps(payload, indent=2, default=str))
+
+
+def run_build_if_needed(
+    project: Path | str,
+    *,
+    force: bool = False,
+    cli_visuals: str | None = None,
+    cli_progress: str | None = None,
+    workspace=None,
+    required_artifacts: set[str] | None = None,
+) -> bool:
+    """Execute the build workflow when the cached config hash has changed.
 
+    Returns True when a build was performed, False if skipped.
+    """
     project_path = Path(project).resolve()
-
-
-
+    settings = resolve_build_settings(
+        workspace=workspace,
+        cli_visuals=cli_visuals,
+        cli_progress=cli_progress,
+        force_flag=force,
+    )
+    effective_provider = settings.visuals
+    effective_style = settings.progress
+
+    if settings.mode == "OFF":
+        logger.info("Build skipped (jerry.yaml build.mode=OFF).")
+        return False
+    force = settings.force
+    tasks_root = tasks_dir(project_path)
+    config_hash = compute_config_hash(project_path, tasks_root)
 
     art_root = artifacts_root(project_path)
     state_path = (art_root / "build" / "state.json").resolve()
     state = load_build_state(state_path)
 
-
-
-
+    effective_level = logging.getLogger().getEffectiveLevel()
+    backend = get_visuals_backend(effective_provider)
+    # Present headline before deciding to skip or run
+    try:
+        handled = backend.on_build_start(project_path)
+    except Exception:
+        handled = False
+    if not handled:
+        from os import getcwd as _getcwd
+        try:
+            cwd = Path(_getcwd())
+            rel = project_path.relative_to(cwd)
+            parts = [part for part in rel.as_posix().split("/") if part]
+        except Exception:
+            parts = [part for part in project_path.as_posix().split("/")
+                     if part]
+        if len(parts) > 3:
+            parts = ["..."] + parts[-3:]
+        compact = "/".join(parts) if parts else project_path.name
+        logger.info("project: %s", compact)
+
+    _log_build_settings_debug(project_path, settings)
+
+    missing_required = set(required_artifacts or [])
+    if missing_required:
+        existing = state.artifacts.keys() if state else set()
+        missing_required = {art for art in missing_required if art not in existing}
+    if state and (state.config_hash == config_hash) and not force and not missing_required:
+        logger.info(
+            "Build is up-to-date (config hash matches); skipping rebuild.")
+        return False
+    if required_artifacts is not None and not required_artifacts:
+        logger.info("Build skipped (no artifacts required for this run).")
+        return False
 
+    task_configs = artifact_tasks(project_path)
+    _log_task_overview(task_configs)
     runtime = bootstrap(project_path)
-
+
+    tasks_by_kind = {
+        task.kind: task
+        for task in task_configs
+        if task.enabled
+    }
+
+    artifacts = {}
+
+    def _work_scaler(task: ScalerTask):
+        res = materialize_scaler_statistics(runtime, task)
+        if not res:
+            return None
+        rel_path, meta = res
+        full_path = (runtime.artifacts_root / rel_path).resolve()
+        meta_out = {"relative_path": rel_path}
+        meta_out.update(meta)
+        details = ", ".join(f"{k}={v}" for k, v in meta.items())
+        suffix = f" ({details})" if details else ""
+        logger.info(
+            "Materialized %s -> %s%s",
+            SCALER_STATISTICS,
+            full_path,
+            suffix,
+        )
+        return meta_out
+
+    def _work_schema(task: SchemaTask):
+        res = materialize_vector_schema(runtime, task)
+        if not res:
+            return None
+        rel_path, meta = res
+        full_path = (runtime.artifacts_root / rel_path).resolve()
+        meta_out = {"relative_path": rel_path}
+        meta_out.update(meta)
+        details = ", ".join(f"{k}={v}" for k, v in meta.items())
+        suffix = f" ({details})" if details else ""
+        logger.info("Materialized %s -> %s%s", VECTOR_SCHEMA, full_path, suffix)
+        return meta_out
+
+    def _work_metadata(task: MetadataTask):
+        res = materialize_metadata(runtime, task)
+        if not res:
+            return None
+        rel_path, meta = res
+        full_path = (runtime.artifacts_root / rel_path).resolve()
+        meta_out = {"relative_path": rel_path}
+        meta_out.update(meta)
+        details = ", ".join(f"{k}={v}" for k, v in meta.items())
+        suffix = f" ({details})" if details else ""
+        logger.info("Materialized %s -> %s%s", VECTOR_SCHEMA_METADATA, full_path, suffix)
+        return meta_out
+
+    job_specs: list[tuple[str, str, Callable[[], object], Optional[Path]]] = []
+
+    schema_task = tasks_by_kind.get("schema")
+    if schema_task and (required_artifacts is None or VECTOR_SCHEMA in required_artifacts):
+        job_specs.append(
+            (
+                "schema",
+                VECTOR_SCHEMA,
+                lambda task=schema_task: _work_schema(task),
+                schema_task.source_path,
+            )
+        )
+
+    metadata_task = tasks_by_kind.get("metadata")
+    if metadata_task and (required_artifacts is None or VECTOR_SCHEMA_METADATA in required_artifacts):
+        job_specs.append(
+            (
+                "metadata",
+                VECTOR_SCHEMA_METADATA,
+                lambda task=metadata_task: _work_metadata(task),
+                metadata_task.source_path,
+            )
+        )
+
+    scaler_task = tasks_by_kind.get("scaler")
+    if scaler_task and (required_artifacts is None or SCALER_STATISTICS in required_artifacts):
+        job_specs.append(
+            (
+                "scaler",
+                SCALER_STATISTICS,
+                lambda task=scaler_task: _work_scaler(task),
+                scaler_task.source_path,
+            )
+        )
+
+    total_jobs = len(job_specs)
+    for idx, (job_label, artifact_key, job_work, config_path) in enumerate(job_specs, start=1):
+        # Prefix sections with a phase label for visuals; keep path-based detail.
+        path_sections = sections_from_path(tasks_root, config_path or tasks_root)
+        sections = ("Build Tasks",) + tuple(path_sections[1:])
+        result = run_job(
+            sections=sections,
+            label=job_label,
+            visuals=effective_provider,
+            progress_style=effective_style,
+            level=effective_level,
+            runtime=runtime,
+            work=job_work,
+            idx=idx,
+            total=total_jobs,
+        )
+        if result:
+            artifacts[artifact_key] = result
 
     new_state = BuildState(config_hash=config_hash)
     for key, info in artifacts.items():
         relative_path = info["relative_path"]
         meta = {k: v for k, v in info.items() if k != "relative_path"}
         new_state.register(key, relative_path, meta=meta)
-        details = ", ".join(f"{k}={v}" for k, v in meta.items())
-        suffix = f" ({details})" if details else ""
-        print(f"[build] {key} -> {relative_path}{suffix}")
 
     save_build_state(new_state, state_path)
-
+    return True
+
+
+def handle(
+    project: str,
+    *,
+    force: bool = False,
+    cli_visuals: str | None = None,
+    cli_progress: str | None = None,
+    workspace=None,
+) -> None:
+    """Materialize build artifacts for the configured project."""
+    run_build_if_needed(
+        project,
+        force=force,
+        cli_visuals=cli_visuals,
+        cli_progress=cli_progress,
+        workspace=workspace,
+    )
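The rewritten build command wires the new datapipeline.build.tasks package into per-artifact jobs (schema, metadata, scaler) and returns early when the cached config hash still matches. A minimal usage sketch, assuming a project directory shaped like the example/ skeleton shipped with the plugin template; the path and the chosen required_artifacts are illustrative, not part of the diff:

    from pathlib import Path

    from datapipeline.cli.commands.build import handle, run_build_if_needed
    from datapipeline.services.constants import SCALER_STATISTICS, VECTOR_SCHEMA

    # Rebuild only when the config hash changed or a required artifact is missing.
    did_build = run_build_if_needed(
        Path("example"),  # hypothetical project directory
        required_artifacts={VECTOR_SCHEMA, SCALER_STATISTICS},
    )

    # CLI-style entry point: runs the same workflow and ignores the return value.
    handle("example", force=True)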
datapipeline/cli/commands/contract.py
@@ -0,0 +1,367 @@
+import sys
+from pathlib import Path
+
+from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
+from datapipeline.services.entrypoints import read_group_entries, inject_ep
+from datapipeline.services.constants import FILTERS_GROUP, MAPPERS_GROUP
+from datapipeline.services.project_paths import (
+    sources_dir as resolve_sources_dir,
+    streams_dir as resolve_streams_dir,
+    ensure_project_scaffold,
+    resolve_project_yaml_path,
+)
+from datapipeline.services.scaffold.mappers import attach_source_to_domain
+import re
+
+
+def _pick_from_list(prompt: str, options: list[str]) -> str:
+    print(prompt, file=sys.stderr)
+    for i, opt in enumerate(options, 1):
+        print(f" [{i}] {opt}", file=sys.stderr)
+    while True:
+        sel = input("> ").strip()
+        if sel.isdigit():
+            idx = int(sel)
+            if 1 <= idx <= len(options):
+                return options[idx - 1]
+        print("Please enter a number from the list.", file=sys.stderr)
+
+
+def handle(
+    *,
+    plugin_root: Path | None = None,
+    use_identity: bool = False,
+) -> None:
+    root_dir, name, pyproject = pkg_root(plugin_root)
+    # Select contract type: Ingest (source->stream) or Composed (streams->stream)
+    print("Select contract type:", file=sys.stderr)
+    print(" [1] Ingest (source → stream)", file=sys.stderr)
+    print(" [2] Composed (streams → stream)", file=sys.stderr)
+    sel = input("> ").strip()
+    if sel == "2":
+        if use_identity:
+            print("[error] --identity is only supported for ingest contracts.", file=sys.stderr)
+            raise SystemExit(2)
+        # Defer to composed scaffolder (fully interactive)
+        scaffold_conflux(
+            stream_id=None,
+            inputs=None,
+            mapper_path=None,
+            with_mapper_stub=True,
+            plugin_root=plugin_root,
+        )
+        return
+
+    # Discover sources by scanning sources_dir YAMLs
+    # Default to dataset-scoped project config
+    proj_path = resolve_project_yaml_path(root_dir)
+    # Ensure a minimal project scaffold so we can resolve dirs interactively
+    ensure_project_scaffold(proj_path)
+    sources_dir = resolve_sources_dir(proj_path)
+    source_options: list[str] = []
+    if sources_dir.exists():
+        # Recursively scan YAMLs and read declared source id (alias)
+        from datapipeline.utils.load import load_yaml
+        from datapipeline.services.constants import PARSER_KEY, LOADER_KEY, SOURCE_ID_KEY
+        for p in sorted(sources_dir.rglob("*.y*ml")):
+            try:
+                data = load_yaml(p)
+            except Exception:
+                continue
+            if isinstance(data, dict) and isinstance(data.get(PARSER_KEY), dict) and isinstance(data.get(LOADER_KEY), dict):
+                alias = data.get(SOURCE_ID_KEY)
+                if isinstance(alias, str):
+                    source_options.append(alias)
+    source_options = sorted(set(source_options))
+    if not source_options:
+        print("[error] No sources found. Create one first (jerry source add ...)")
+        raise SystemExit(2)
+
+    src_key = _pick_from_list(
+        "Select a source for the contract:", source_options)
+    # Expect aliases as 'provider.dataset' (from source file's id)
+    parts = src_key.split(".", 1)
+    if len(parts) != 2:
+        print("[error] Source alias must be 'provider.dataset' (from source file's id)", file=sys.stderr)
+        raise SystemExit(2)
+    provider, dataset = parts[0], parts[1]
+
+    # Discover domains by scanning the package, fallback to EPs if needed
+    base = resolve_base_pkg_dir(root_dir, name)
+    domain_options = []
+    for dirname in ("domains",):
+        dom_dir = base / dirname
+        if dom_dir.exists():
+            domain_options.extend(
+                [p.name for p in dom_dir.iterdir() if p.is_dir()
+                 and (p / "model.py").exists()]
+            )
+    domain_options = sorted(set(domain_options))
+    if not domain_options:
+        domain_options = sorted(
+            read_group_entries(pyproject, FILTERS_GROUP).keys())
+    if not domain_options:
+        print("[error] No domains found. Create one first (jerry domain add ...)")
+        raise SystemExit(2)
+
+    dom_name = _pick_from_list(
+        "Select a domain to contract with:", domain_options)
+
+    def _slug(s: str) -> str:
+        s = s.strip().lower()
+        s = re.sub(r"[^a-z0-9]+", "_", s)
+        return s.strip("_")
+
+    if use_identity:
+        mapper_ep = "identity"
+        print("[ok] Using built-in mapper entry point 'identity'.")
+    else:
+        # create mapper + EP (domain.origin)
+        attach_source_to_domain(
+            domain=dom_name,
+            provider=provider,
+            dataset=dataset,
+            root=plugin_root,
+        )
+        ep_key = f"{_slug(dom_name)}.{_slug(dataset)}"
+        print(f"[ok] Registered mapper entry point as '{ep_key}'.")
+        mapper_ep = ep_key
+
+    # Derive canonical stream id as domain.dataset[.variant]
+    print("Optional variant suffix (press Enter to skip):", file=sys.stderr)
+    variant = input("> ").strip()
+    if variant:
+        canonical_alias = f"{_slug(dom_name)}.{_slug(dataset)}.{_slug(variant)}"
+    else:
+        canonical_alias = f"{_slug(dom_name)}.{_slug(dataset)}"
+
+    # Inject per-file canonical stream into streams directory
+    streams_path = resolve_streams_dir(proj_path)
+
+    # canonical_alias and mapper_ep defined above
+    # Write a single-file canonical spec into streams directory, matching
+    # ContractConfig schema with helpful commented placeholders per stage.
+    try:
+        # Ensure streams_path is a directory path
+        streams_dir = streams_path if streams_path.is_dir() else streams_path.parent
+        streams_dir.mkdir(parents=True, exist_ok=True)
+        cfile = streams_dir / f"{canonical_alias}.yaml"
+        # Build a richer scaffold as YAML text to preserve comments
+        scaffold = f"""
+kind: ingest
+source: {src_key}
+id: {canonical_alias} # format: domain.dataset.(variant)
+
+mapper:
+  entrypoint: {mapper_ep}
+  args: {{}}
+
+# partition_by: <field or [fields]>
+# sort_batch_size: 100000 # in-memory sort chunk size
+
+record: # record-level transforms
+  - filter: {{ operator: ge, field: time, comparand: "${{start_time}}" }}
+  - filter: {{ operator: le, field: time, comparand: "${{end_time}}" }}
+  # - floor_time: {{ resolution: 10m }}
+  # - lag: {{ lag: 10m }}
+
+# stream: # per-feature transforms (input sorted by id,time)
+#   - ensure_ticks: {{ tick: 10m }}
+#   - granularity: {{ mode: first }}
+#   - fill: {{ statistic: median, window: 6, min_samples: 1 }}
+
+# debug: # optional validation-only checks
+#   - lint: {{ mode: warn, tick: 10m }}
+"""
+        with cfile.open("w", encoding="utf-8") as f:
+            f.write(scaffold)
+        print(f"[new] canonical spec: {cfile}")
+    except Exception as e:
+        print(f"[error] Failed to write canonical spec: {e}", file=sys.stderr)
+
+
+def scaffold_conflux(
+    *,
+    stream_id: str | None,
+    inputs: str | None,
+    mapper_path: str | None,
+    with_mapper_stub: bool,
+    plugin_root: Path | None,
+) -> None:
+    """Scaffold a composed (multi-input) contract and optional mapper stub.
+
+    inputs: comma-separated list of "[alias=]ref[@stage]" strings.
+    mapper_path default: <pkg>.domains.<domain>:mapper where domain = stream_id.split('.')[0]
+    """
+    root_dir, name, _ = pkg_root(plugin_root)
+    # Resolve default project path early for interactive selections
+    proj_path = resolve_project_yaml_path(root_dir)
+    ensure_project_scaffold(proj_path)
+    # Defer target domain selection until after choosing inputs
+
+    # We will write the contract after selecting inputs and target domain
+    # Build inputs string first: interactive select, then target domain
+    if not inputs:
+        # Interactive selection of canonical streams (scan recursively, read ids)
+        streams: list[str] = []
+        sdir = resolve_streams_dir(proj_path)
+        if sdir.exists():
+            from datapipeline.utils.load import load_yaml
+            from datapipeline.services.constants import STREAM_ID_KEY
+            for p in sorted(sdir.rglob("*.y*ml")):
+                try:
+                    data = load_yaml(p)
+                except Exception:
+                    continue
+                if isinstance(data, dict) and data.get("kind") in {"ingest", "composed"}:
+                    sid = data.get(STREAM_ID_KEY)
+                    if isinstance(sid, str) and sid:
+                        streams.append(sid)
+        streams = sorted(set(streams))
+        if not streams:
+            print(
+                "[error] No canonical streams found. Create them first via 'jerry contract' (ingest).", file=sys.stderr)
+            raise SystemExit(2)
+        print(
+            "Select one or more input streams (comma-separated numbers):", file=sys.stderr)
+        for i, sid in enumerate(streams, 1):
+            print(f" [{i}] {sid}", file=sys.stderr)
+        sel = input("> ").strip()
+        try:
+            idxs = [int(x) for x in sel.split(',') if x.strip()]
+        except ValueError:
+            print("[error] Invalid selection.", file=sys.stderr)
+            raise SystemExit(2)
+        picked = []
+        for i in idxs:
+            if 1 <= i <= len(streams):
+                picked.append(streams[i-1])
+        if not picked:
+            print("[error] No inputs selected.", file=sys.stderr)
+            raise SystemExit(2)
+        # Build default aliases using domain+variant to avoid collisions.
+        # Stream id format: domain.dataset.variant (variant optional)
+        built = []
+        for ref in picked:
+            parts = ref.split(".")
+            if len(parts) >= 3:
+                domain, variant = parts[0], parts[-1]
+                alias = f"{domain}_{variant}"
+            elif len(parts) == 2:
+                # No explicit variant -> use domain as alias
+                alias = parts[0]
+            else:
+                # Fallback to full ref if unexpected
+                alias = ref
+            built.append(f"{alias}={ref}")
+        inputs = ",".join(built)
+
+    # YAML list items do not need commas; avoid embedding commas in item text
+    inputs_list = "\n - ".join(
+        s.strip() for s in inputs.split(",") if s.strip()
+    )
+
+    # If no stream_id, select target domain now and derive stream id (mirror ingest flow)
+    if not stream_id:
+        base = resolve_base_pkg_dir(root_dir, name)
+        domain_options: list[str] = []
+        dom_dir = base / "domains"
+        if dom_dir.exists():
+            domain_options.extend(
+                [p.name for p in dom_dir.iterdir() if p.is_dir()
+                 and (p / "model.py").exists()]
+            )
+        domain_options = sorted(set(domain_options))
+        if not domain_options:
+            print("[error] No domains found. Create one first (jerry domain add ...)")
+            raise SystemExit(2)
+        print("Select a target domain for the composed stream:", file=sys.stderr)
+        for i, opt in enumerate(domain_options, 1):
+            print(f" [{i}] {opt}", file=sys.stderr)
+        sel = input("> ").strip()
+        try:
+            idx = int(sel)
+            if idx < 1 or idx > len(domain_options):
+                raise ValueError
+        except Exception:
+            print("[error] Invalid selection.", file=sys.stderr)
+            raise SystemExit(2)
+        domain = domain_options[idx - 1]
+        stream_id = f"{domain}.processed"
+        # Default mapper path uses import-safe package dir, not project name
+        pkg_base = resolve_base_pkg_dir(root_dir, name).name
+        mapper_path = mapper_path or f"{pkg_base}.mappers.{domain}:mapper"
+    else:
+        domain = stream_id.split('.')[0]
+        pkg_base = resolve_base_pkg_dir(root_dir, name).name
+        mapper_path = mapper_path or f"{pkg_base}.mappers.{domain}:mapper"
+
+    # Optional mapper stub under mappers/
+    if with_mapper_stub:
+        base = resolve_base_pkg_dir(root_dir, name)
+        map_pkg_dir = base / "mappers"
+        map_pkg_dir.mkdir(parents=True, exist_ok=True)
+        (map_pkg_dir / "__init__.py").touch(exist_ok=True)
+        mapper_file = map_pkg_dir / f"{domain}.py"
+        if not mapper_file.exists():
+            mapper_file.write_text(
+                """
+from typing import Iterator, Mapping
+from datapipeline.domain.record import TemporalRecord
+
+
+def mapper(
+    inputs: Mapping[str, Iterator[TemporalRecord]],
+    *, driver: str | None = None, aux: Mapping[str, Iterator[TemporalRecord]] | None = None, context=None, **params
+) -> Iterator[TemporalRecord]:
+    # TODO: implement domain math; inputs are ordered/regularized; aux is raw
+    key = driver or next(iter(inputs.keys()))
+    for rec in inputs[key]:
+        yield rec # replace with your dataclass and computation
+""".lstrip()
+            )
+            print(f"[new] {mapper_file}")
+        # Register mapper entry point under datapipeline.mappers
+        # Choose EP name equal to stream_id for clarity/reuse
+        ep_key = stream_id
+        # If mapper_path looks like a dotted target (module:attr), use it; otherwise build default target
+        package_name = base.name # filesystem package dir is import-safe (underscored)
+        default_target = f"{package_name}.mappers.{domain}:mapper"
+        ep_target = mapper_path if (
+            mapper_path and ":" in mapper_path) else default_target
+        pyproj_path = root_dir / "pyproject.toml"
+        try:
+            toml_text = pyproj_path.read_text()
+            updated = inject_ep(toml_text, MAPPERS_GROUP, ep_key, ep_target)
+            if updated != toml_text:
+                pyproj_path.write_text(updated)
+                print(
+                    f"[ok] Registered mapper entry point '{ep_key}' -> {ep_target}")
+        except FileNotFoundError:
+            print(
+                "[info] pyproject.toml not found; skipping entry point registration", file=sys.stderr)
+        # From here on, reference the EP name in the YAML
+        mapper_path = ep_key
+    # Contract file path (now that stream_id is known)
+    ensure_project_scaffold(proj_path)
+    streams_path = resolve_streams_dir(proj_path)
+    streams_dir = streams_path if streams_path.is_dir() else streams_path.parent
+    streams_dir.mkdir(parents=True, exist_ok=True)
+    cfile = streams_dir / f"{stream_id}.yaml"
+    if cfile.exists():
+        print(f"[info] Contract already exists, skipping: {cfile}")
+        return
+
+    yaml_text = f"""
+kind: composed
+id: {stream_id} # format: domain.dataset.(variant)
+# partition_by: <field or [fields]>
+inputs:
+ - {inputs_list}
+
+mapper:
+  entrypoint: {mapper_path}
+  args: {{ driver: {(inputs.split(',')[0].split('=')[0].strip() if '=' in inputs.split(',')[0] else inputs.split(',')[0].strip())} }}
+"""
+    cfile.write_text(yaml_text.strip() + "\n", encoding="utf-8")
+    print(f"[new] composed contract: {cfile}")
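contract.py is new in 1.0.0 and backs the interactive 'jerry contract' flow; scaffold_conflux skips its prompts when stream_id and inputs are supplied, so it can also be driven programmatically. A sketch under assumed values: the stream ids echo the time.ticks.* contracts in the plugin skeleton, while the plugin path and aliases are made up:

    from pathlib import Path

    from datapipeline.cli.commands.contract import scaffold_conflux

    # Compose two existing canonical streams into a new "time.processed" stream.
    scaffold_conflux(
        stream_id="time.processed",
        inputs="linear=time.ticks.linear,hour_sin=time.ticks.hour_sin",
        mapper_path=None,        # defaults to <pkg>.mappers.time:mapper
        with_mapper_stub=True,   # also writes mappers/time.py and registers the entry point
        plugin_root=Path("."),   # hypothetical plugin checkout
    )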
datapipeline/cli/commands/domain.py
@@ -1,9 +1,14 @@
+from pathlib import Path
+
 from datapipeline.services.scaffold.domain import create_domain
 
 
-def handle(subcmd: str, domain: str | None) -> None:
+def handle(subcmd: str, domain: str | None, *, plugin_root: Path | None = None) -> None:
     if subcmd in {"create", "add"}:
         if not domain:
-            print(
+            print(
+                "[error] Domain name is required. Use 'jerry domain add <name>' "
+                "or pass -n/--name."
+            )
             raise SystemExit(2)
-        create_domain(domain=domain, root=
+        create_domain(domain=domain, root=plugin_root)