jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +1 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +3 -3
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +74 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.0.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.0.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/cli/commands/build.py
@@ -1,39 +1,263 @@
+ import json
+ import logging
  from pathlib import Path
+ from typing import Callable, Optional

  from datapipeline.build.state import BuildState, load_build_state, save_build_state
- from datapipeline.build.tasks import compute_config_hash, execute_build
- from datapipeline.config.build import load_build_config
+ from datapipeline.build.tasks import (
+     compute_config_hash,
+     materialize_scaler_statistics,
+     materialize_vector_schema,
+     materialize_metadata,
+ )
+ from datapipeline.cli.visuals import get_visuals_backend
+ from datapipeline.cli.visuals.runner import run_job
+ from datapipeline.cli.visuals.sections import sections_from_path
+ from datapipeline.config.tasks import ArtifactTask, MetadataTask, ScalerTask, SchemaTask, artifact_tasks
+ from datapipeline.config.context import resolve_build_settings
  from datapipeline.services.bootstrap import artifacts_root, bootstrap
- from datapipeline.services.project_paths import build_config_path
+ from datapipeline.services.constants import (
+     SCALER_STATISTICS,
+     VECTOR_SCHEMA,
+     VECTOR_SCHEMA_METADATA,
+ )
+ from datapipeline.services.project_paths import tasks_dir


- def handle(project: str, *, force: bool = False) -> None:
-     """Materialize build artifacts for the configured project."""
+ logger = logging.getLogger(__name__)
+
+
+ def _log_build_settings_debug(project_path: Path, settings) -> None:
+     if not logger.isEnabledFor(logging.DEBUG):
+         return
+     payload = {
+         "project": str(project_path),
+         "mode": settings.mode,
+         "force": settings.force,
+         "visuals": settings.visuals,
+         "progress": settings.progress,
+     }
+     logger.debug("Build settings:\n%s", json.dumps(
+         payload, indent=2, default=str))
+
+
+ def _log_task_overview(tasks: list[ArtifactTask]) -> None:
+     if not logger.isEnabledFor(logging.DEBUG):
+         return
+     payload = [
+         {
+             "name": task.effective_name(),
+             "kind": task.kind,
+             "enabled": task.enabled,
+             "output": getattr(task, "output", None),
+         }
+         for task in tasks
+     ]
+     logger.debug("Artifact tasks:\n%s", json.dumps(payload, indent=2, default=str))
+
+
+ def run_build_if_needed(
+     project: Path | str,
+     *,
+     force: bool = False,
+     cli_visuals: str | None = None,
+     cli_progress: str | None = None,
+     workspace=None,
+     required_artifacts: set[str] | None = None,
+ ) -> bool:
+     """Execute the build workflow when the cached config hash has changed.

+     Returns True when a build was performed, False if skipped.
+     """
      project_path = Path(project).resolve()
-     cfg_path = build_config_path(project_path)
-     build_config = load_build_config(project_path)
-     config_hash = compute_config_hash(project_path, cfg_path)
+     settings = resolve_build_settings(
+         workspace=workspace,
+         cli_visuals=cli_visuals,
+         cli_progress=cli_progress,
+         force_flag=force,
+     )
+     effective_provider = settings.visuals
+     effective_style = settings.progress
+
+     if settings.mode == "OFF":
+         logger.info("Build skipped (jerry.yaml build.mode=OFF).")
+         return False
+     force = settings.force
+     tasks_root = tasks_dir(project_path)
+     config_hash = compute_config_hash(project_path, tasks_root)

      art_root = artifacts_root(project_path)
      state_path = (art_root / "build" / "state.json").resolve()
      state = load_build_state(state_path)

-     if state and (state.config_hash == config_hash) and not force:
-         print("[ok] Build is up-to-date (config hash matches). Use --force to rebuild.")
-         return
+     effective_level = logging.getLogger().getEffectiveLevel()
+     backend = get_visuals_backend(effective_provider)
+     # Present headline before deciding to skip or run
+     try:
+         handled = backend.on_build_start(project_path)
+     except Exception:
+         handled = False
+     if not handled:
+         from os import getcwd as _getcwd
+         try:
+             cwd = Path(_getcwd())
+             rel = project_path.relative_to(cwd)
+             parts = [part for part in rel.as_posix().split("/") if part]
+         except Exception:
+             parts = [part for part in project_path.as_posix().split("/")
+                      if part]
+         if len(parts) > 3:
+             parts = ["..."] + parts[-3:]
+         compact = "/".join(parts) if parts else project_path.name
+         logger.info("project: %s", compact)
+
+     _log_build_settings_debug(project_path, settings)
+
+     missing_required = set(required_artifacts or [])
+     if missing_required:
+         existing = state.artifacts.keys() if state else set()
+         missing_required = {art for art in missing_required if art not in existing}
+     if state and (state.config_hash == config_hash) and not force and not missing_required:
+         logger.info(
+             "Build is up-to-date (config hash matches); skipping rebuild.")
+         return False
+     if required_artifacts is not None and not required_artifacts:
+         logger.info("Build skipped (no artifacts required for this run).")
+         return False

+     task_configs = artifact_tasks(project_path)
+     _log_task_overview(task_configs)
      runtime = bootstrap(project_path)
-     artifacts = execute_build(runtime, build_config)
+
+     tasks_by_kind = {
+         task.kind: task
+         for task in task_configs
+         if task.enabled
+     }
+
+     artifacts = {}
+
+     def _work_scaler(task: ScalerTask):
+         res = materialize_scaler_statistics(runtime, task)
+         if not res:
+             return None
+         rel_path, meta = res
+         full_path = (runtime.artifacts_root / rel_path).resolve()
+         meta_out = {"relative_path": rel_path}
+         meta_out.update(meta)
+         details = ", ".join(f"{k}={v}" for k, v in meta.items())
+         suffix = f" ({details})" if details else ""
+         logger.info(
+             "Materialized %s -> %s%s",
+             SCALER_STATISTICS,
+             full_path,
+             suffix,
+         )
+         return meta_out
+
+     def _work_schema(task: SchemaTask):
+         res = materialize_vector_schema(runtime, task)
+         if not res:
+             return None
+         rel_path, meta = res
+         full_path = (runtime.artifacts_root / rel_path).resolve()
+         meta_out = {"relative_path": rel_path}
+         meta_out.update(meta)
+         details = ", ".join(f"{k}={v}" for k, v in meta.items())
+         suffix = f" ({details})" if details else ""
+         logger.info("Materialized %s -> %s%s", VECTOR_SCHEMA, full_path, suffix)
+         return meta_out
+
+     def _work_metadata(task: MetadataTask):
+         res = materialize_metadata(runtime, task)
+         if not res:
+             return None
+         rel_path, meta = res
+         full_path = (runtime.artifacts_root / rel_path).resolve()
+         meta_out = {"relative_path": rel_path}
+         meta_out.update(meta)
+         details = ", ".join(f"{k}={v}" for k, v in meta.items())
+         suffix = f" ({details})" if details else ""
+         logger.info("Materialized %s -> %s%s", VECTOR_SCHEMA_METADATA, full_path, suffix)
+         return meta_out
+
+     job_specs: list[tuple[str, str, Callable[[], object], Optional[Path]]] = []
+
+     schema_task = tasks_by_kind.get("schema")
+     if schema_task and (required_artifacts is None or VECTOR_SCHEMA in required_artifacts):
+         job_specs.append(
+             (
+                 "schema",
+                 VECTOR_SCHEMA,
+                 lambda task=schema_task: _work_schema(task),
+                 schema_task.source_path,
+             )
+         )
+
+     metadata_task = tasks_by_kind.get("metadata")
+     if metadata_task and (required_artifacts is None or VECTOR_SCHEMA_METADATA in required_artifacts):
+         job_specs.append(
+             (
+                 "metadata",
+                 VECTOR_SCHEMA_METADATA,
+                 lambda task=metadata_task: _work_metadata(task),
+                 metadata_task.source_path,
+             )
+         )
+
+     scaler_task = tasks_by_kind.get("scaler")
+     if scaler_task and (required_artifacts is None or SCALER_STATISTICS in required_artifacts):
+         job_specs.append(
+             (
+                 "scaler",
+                 SCALER_STATISTICS,
+                 lambda task=scaler_task: _work_scaler(task),
+                 scaler_task.source_path,
+             )
+         )
+
+     total_jobs = len(job_specs)
+     for idx, (job_label, artifact_key, job_work, config_path) in enumerate(job_specs, start=1):
+         # Prefix sections with a phase label for visuals; keep path-based detail.
+         path_sections = sections_from_path(tasks_root, config_path or tasks_root)
+         sections = ("Build Tasks",) + tuple(path_sections[1:])
+         result = run_job(
+             sections=sections,
+             label=job_label,
+             visuals=effective_provider,
+             progress_style=effective_style,
+             level=effective_level,
+             runtime=runtime,
+             work=job_work,
+             idx=idx,
+             total=total_jobs,
+         )
+         if result:
+             artifacts[artifact_key] = result

      new_state = BuildState(config_hash=config_hash)
      for key, info in artifacts.items():
          relative_path = info["relative_path"]
          meta = {k: v for k, v in info.items() if k != "relative_path"}
          new_state.register(key, relative_path, meta=meta)
-         details = ", ".join(f"{k}={v}" for k, v in meta.items())
-         suffix = f" ({details})" if details else ""
-         print(f"[build] {key} -> {relative_path}{suffix}")

      save_build_state(new_state, state_path)
-     print("[ok] Build completed.")
+     return True
+
+
+ def handle(
+     project: str,
+     *,
+     force: bool = False,
+     cli_visuals: str | None = None,
+     cli_progress: str | None = None,
+     workspace=None,
+ ) -> None:
+     """Materialize build artifacts for the configured project."""
+     run_build_if_needed(
+         project,
+         force=force,
+         cli_visuals=cli_visuals,
+         cli_progress=cli_progress,
+         workspace=workspace,
+     )
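The new run_build_if_needed() above replaces the old handle()-only flow and reports whether a build actually ran. A minimal sketch of how it might be driven directly, assuming the names shown in the diff; the project path and the choice of required artifact are illustrative, not taken from the package docs:

    from pathlib import Path

    from datapipeline.cli.commands.build import run_build_if_needed
    from datapipeline.services.constants import SCALER_STATISTICS

    # Rebuild only when the cached config hash is stale or the scaler
    # statistics artifact is missing; True means a build was performed.
    performed = run_build_if_needed(
        Path("example"),                         # assumed project directory
        required_artifacts={SCALER_STATISTICS},
    )
    print("built" if performed else "skipped")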
datapipeline/cli/commands/contract.py
@@ -0,0 +1,367 @@
+ import sys
+ from pathlib import Path
+
+ from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
+ from datapipeline.services.entrypoints import read_group_entries, inject_ep
+ from datapipeline.services.constants import FILTERS_GROUP, MAPPERS_GROUP
+ from datapipeline.services.project_paths import (
+     sources_dir as resolve_sources_dir,
+     streams_dir as resolve_streams_dir,
+     ensure_project_scaffold,
+     resolve_project_yaml_path,
+ )
+ from datapipeline.services.scaffold.mappers import attach_source_to_domain
+ import re
+
+
+ def _pick_from_list(prompt: str, options: list[str]) -> str:
+     print(prompt, file=sys.stderr)
+     for i, opt in enumerate(options, 1):
+         print(f" [{i}] {opt}", file=sys.stderr)
+     while True:
+         sel = input("> ").strip()
+         if sel.isdigit():
+             idx = int(sel)
+             if 1 <= idx <= len(options):
+                 return options[idx - 1]
+         print("Please enter a number from the list.", file=sys.stderr)
+
+
+ def handle(
+     *,
+     plugin_root: Path | None = None,
+     use_identity: bool = False,
+ ) -> None:
+     root_dir, name, pyproject = pkg_root(plugin_root)
+     # Select contract type: Ingest (source->stream) or Composed (streams->stream)
+     print("Select contract type:", file=sys.stderr)
+     print(" [1] Ingest (source → stream)", file=sys.stderr)
+     print(" [2] Composed (streams → stream)", file=sys.stderr)
+     sel = input("> ").strip()
+     if sel == "2":
+         if use_identity:
+             print("[error] --identity is only supported for ingest contracts.", file=sys.stderr)
+             raise SystemExit(2)
+         # Defer to composed scaffolder (fully interactive)
+         scaffold_conflux(
+             stream_id=None,
+             inputs=None,
+             mapper_path=None,
+             with_mapper_stub=True,
+             plugin_root=plugin_root,
+         )
+         return
+
+     # Discover sources by scanning sources_dir YAMLs
+     # Default to dataset-scoped project config
+     proj_path = resolve_project_yaml_path(root_dir)
+     # Ensure a minimal project scaffold so we can resolve dirs interactively
+     ensure_project_scaffold(proj_path)
+     sources_dir = resolve_sources_dir(proj_path)
+     source_options: list[str] = []
+     if sources_dir.exists():
+         # Recursively scan YAMLs and read declared source id (alias)
+         from datapipeline.utils.load import load_yaml
+         from datapipeline.services.constants import PARSER_KEY, LOADER_KEY, SOURCE_ID_KEY
+         for p in sorted(sources_dir.rglob("*.y*ml")):
+             try:
+                 data = load_yaml(p)
+             except Exception:
+                 continue
+             if isinstance(data, dict) and isinstance(data.get(PARSER_KEY), dict) and isinstance(data.get(LOADER_KEY), dict):
+                 alias = data.get(SOURCE_ID_KEY)
+                 if isinstance(alias, str):
+                     source_options.append(alias)
+     source_options = sorted(set(source_options))
+     if not source_options:
+         print("[error] No sources found. Create one first (jerry source add ...)")
+         raise SystemExit(2)
+
+     src_key = _pick_from_list(
+         "Select a source for the contract:", source_options)
+     # Expect aliases as 'provider.dataset' (from source file's id)
+     parts = src_key.split(".", 1)
+     if len(parts) != 2:
+         print("[error] Source alias must be 'provider.dataset' (from source file's id)", file=sys.stderr)
+         raise SystemExit(2)
+     provider, dataset = parts[0], parts[1]
+
+     # Discover domains by scanning the package, fallback to EPs if needed
+     base = resolve_base_pkg_dir(root_dir, name)
+     domain_options = []
+     for dirname in ("domains",):
+         dom_dir = base / dirname
+         if dom_dir.exists():
+             domain_options.extend(
+                 [p.name for p in dom_dir.iterdir() if p.is_dir()
+                  and (p / "model.py").exists()]
+             )
+     domain_options = sorted(set(domain_options))
+     if not domain_options:
+         domain_options = sorted(
+             read_group_entries(pyproject, FILTERS_GROUP).keys())
+     if not domain_options:
+         print("[error] No domains found. Create one first (jerry domain add ...)")
+         raise SystemExit(2)
+
+     dom_name = _pick_from_list(
+         "Select a domain to contract with:", domain_options)
+
+     def _slug(s: str) -> str:
+         s = s.strip().lower()
+         s = re.sub(r"[^a-z0-9]+", "_", s)
+         return s.strip("_")
+
+     if use_identity:
+         mapper_ep = "identity"
+         print("[ok] Using built-in mapper entry point 'identity'.")
+     else:
+         # create mapper + EP (domain.origin)
+         attach_source_to_domain(
+             domain=dom_name,
+             provider=provider,
+             dataset=dataset,
+             root=plugin_root,
+         )
+         ep_key = f"{_slug(dom_name)}.{_slug(dataset)}"
+         print(f"[ok] Registered mapper entry point as '{ep_key}'.")
+         mapper_ep = ep_key
+
+     # Derive canonical stream id as domain.dataset[.variant]
+     print("Optional variant suffix (press Enter to skip):", file=sys.stderr)
+     variant = input("> ").strip()
+     if variant:
+         canonical_alias = f"{_slug(dom_name)}.{_slug(dataset)}.{_slug(variant)}"
+     else:
+         canonical_alias = f"{_slug(dom_name)}.{_slug(dataset)}"
+
+     # Inject per-file canonical stream into streams directory
+     streams_path = resolve_streams_dir(proj_path)
+
+     # canonical_alias and mapper_ep defined above
+     # Write a single-file canonical spec into streams directory, matching
+     # ContractConfig schema with helpful commented placeholders per stage.
+     try:
+         # Ensure streams_path is a directory path
+         streams_dir = streams_path if streams_path.is_dir() else streams_path.parent
+         streams_dir.mkdir(parents=True, exist_ok=True)
+         cfile = streams_dir / f"{canonical_alias}.yaml"
+         # Build a richer scaffold as YAML text to preserve comments
+         scaffold = f"""
+ kind: ingest
+ source: {src_key}
+ id: {canonical_alias}  # format: domain.dataset.(variant)
+
+ mapper:
+   entrypoint: {mapper_ep}
+   args: {{}}
+
+ # partition_by: <field or [fields]>
+ # sort_batch_size: 100000  # in-memory sort chunk size
+
+ record:  # record-level transforms
+   - filter: {{ operator: ge, field: time, comparand: "${{start_time}}" }}
+   - filter: {{ operator: le, field: time, comparand: "${{end_time}}" }}
+   # - floor_time: {{ resolution: 10m }}
+   # - lag: {{ lag: 10m }}
+
+ # stream:  # per-feature transforms (input sorted by id,time)
+ #   - ensure_ticks: {{ tick: 10m }}
+ #   - granularity: {{ mode: first }}
+ #   - fill: {{ statistic: median, window: 6, min_samples: 1 }}
+
+ # debug:  # optional validation-only checks
+ #   - lint: {{ mode: warn, tick: 10m }}
+ """
+         with cfile.open("w", encoding="utf-8") as f:
+             f.write(scaffold)
+         print(f"[new] canonical spec: {cfile}")
+     except Exception as e:
+         print(f"[error] Failed to write canonical spec: {e}", file=sys.stderr)
+
+
+ def scaffold_conflux(
+     *,
+     stream_id: str | None,
+     inputs: str | None,
+     mapper_path: str | None,
+     with_mapper_stub: bool,
+     plugin_root: Path | None,
+ ) -> None:
+     """Scaffold a composed (multi-input) contract and optional mapper stub.
+
+     inputs: comma-separated list of "[alias=]ref[@stage]" strings.
+     mapper_path default: <pkg>.domains.<domain>:mapper where domain = stream_id.split('.')[0]
+     """
+     root_dir, name, _ = pkg_root(plugin_root)
+     # Resolve default project path early for interactive selections
+     proj_path = resolve_project_yaml_path(root_dir)
+     ensure_project_scaffold(proj_path)
+     # Defer target domain selection until after choosing inputs
+
+     # We will write the contract after selecting inputs and target domain
+     # Build inputs string first: interactive select, then target domain
+     if not inputs:
+         # Interactive selection of canonical streams (scan recursively, read ids)
+         streams: list[str] = []
+         sdir = resolve_streams_dir(proj_path)
+         if sdir.exists():
+             from datapipeline.utils.load import load_yaml
+             from datapipeline.services.constants import STREAM_ID_KEY
+             for p in sorted(sdir.rglob("*.y*ml")):
+                 try:
+                     data = load_yaml(p)
+                 except Exception:
+                     continue
+                 if isinstance(data, dict) and data.get("kind") in {"ingest", "composed"}:
+                     sid = data.get(STREAM_ID_KEY)
+                     if isinstance(sid, str) and sid:
+                         streams.append(sid)
+         streams = sorted(set(streams))
+         if not streams:
+             print(
+                 "[error] No canonical streams found. Create them first via 'jerry contract' (ingest).", file=sys.stderr)
+             raise SystemExit(2)
+         print(
+             "Select one or more input streams (comma-separated numbers):", file=sys.stderr)
+         for i, sid in enumerate(streams, 1):
+             print(f" [{i}] {sid}", file=sys.stderr)
+         sel = input("> ").strip()
+         try:
+             idxs = [int(x) for x in sel.split(',') if x.strip()]
+         except ValueError:
+             print("[error] Invalid selection.", file=sys.stderr)
+             raise SystemExit(2)
+         picked = []
+         for i in idxs:
+             if 1 <= i <= len(streams):
+                 picked.append(streams[i-1])
+         if not picked:
+             print("[error] No inputs selected.", file=sys.stderr)
+             raise SystemExit(2)
+         # Build default aliases using domain+variant to avoid collisions.
+         # Stream id format: domain.dataset.variant (variant optional)
+         built = []
+         for ref in picked:
+             parts = ref.split(".")
+             if len(parts) >= 3:
+                 domain, variant = parts[0], parts[-1]
+                 alias = f"{domain}_{variant}"
+             elif len(parts) == 2:
+                 # No explicit variant -> use domain as alias
+                 alias = parts[0]
+             else:
+                 # Fallback to full ref if unexpected
+                 alias = ref
+             built.append(f"{alias}={ref}")
+         inputs = ",".join(built)
+
+     # YAML list items do not need commas; avoid embedding commas in item text
+     inputs_list = "\n - ".join(
+         s.strip() for s in inputs.split(",") if s.strip()
+     )
+
+     # If no stream_id, select target domain now and derive stream id (mirror ingest flow)
+     if not stream_id:
+         base = resolve_base_pkg_dir(root_dir, name)
+         domain_options: list[str] = []
+         dom_dir = base / "domains"
+         if dom_dir.exists():
+             domain_options.extend(
+                 [p.name for p in dom_dir.iterdir() if p.is_dir()
+                  and (p / "model.py").exists()]
+             )
+         domain_options = sorted(set(domain_options))
+         if not domain_options:
+             print("[error] No domains found. Create one first (jerry domain add ...)")
+             raise SystemExit(2)
+         print("Select a target domain for the composed stream:", file=sys.stderr)
+         for i, opt in enumerate(domain_options, 1):
+             print(f" [{i}] {opt}", file=sys.stderr)
+         sel = input("> ").strip()
+         try:
+             idx = int(sel)
+             if idx < 1 or idx > len(domain_options):
+                 raise ValueError
+         except Exception:
+             print("[error] Invalid selection.", file=sys.stderr)
+             raise SystemExit(2)
+         domain = domain_options[idx - 1]
+         stream_id = f"{domain}.processed"
+         # Default mapper path uses import-safe package dir, not project name
+         pkg_base = resolve_base_pkg_dir(root_dir, name).name
+         mapper_path = mapper_path or f"{pkg_base}.mappers.{domain}:mapper"
+     else:
+         domain = stream_id.split('.')[0]
+         pkg_base = resolve_base_pkg_dir(root_dir, name).name
+         mapper_path = mapper_path or f"{pkg_base}.mappers.{domain}:mapper"
+
+     # Optional mapper stub under mappers/
+     if with_mapper_stub:
+         base = resolve_base_pkg_dir(root_dir, name)
+         map_pkg_dir = base / "mappers"
+         map_pkg_dir.mkdir(parents=True, exist_ok=True)
+         (map_pkg_dir / "__init__.py").touch(exist_ok=True)
+         mapper_file = map_pkg_dir / f"{domain}.py"
+         if not mapper_file.exists():
+             mapper_file.write_text(
+                 """
+ from typing import Iterator, Mapping
+ from datapipeline.domain.record import TemporalRecord
+
+
+ def mapper(
+     inputs: Mapping[str, Iterator[TemporalRecord]],
+     *, driver: str | None = None, aux: Mapping[str, Iterator[TemporalRecord]] | None = None, context=None, **params
+ ) -> Iterator[TemporalRecord]:
+     # TODO: implement domain math; inputs are ordered/regularized; aux is raw
+     key = driver or next(iter(inputs.keys()))
+     for rec in inputs[key]:
+         yield rec  # replace with your dataclass and computation
+ """.lstrip()
+             )
+             print(f"[new] {mapper_file}")
+         # Register mapper entry point under datapipeline.mappers
+         # Choose EP name equal to stream_id for clarity/reuse
+         ep_key = stream_id
+         # If mapper_path looks like a dotted target (module:attr), use it; otherwise build default target
+         package_name = base.name  # filesystem package dir is import-safe (underscored)
+         default_target = f"{package_name}.mappers.{domain}:mapper"
+         ep_target = mapper_path if (
+             mapper_path and ":" in mapper_path) else default_target
+         pyproj_path = root_dir / "pyproject.toml"
+         try:
+             toml_text = pyproj_path.read_text()
+             updated = inject_ep(toml_text, MAPPERS_GROUP, ep_key, ep_target)
+             if updated != toml_text:
+                 pyproj_path.write_text(updated)
+                 print(
+                     f"[ok] Registered mapper entry point '{ep_key}' -> {ep_target}")
+         except FileNotFoundError:
+             print(
+                 "[info] pyproject.toml not found; skipping entry point registration", file=sys.stderr)
+         # From here on, reference the EP name in the YAML
+         mapper_path = ep_key
+     # Contract file path (now that stream_id is known)
+     ensure_project_scaffold(proj_path)
+     streams_path = resolve_streams_dir(proj_path)
+     streams_dir = streams_path if streams_path.is_dir() else streams_path.parent
+     streams_dir.mkdir(parents=True, exist_ok=True)
+     cfile = streams_dir / f"{stream_id}.yaml"
+     if cfile.exists():
+         print(f"[info] Contract already exists, skipping: {cfile}")
+         return
+
+     yaml_text = f"""
+ kind: composed
+ id: {stream_id}  # format: domain.dataset.(variant)
+ # partition_by: <field or [fields]>
+ inputs:
+  - {inputs_list}
+
+ mapper:
+   entrypoint: {mapper_path}
+   args: {{ driver: {(inputs.split(',')[0].split('=')[0].strip() if '=' in inputs.split(',')[0] else inputs.split(',')[0].strip())} }}
+ """
+     cfile.write_text(yaml_text.strip() + "\n", encoding="utf-8")
+     print(f"[new] composed contract: {cfile}")
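In scaffold_conflux() above, default input aliases are derived from the selected stream ids: domain plus variant when a variant is present, otherwise just the domain. A small sketch of that rule, using the two example stream ids shipped in the plugin skeleton; the helper name default_alias is hypothetical, not part of the package:

    # Mirrors the alias-building loop in scaffold_conflux (hypothetical helper).
    def default_alias(ref: str) -> str:
        parts = ref.split(".")
        if len(parts) >= 3:          # domain.dataset.variant -> domain_variant
            return f"{parts[0]}_{parts[-1]}"
        if len(parts) == 2:          # domain.dataset -> domain
            return parts[0]
        return ref                   # unexpected shape -> keep the full ref

    refs = ["time.ticks.hour_sin", "time.ticks.linear"]
    print(",".join(f"{default_alias(r)}={r}" for r in refs))
    # time_hour_sin=time.ticks.hour_sin,time_linear=time.ticks.linear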
datapipeline/cli/commands/domain.py
@@ -1,9 +1,14 @@
+ from pathlib import Path
+
  from datapipeline.services.scaffold.domain import create_domain


- def handle(subcmd: str, domain: str | None) -> None:
+ def handle(subcmd: str, domain: str | None, *, plugin_root: Path | None = None) -> None:
      if subcmd in {"create", "add"}:
          if not domain:
-             print("[error] --domain is required")
+             print(
+                 "[error] Domain name is required. Use 'jerry domain add <name>' "
+                 "or pass -n/--name."
+             )
              raise SystemExit(2)
-         create_domain(domain=domain, root=None)
+         create_domain(domain=domain, root=plugin_root)