jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +5 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +54 -10
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +76 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.1.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.1.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/cli/visuals/sources_rich.py
@@ -0,0 +1,414 @@
+ from contextlib import contextmanager
+ from typing import Iterator, Any, Optional, Deque, Dict, Tuple
+ from pathlib import Path
+ from math import ceil
+ import logging
+ import os
+
+ from collections import deque
+
+ from rich.live import Live
+ from rich.progress import (
+     Progress,
+     ProgressColumn,
+     SpinnerColumn,
+     TextColumn,
+     BarColumn,
+     MofNCompleteColumn,
+     TaskProgressColumn,
+     TimeElapsedColumn,
+     Task,
+ )
+ from rich.text import Text
+
+ from .labels import progress_meta_for_loader
+ from .common import (
+     compute_glob_root,
+     current_transport_label,
+     log_combined_stream,
+     transport_debug_lines,
+     transport_info_lines,
+ )
+ from datapipeline.runtime import Runtime
+ from datapipeline.sources.models.source import Source
+ from datapipeline.sources.transports import FsGlobTransport, FsFileTransport, HttpTransport
+ logger = logging.getLogger(__name__)
+
+
+ class AverageTimeRemainingColumn(ProgressColumn):
+     """ETA column that blends long-term and recent throughput for stability."""
+
+     max_refresh = 0.5
+
+     def __init__(
+         self,
+         compact: bool = False,
+         elapsed_when_finished: bool = False,
+         table_column: Optional[Any] = None,
+         window_seconds: float = 300.0,
+     ) -> None:
+         self.compact = compact
+         self.elapsed_when_finished = elapsed_when_finished
+         self.window_seconds = max(0.0, float(window_seconds))
+         self._history: Dict[int, Deque[Tuple[float, float]]] = {}
+         super().__init__(table_column=table_column)
+
+     def _format_seconds(self, seconds: int) -> str:
+         minutes, secs = divmod(seconds, 60)
+         hours, minutes = divmod(minutes, 60)
+         if self.compact and not hours:
+             return f"{minutes:02d}:{secs:02d}"
+         return f"{hours:d}:{minutes:02d}:{secs:02d}"
+
+     def _recent_seconds_per_item(self, task: Task) -> Optional[float]:
+         if self.window_seconds <= 0:
+             return None
+         if task.start_time is None:
+             return None
+         history = self._history.setdefault(int(task.id), deque())
+         now = task.get_time()
+         completed = float(task.completed)
+         if not history or history[-1][1] != completed:
+             history.append((now, completed))
+         cutoff = now - self.window_seconds
+         while history and history[0][0] < cutoff:
+             history.popleft()
+         if len(history) < 2:
+             return None
+         start_time, start_completed = history[0]
+         delta_completed = completed - start_completed
+         delta_time = now - start_time
+         if delta_completed <= 0 or delta_time <= 0:
+             return None
+         return delta_time / delta_completed
+
+     def render(self, task: Task) -> Text:
+         if self.elapsed_when_finished and task.finished:
+             self._history.pop(int(task.id), None)
+             elapsed = task.finished_time
+             if elapsed is None:
+                 return Text("-:--:--", style="progress.elapsed")
+             return Text(self._format_seconds(int(elapsed)), style="progress.elapsed")
+
+         style = "progress.remaining"
+         total = task.total
+         if total is None:
+             return Text("", style=style)
+         elapsed = task.elapsed
+         completed = task.completed
+         remaining = task.remaining
+         if not completed or elapsed is None or remaining is None:
+             return Text("--:--" if self.compact else "-:--:--", style=style)
+         recent = self._recent_seconds_per_item(task)
+         avg_seconds_per_item = recent if recent is not None else (elapsed / completed)
+         if avg_seconds_per_item <= 0:
+             return Text("--:--" if self.compact else "-:--:--", style=style)
+         eta_seconds = int(max(0, ceil(remaining * avg_seconds_per_item)))
+         return Text(self._format_seconds(eta_seconds), style=style)
+
+
+ class _RichSourceProxy(Source):
+     def __init__(self, *, inner: Source, alias: str, verbosity: int, progress: Progress, unit: Optional[str] = None, shared_task_id: Optional[int] = None, finalize: Optional[callable] = None, started: Optional[callable] = None):
+         self._inner = inner
+         self._alias = alias
+         self._verbosity = max(0, min(verbosity, 2))
+         self._progress = progress
+         self._task_id = None
+         self._shared_task_id = shared_task_id
+         self._unit = unit
+         self._emitted = 0
+         self._finalize = finalize
+         self._started = started
+
+     def _format_text(self, message: str) -> str:
+         # Plain alias prefix to avoid Rich markup issues
+         return f"[{self._alias}] {message}" if message else f"[{self._alias}]"
+
+     def _safe_count(self) -> Optional[int]:
+         try:
+             return self._inner.count()
+         except Exception:
+             return None
+
+     def stream(self) -> Iterator[Any]:
+         loader = getattr(self._inner, "loader", None)
+         desc, unit = progress_meta_for_loader(loader)
+         self._unit = unit
+         prefix, sep, suffix = desc.partition(": ")
+         header = f"{prefix}:" if sep else desc
+         tail = suffix if sep else None
+
+         transport = getattr(loader, "transport", None)
+         glob_root: Optional[Path] = None
+         if isinstance(transport, FsGlobTransport):
+             glob_root = compute_glob_root(
+                 getattr(transport, "files", []))
+
+         def compose_text(name: Optional[str]) -> str:
+             if name:
+                 base = header if sep else desc
+                 return f"{base} {name}".rstrip()
+             if tail:
+                 return f"{header} {tail}".rstrip()
+             return f"{desc}"
+
+         # Create task lazily with no total (DEBUG) or reuse shared spinner (INFO)
+         if self._verbosity >= 2 or self._shared_task_id is None:
+             self._task_id = self._progress.add_task(
+                 "", start=False, total=None, text=self._format_text(compose_text(None)))
+
+         # If verbose, try to resolve total and show a real bar
+         if self._verbosity >= 2 and self._task_id is not None:
+             total = self._safe_count()
+             if total is not None:
+                 self._progress.update(self._task_id, total=total)
+
+         emitted = 0
+         last_path_label: Optional[str] = None
+         shared_init_done = False
+         started_logged = False
+
+         if self._task_id is not None:
+             self._progress.start_task(self._task_id)
+
+         try:
+             for item in self._inner.stream():
+                 current_label = current_transport_label(
+                     transport, glob_root=glob_root
+                 )
+                 # On first item: emit Start + transport details
+                 if not started_logged:
+                     try:
+                         if callable(self._started):
+                             info_lines = transport_info_lines(transport)
+                             debug_lines = transport_debug_lines(
+                                 transport) if self._verbosity >= 2 else []
+                             self._started(self._alias, info_lines, debug_lines)
+                     except Exception:
+                         pass
+                     started_logged = True
+                 # Initialize shared spinner text on first item (INFO)
+                 if not shared_init_done and self._shared_task_id is not None:
+                     base = current_label if current_label else None
+                     text0 = self._format_text(compose_text(base))
+                     self._progress.update(self._shared_task_id, text=text0)
+                     shared_init_done = True
+                 if current_label and current_label != last_path_label:
+                     last_path_label = current_label
+                     text = self._format_text(compose_text(current_label))
+                     if self._verbosity >= 2 and self._task_id is not None:
+                         self._progress.update(self._task_id, text=text)
+                     elif self._shared_task_id is not None:
+                         self._progress.update(self._shared_task_id, text=text)
+                 if self._verbosity >= 2 and self._task_id is not None:
+                     self._progress.advance(self._task_id, 1)
+                 emitted += 1
+                 yield item
+         finally:
+             try:
+                 if self._verbosity >= 2 and self._task_id is not None:
+                     self._progress.update(self._task_id, completed=emitted)
+                     self._progress.stop_task(self._task_id)
+                 unit = self._unit or "item"
+                 unit_suffix = "" if emitted == 1 else "s"
+                 completed_text = f"[{self._alias}] Stream complete ({emitted} {unit}{unit_suffix})"
+                 if callable(self._finalize):
+                     try:
+                         self._finalize(self._alias, completed_text)
+                     except Exception:
+                         pass
+             except Exception:
+                 pass
+             # Defer logging of completion to the session footer to avoid interleaving
+             self._emitted = emitted
+             # No explicit end separator; completion line is sufficient
+
+
+ def _style_mode(progress_style: str, log_level: int | None) -> str:
+     mode = (progress_style or "auto").lower()
+     if mode == "auto":
+         level = log_level if log_level is not None else logging.INFO
+         return "bars" if level <= logging.DEBUG else "spinner"
+     return mode
+
+
+ @contextmanager
+ def visual_sources(runtime: Runtime, log_level: int | None, progress_style: str = "auto"):
+     level = log_level if log_level is not None else logging.INFO
+     if level > logging.INFO:
+         yield
+         return
+
+     style_mode = _style_mode(progress_style, log_level)
+     if style_mode == "off":
+         yield
+         return
+
+     verbosity = 2 if style_mode == "bars" else 1
+
+     # Build a console on stderr for visuals/logs
+     from rich.console import Console as _Console
+     import sys as _sys
+     _vis_console = _Console(file=_sys.stderr, markup=False,
+                             highlight=False, soft_wrap=True)
+
+     # Columns tuned by style; alias is embedded in text
+     if verbosity >= 2:
+         columns = [
+             TextColumn("{task.fields[text]}", markup=False),
+             BarColumn(),
+             MofNCompleteColumn(),
+             TaskProgressColumn(),
+             TimeElapsedColumn(),
+             AverageTimeRemainingColumn(),
+         ]
+     else:
+         columns = [
+             TextColumn("{task.fields[text]}", markup=False),
+             SpinnerColumn(spinner_name="runner"),
+         ]
+
+     # Keep Live output transient so the spinner/bars disappear once completed
+     progress = Progress(*columns, transient=True, console=_vis_console)
+
+     # Install a temporary Rich logging handler for clean log rendering during Live
+     class _DedupFilter(logging.Filter):
+         def __init__(self):
+             super().__init__()
+             self._last: tuple[int, str] | None = None
+
+         # type: ignore[override]
+         def filter(self, record: logging.LogRecord) -> bool:
+             try:
+                 msg = record.getMessage()
+             except Exception:
+                 msg = record.msg if isinstance(
+                     record.msg, str) else str(record.msg)
+             key = (record.levelno, msg)
+             if self._last == key:
+                 return False
+             self._last = key
+             return True
+
+     rich_handler = None
+     root_logger = logging.getLogger()
+     old_handlers = list(root_logger.handlers)
+     old_filters = list(root_logger.filters)
+     try:
+         from rich.logging import RichHandler
+         console = _vis_console
+         rich_handler = RichHandler(
+             console=console,
+             show_time=False,
+             show_level=False,
+             show_path=False,
+             markup=False,
+             rich_tracebacks=False,
+         )
+     except Exception:
+         rich_handler = None
+
+     reg = runtime.registries.stream_sources
+     originals = dict(reg.items())
+     proxies: dict[str, _RichSourceProxy] = {}
+
+     # Swap handlers if RichHandler is available
+     if rich_handler is not None:
+         # Replace handlers with Rich and add a simple de-dup filter to avoid
+         # double-rendered lines if another handler slips in.
+         root_logger.handlers = [rich_handler]
+         dedup = _DedupFilter()
+         root_logger.addFilter(dedup)
+
+     renderable = progress
+
+     with Live(renderable, console=_vis_console, refresh_per_second=10, transient=True) as live:
+         try:
+             shared_task_id: Optional[int] = None
+             active_alias: Optional[str] = None
+             pending_starts: list[tuple[str, list[tuple[str, str]]]] = []
+             seen_messages: set[str] = set()
+
+             def _emit_entries(entries: list[tuple[str, str]]) -> None:
+                 for level, line in entries:
+                     key = f"{level}:{line}"
+                     if key in seen_messages:
+                         continue
+                     seen_messages.add(key)
+                     if level == "debug":
+                         logger.debug(line)
+                     else:
+                         logger.info(line)
+
+             def _flush_next_start() -> None:
+                 nonlocal active_alias
+                 if active_alias is not None:
+                     return
+                 while pending_starts:
+                     next_alias, entries = pending_starts.pop(0)
+                     if not entries:
+                         continue
+                     active_alias = next_alias
+                     _emit_entries(entries)
+                     break
+
+             def _append_completed(alias: str, text: str):
+                 _emit_entries([("info", f"{text} ✔")])
+                 nonlocal active_alias
+                 if active_alias == alias:
+                     active_alias = None
+                 _flush_next_start()
+
+             def _append_started(alias: str, info_lines: list[str], debug_lines: list[str]):
+                 nonlocal active_alias
+                 entries: list[tuple[str, str]] = []
+                 for line in info_lines:
+                     entries.append(("info", f"[{alias}] {line}"))
+                 for line in debug_lines:
+                     entries.append(("debug", f"[{alias}] {line}"))
+                 if not entries:
+                     entries = [("info", f"[{alias}] Stream starting")]
+                 if active_alias is None:
+                     active_alias = alias
+                     _emit_entries(entries)
+                     return
+                 pending_starts.append((alias, entries))
+             if verbosity < 2:
+                 shared_task_id = progress.add_task("", total=None, text="")
+             for alias, src in originals.items():
+                 # Composed/virtual sources (no loader): attach header-only proxy to emit when streamed
+                 if getattr(src, "loader", None) is None:
+                     class _ComposedHeaderProxy:
+                         def __init__(self, inner, alias: str):
+                             self._inner = inner
+                             self._alias = alias
+
+                         def stream(self):
+                             detail_entries: Optional[list[str]] = None
+                             try:
+                                 spec = getattr(self._inner, "_spec", None)
+                                 inputs = getattr(spec, "inputs", None)
+                                 if isinstance(inputs, (list, tuple)) and inputs:
+                                     detail_entries = [str(item)
+                                                       for item in inputs]
+                             except Exception:
+                                 detail_entries = None
+                             log_combined_stream(self._alias, detail_entries)
+                             yield from self._inner.stream()
+
+                     reg.register(alias, _ComposedHeaderProxy(src, alias))
+                 else:
+                     proxy = _RichSourceProxy(inner=src, alias=alias, verbosity=verbosity, progress=progress,
+                                              shared_task_id=shared_task_id, finalize=_append_completed, started=_append_started)
+                     proxies[alias] = proxy
+                     reg.register(alias, proxy)
+             yield
+         finally:
+             # Restore original sources
+             for alias, src in originals.items():
+                 reg.register(alias, src)
+     # After Live finishes: restore logging handlers
+     if rich_handler is not None:
+         # Restore original handlers and filters
+         root_logger.handlers = old_handlers
+         root_logger.filters = old_filters
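For orientation, a minimal sketch (not part of the diff) of how the new AverageTimeRemainingColumn can be combined with stock rich columns; the import path assumes the module location shown above.

    import time
    from rich.progress import Progress, TextColumn, BarColumn, MofNCompleteColumn

    from datapipeline.cli.visuals.sources_rich import AverageTimeRemainingColumn  # assumed location

    progress = Progress(
        TextColumn("{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        AverageTimeRemainingColumn(window_seconds=60.0),  # ETA from the last ~60s of throughput
    )
    with progress:
        task = progress.add_task("demo", total=100)
        for _ in range(100):
            time.sleep(0.05)  # simulated work
            progress.advance(task)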
datapipeline/config/catalog.py
@@ -1,5 +1,5 @@
  from typing import Dict, Optional, Any, List, Mapping, Union, Literal
- from pydantic import BaseModel, Field, ConfigDict
+ from pydantic import BaseModel, Field, ConfigDict, model_validator


  class EPArgs(BaseModel):
@@ -14,8 +14,20 @@ class SourceConfig(BaseModel):


  class ContractConfig(BaseModel):
-     source_id: str
-     stream_id: str
+     """Unified contract model with explicit kind.
+
+     - kind = 'ingest': exactly one raw source via source alias
+     - kind = 'composed': inputs must reference canonical streams only
+     """
+     kind: Literal['ingest', 'composed']
+     id: str
+
+     # Ingest-only
+     source: Optional[str] = Field(default=None)
+
+     # Composed-only: list of "[alias=]stream_id" (streams only)
+     inputs: Optional[List[str]] = Field(default=None)
+
      mapper: Optional[EPArgs] = None
      partition_by: Optional[Union[str, List[str]]] = Field(default=None)
      sort_batch_size: int = Field(default=100_000)
@@ -24,6 +36,28 @@ class ContractConfig(BaseModel):
      # Optional debug-only transforms (applied after stream transforms)
      debug: Optional[List[Mapping[str, Any]]] = Field(default=None)

+     @model_validator(mode='after')
+     def _validate_mode(self):
+         if self.kind == 'ingest':
+             if not self.source:
+                 raise ValueError("ingest contract requires 'source'")
+             if self.inputs:
+                 raise ValueError("ingest contract cannot define 'inputs'")
+         elif self.kind == 'composed':
+             if not self.inputs or not isinstance(self.inputs, list):
+                 raise ValueError("composed contract requires 'inputs' (list of stream ids)")
+             if self.source:
+                 raise ValueError("composed contract cannot define 'source'")
+             # Enforce simple grammar: alias=stream_id or stream_id, no stages/prefixes
+             for item in self.inputs:
+                 if '@' in item:
+                     raise ValueError("composed inputs may not include '@stage'; streams are aligned by default")
+                 # allow alias=ref
+                 ref = item.split('=', 1)[1] if '=' in item else item
+                 if ':' in ref:
+                     raise ValueError("composed inputs must reference canonical stream ids only")
+         return self
+

  class StreamsConfig(BaseModel):
      raw: Dict[str, SourceConfig] = Field(default_factory=dict)
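A minimal sketch of how the kind-based validation above behaves, assuming the model lives in datapipeline.config.catalog and borrowing stream/source names from the plugin skeleton files listed earlier.

    from pydantic import ValidationError
    from datapipeline.config.catalog import ContractConfig  # assumed module for the model above

    # kind='ingest' requires a raw source alias and forbids 'inputs'
    ingest = ContractConfig(kind="ingest", id="time.ticks", source="synthetic.ticks")

    # kind='composed' takes canonical stream ids, optionally aliased as "alias=stream_id"
    composed = ContractConfig(kind="composed", id="time.features",
                              inputs=["ticks=time.ticks", "time.ticks.hour_sin"])

    try:
        ContractConfig(kind="composed", id="bad", inputs=["time.ticks@raw"])
    except ValidationError as err:
        print(err)  # composed inputs may not include '@stage'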
datapipeline/config/context.py
@@ -0,0 +1,214 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Optional, Sequence
+
+ from datapipeline.cli.commands.run_config import RunEntry, iter_runtime_runs
+ from datapipeline.config.dataset.dataset import FeatureDatasetConfig
+ from datapipeline.config.dataset.loader import load_dataset
+ from datapipeline.config.resolution import (
+     LogLevelDecision,
+     VisualSettings,
+     cascade,
+     resolve_log_level,
+     resolve_visuals,
+     workspace_output_defaults,
+ )
+ from datapipeline.config.workspace import WorkspaceContext
+ from datapipeline.io.output import (
+     OutputTarget,
+     resolve_output_target,
+ )
+ from datapipeline.pipeline.context import PipelineContext
+ from datapipeline.runtime import Runtime
+ from datapipeline.services.bootstrap import bootstrap
+
+
+ def _run_config_value(run_cfg, field: str):
+     """Return a run config field only when it was explicitly provided."""
+     if run_cfg is None:
+         return None
+     fields_set = getattr(run_cfg, "model_fields_set", None)
+     if fields_set is not None and field not in fields_set:
+         return None
+     return getattr(run_cfg, field, None)
+
+
+ @dataclass(frozen=True)
+ class RunProfile:
+     idx: int
+     total: int
+     entry: RunEntry
+     runtime: Runtime
+     stage: Optional[int]
+     limit: Optional[int]
+     throttle_ms: Optional[float]
+     log_decision: LogLevelDecision
+     visuals: VisualSettings
+     output: OutputTarget
+
+     @property
+     def label(self) -> str:
+         return self.entry.name or f"run{self.idx}"
+
+
+ @dataclass(frozen=True)
+ class BuildSettings:
+     visuals: str
+     progress: str
+     mode: str
+     force: bool
+
+
+ @dataclass(frozen=True)
+ class DatasetContext:
+     project: Path
+     dataset: FeatureDatasetConfig
+     runtime: Runtime
+     pipeline_context: PipelineContext
+
+     @property
+     def features(self):
+         return list(self.dataset.features or [])
+
+     @property
+     def targets(self):
+         return list(self.dataset.targets or [])
+
+
+ def load_dataset_context(project: Path | str) -> DatasetContext:
+     project_path = Path(project)
+     dataset = load_dataset(project_path, "vectors")
+     runtime = bootstrap(project_path)
+     context = PipelineContext(runtime)
+     return DatasetContext(
+         project=project_path,
+         dataset=dataset,
+         runtime=runtime,
+         pipeline_context=context,
+     )
+
+
+ def resolve_build_settings(
+     *,
+     workspace: WorkspaceContext | None,
+     cli_visuals: Optional[str],
+     cli_progress: Optional[str],
+     force_flag: bool,
+ ) -> BuildSettings:
+     shared = workspace.config.shared if workspace else None
+     build_defaults = workspace.config.build if workspace else None
+     shared_visuals = shared.visuals if shared else None
+     shared_progress = shared.progress if shared else None
+     build_mode_default = (
+         build_defaults.mode.upper() if build_defaults and build_defaults.mode else None
+     )
+     visuals = resolve_visuals(
+         cli_visuals=cli_visuals,
+         config_visuals=None,
+         workspace_visuals=shared_visuals,
+         cli_progress=cli_progress,
+         config_progress=None,
+         workspace_progress=shared_progress,
+     )
+     effective_mode = "FORCE" if force_flag else (
+         cascade(build_mode_default, "AUTO") or "AUTO")
+     effective_mode = effective_mode.upper()
+     force_build = force_flag or effective_mode == "FORCE"
+     return BuildSettings(
+         visuals=visuals.visuals,
+         progress=visuals.progress,
+         mode=effective_mode,
+         force=force_build,
+     )
+
+
+ def resolve_run_profiles(
+     project_path: Path,
+     run_entries: Sequence[RunEntry],
+     *,
+     keep: Optional[str],
+     stage: Optional[int],
+     limit: Optional[int],
+     cli_output,
+     cli_payload: Optional[str],
+     workspace: WorkspaceContext | None,
+     cli_log_level: Optional[str],
+     base_log_level: str,
+     cli_visuals: Optional[str],
+     cli_progress: Optional[str],
+     create_run: bool = False,
+ ) -> list[RunProfile]:
+     shared = workspace.config.shared if workspace else None
+     serve_defaults = workspace.config.serve if workspace else None
+     shared_visuals_default = shared.visuals if shared else None
+     shared_progress_default = shared.progress if shared else None
+     shared_log_level_default = shared.log_level if shared else None
+     serve_log_level_default = serve_defaults.log_level if serve_defaults else None
+     serve_limit_default = serve_defaults.limit if serve_defaults else None
+     serve_stage_default = serve_defaults.stage if serve_defaults else None
+     serve_throttle_default = serve_defaults.throttle_ms if serve_defaults else None
+     workspace_output_cfg = workspace_output_defaults(workspace)
+
+     profiles: list[RunProfile] = []
+     for idx, total_runs, entry, runtime in iter_runtime_runs(
+         project_path, run_entries, keep
+     ):
+         entry_name = entry.name
+         run_cfg = getattr(runtime, "run", None)
+
+         resolved_stage = cascade(stage, _run_config_value(
+             run_cfg, "stage"), serve_stage_default)
+         resolved_limit = cascade(limit, _run_config_value(
+             run_cfg, "limit"), serve_limit_default)
+         throttle_ms = cascade(
+             _run_config_value(run_cfg, "throttle_ms"),
+             serve_throttle_default,
+         )
+         log_decision = resolve_log_level(
+             cli_log_level,
+             _run_config_value(run_cfg, "log_level"),
+             serve_log_level_default,
+             shared_log_level_default,
+             fallback=str(base_log_level).upper(),
+         )
+
+         run_visuals = _run_config_value(run_cfg, "visuals")
+         run_progress = _run_config_value(run_cfg, "progress")
+         visuals = resolve_visuals(
+             cli_visuals=cli_visuals,
+             config_visuals=run_visuals,
+             workspace_visuals=shared_visuals_default,
+             cli_progress=cli_progress,
+             config_progress=run_progress,
+             workspace_progress=shared_progress_default,
+         )
+
+         runtime_output_cfg = workspace_output_cfg.model_copy() if workspace_output_cfg else None
+         target = resolve_output_target(
+             cli_output=cli_output,
+             config_output=getattr(run_cfg, "output", None),
+             default=runtime_output_cfg,
+             base_path=project_path.parent,
+             run_name=entry_name or f"run{idx}",
+             payload_override=cli_payload,
+             stage=resolved_stage,
+             create_run=create_run,
+         )
+
+         profiles.append(
+             RunProfile(
+                 idx=idx,
+                 total=total_runs,
+                 entry=entry,
+                 runtime=runtime,
+                 stage=resolved_stage,
+                 limit=resolved_limit,
+                 throttle_ms=throttle_ms,
+                 log_decision=log_decision,
+                 visuals=visuals,
+                 output=target,
+             )
+         )
+     return profiles
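A minimal usage sketch for the new context helpers, assuming they are importable as datapipeline.config.context and that the project directory follows the skeleton layout listed above.

    from pathlib import Path

    from datapipeline.config.context import (  # assumed module path
        load_dataset_context,
        resolve_build_settings,
    )

    ctx = load_dataset_context(Path("your-dataset"))
    print(len(ctx.features), "features,", len(ctx.targets), "targets")

    # With no workspace overrides, the build mode is expected to cascade to AUTO
    settings = resolve_build_settings(
        workspace=None,
        cli_visuals=None,
        cli_progress=None,
        force_flag=False,
    )
    print(settings.mode, settings.force)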