jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +5 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +54 -10
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +76 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.1.dist-info/METADATA +825 -0
- jerry_thomas-1.0.1.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/cli/visuals/sources_rich.py
ADDED

@@ -0,0 +1,414 @@
+from contextlib import contextmanager
+from typing import Iterator, Any, Optional, Deque, Dict, Tuple
+from pathlib import Path
+from math import ceil
+import logging
+import os
+
+from collections import deque
+
+from rich.live import Live
+from rich.progress import (
+    Progress,
+    ProgressColumn,
+    SpinnerColumn,
+    TextColumn,
+    BarColumn,
+    MofNCompleteColumn,
+    TaskProgressColumn,
+    TimeElapsedColumn,
+    Task,
+)
+from rich.text import Text
+
+from .labels import progress_meta_for_loader
+from .common import (
+    compute_glob_root,
+    current_transport_label,
+    log_combined_stream,
+    transport_debug_lines,
+    transport_info_lines,
+)
+from datapipeline.runtime import Runtime
+from datapipeline.sources.models.source import Source
+from datapipeline.sources.transports import FsGlobTransport, FsFileTransport, HttpTransport
+logger = logging.getLogger(__name__)
+
+
+class AverageTimeRemainingColumn(ProgressColumn):
+    """ETA column that blends long-term and recent throughput for stability."""
+
+    max_refresh = 0.5
+
+    def __init__(
+        self,
+        compact: bool = False,
+        elapsed_when_finished: bool = False,
+        table_column: Optional[Any] = None,
+        window_seconds: float = 300.0,
+    ) -> None:
+        self.compact = compact
+        self.elapsed_when_finished = elapsed_when_finished
+        self.window_seconds = max(0.0, float(window_seconds))
+        self._history: Dict[int, Deque[Tuple[float, float]]] = {}
+        super().__init__(table_column=table_column)
+
+    def _format_seconds(self, seconds: int) -> str:
+        minutes, secs = divmod(seconds, 60)
+        hours, minutes = divmod(minutes, 60)
+        if self.compact and not hours:
+            return f"{minutes:02d}:{secs:02d}"
+        return f"{hours:d}:{minutes:02d}:{secs:02d}"
+
+    def _recent_seconds_per_item(self, task: Task) -> Optional[float]:
+        if self.window_seconds <= 0:
+            return None
+        if task.start_time is None:
+            return None
+        history = self._history.setdefault(int(task.id), deque())
+        now = task.get_time()
+        completed = float(task.completed)
+        if not history or history[-1][1] != completed:
+            history.append((now, completed))
+        cutoff = now - self.window_seconds
+        while history and history[0][0] < cutoff:
+            history.popleft()
+        if len(history) < 2:
+            return None
+        start_time, start_completed = history[0]
+        delta_completed = completed - start_completed
+        delta_time = now - start_time
+        if delta_completed <= 0 or delta_time <= 0:
+            return None
+        return delta_time / delta_completed
+
+    def render(self, task: Task) -> Text:
+        if self.elapsed_when_finished and task.finished:
+            self._history.pop(int(task.id), None)
+            elapsed = task.finished_time
+            if elapsed is None:
+                return Text("-:--:--", style="progress.elapsed")
+            return Text(self._format_seconds(int(elapsed)), style="progress.elapsed")
+
+        style = "progress.remaining"
+        total = task.total
+        if total is None:
+            return Text("", style=style)
+        elapsed = task.elapsed
+        completed = task.completed
+        remaining = task.remaining
+        if not completed or elapsed is None or remaining is None:
+            return Text("--:--" if self.compact else "-:--:--", style=style)
+        recent = self._recent_seconds_per_item(task)
+        avg_seconds_per_item = recent if recent is not None else (elapsed / completed)
+        if avg_seconds_per_item <= 0:
+            return Text("--:--" if self.compact else "-:--:--", style=style)
+        eta_seconds = int(max(0, ceil(remaining * avg_seconds_per_item)))
+        return Text(self._format_seconds(eta_seconds), style=style)
+
+
+class _RichSourceProxy(Source):
+    def __init__(self, *, inner: Source, alias: str, verbosity: int, progress: Progress, unit: Optional[str] = None, shared_task_id: Optional[int] = None, finalize: Optional[callable] = None, started: Optional[callable] = None):
+        self._inner = inner
+        self._alias = alias
+        self._verbosity = max(0, min(verbosity, 2))
+        self._progress = progress
+        self._task_id = None
+        self._shared_task_id = shared_task_id
+        self._unit = unit
+        self._emitted = 0
+        self._finalize = finalize
+        self._started = started
+
+    def _format_text(self, message: str) -> str:
+        # Plain alias prefix to avoid Rich markup issues
+        return f"[{self._alias}] {message}" if message else f"[{self._alias}]"
+
+    def _safe_count(self) -> Optional[int]:
+        try:
+            return self._inner.count()
+        except Exception:
+            return None
+
+    def stream(self) -> Iterator[Any]:
+        loader = getattr(self._inner, "loader", None)
+        desc, unit = progress_meta_for_loader(loader)
+        self._unit = unit
+        prefix, sep, suffix = desc.partition(": ")
+        header = f"{prefix}:" if sep else desc
+        tail = suffix if sep else None
+
+        transport = getattr(loader, "transport", None)
+        glob_root: Optional[Path] = None
+        if isinstance(transport, FsGlobTransport):
+            glob_root = compute_glob_root(
+                getattr(transport, "files", []))
+
+        def compose_text(name: Optional[str]) -> str:
+            if name:
+                base = header if sep else desc
+                return f"{base} {name}".rstrip()
+            if tail:
+                return f"{header} {tail}".rstrip()
+            return f"{desc}"
+
+        # Create task lazily with no total (DEBUG) or reuse shared spinner (INFO)
+        if self._verbosity >= 2 or self._shared_task_id is None:
+            self._task_id = self._progress.add_task(
+                "", start=False, total=None, text=self._format_text(compose_text(None)))
+
+        # If verbose, try to resolve total and show a real bar
+        if self._verbosity >= 2 and self._task_id is not None:
+            total = self._safe_count()
+            if total is not None:
+                self._progress.update(self._task_id, total=total)
+
+        emitted = 0
+        last_path_label: Optional[str] = None
+        shared_init_done = False
+        started_logged = False
+
+        if self._task_id is not None:
+            self._progress.start_task(self._task_id)
+
+        try:
+            for item in self._inner.stream():
+                current_label = current_transport_label(
+                    transport, glob_root=glob_root
+                )
+                # On first item: emit Start + transport details
+                if not started_logged:
+                    try:
+                        if callable(self._started):
+                            info_lines = transport_info_lines(transport)
+                            debug_lines = transport_debug_lines(
+                                transport) if self._verbosity >= 2 else []
+                            self._started(self._alias, info_lines, debug_lines)
+                    except Exception:
+                        pass
+                    started_logged = True
+                # Initialize shared spinner text on first item (INFO)
+                if not shared_init_done and self._shared_task_id is not None:
+                    base = current_label if current_label else None
+                    text0 = self._format_text(compose_text(base))
+                    self._progress.update(self._shared_task_id, text=text0)
+                    shared_init_done = True
+                if current_label and current_label != last_path_label:
+                    last_path_label = current_label
+                    text = self._format_text(compose_text(current_label))
+                    if self._verbosity >= 2 and self._task_id is not None:
+                        self._progress.update(self._task_id, text=text)
+                    elif self._shared_task_id is not None:
+                        self._progress.update(self._shared_task_id, text=text)
+                if self._verbosity >= 2 and self._task_id is not None:
+                    self._progress.advance(self._task_id, 1)
+                emitted += 1
+                yield item
+        finally:
+            try:
+                if self._verbosity >= 2 and self._task_id is not None:
+                    self._progress.update(self._task_id, completed=emitted)
+                    self._progress.stop_task(self._task_id)
+                unit = self._unit or "item"
+                unit_suffix = "" if emitted == 1 else "s"
+                completed_text = f"[{self._alias}] Stream complete ({emitted} {unit}{unit_suffix})"
+                if callable(self._finalize):
+                    try:
+                        self._finalize(self._alias, completed_text)
+                    except Exception:
+                        pass
+            except Exception:
+                pass
+            # Defer logging of completion to the session footer to avoid interleaving
+            self._emitted = emitted
+            # No explicit end separator; completion line is sufficient
+
+
+def _style_mode(progress_style: str, log_level: int | None) -> str:
+    mode = (progress_style or "auto").lower()
+    if mode == "auto":
+        level = log_level if log_level is not None else logging.INFO
+        return "bars" if level <= logging.DEBUG else "spinner"
+    return mode
+
+
+@contextmanager
+def visual_sources(runtime: Runtime, log_level: int | None, progress_style: str = "auto"):
+    level = log_level if log_level is not None else logging.INFO
+    if level > logging.INFO:
+        yield
+        return
+
+    style_mode = _style_mode(progress_style, log_level)
+    if style_mode == "off":
+        yield
+        return
+
+    verbosity = 2 if style_mode == "bars" else 1
+
+    # Build a console on stderr for visuals/logs
+    from rich.console import Console as _Console
+    import sys as _sys
+    _vis_console = _Console(file=_sys.stderr, markup=False,
+                            highlight=False, soft_wrap=True)
+
+    # Columns tuned by style; alias is embedded in text
+    if verbosity >= 2:
+        columns = [
+            TextColumn("{task.fields[text]}", markup=False),
+            BarColumn(),
+            MofNCompleteColumn(),
+            TaskProgressColumn(),
+            TimeElapsedColumn(),
+            AverageTimeRemainingColumn(),
+        ]
+    else:
+        columns = [
+            TextColumn("{task.fields[text]}", markup=False),
+            SpinnerColumn(spinner_name="runner"),
+        ]
+
+    # Keep Live output transient so the spinner/bars disappear once completed
+    progress = Progress(*columns, transient=True, console=_vis_console)
+
+    # Install a temporary Rich logging handler for clean log rendering during Live
+    class _DedupFilter(logging.Filter):
+        def __init__(self):
+            super().__init__()
+            self._last: tuple[int, str] | None = None
+
+        # type: ignore[override]
+        def filter(self, record: logging.LogRecord) -> bool:
+            try:
+                msg = record.getMessage()
+            except Exception:
+                msg = record.msg if isinstance(
+                    record.msg, str) else str(record.msg)
+            key = (record.levelno, msg)
+            if self._last == key:
+                return False
+            self._last = key
+            return True
+
+    rich_handler = None
+    root_logger = logging.getLogger()
+    old_handlers = list(root_logger.handlers)
+    old_filters = list(root_logger.filters)
+    try:
+        from rich.logging import RichHandler
+        console = _vis_console
+        rich_handler = RichHandler(
+            console=console,
+            show_time=False,
+            show_level=False,
+            show_path=False,
+            markup=False,
+            rich_tracebacks=False,
+        )
+    except Exception:
+        rich_handler = None
+
+    reg = runtime.registries.stream_sources
+    originals = dict(reg.items())
+    proxies: dict[str, _RichSourceProxy] = {}
+
+    # Swap handlers if RichHandler is available
+    if rich_handler is not None:
+        # Replace handlers with Rich and add a simple de-dup filter to avoid
+        # double-rendered lines if another handler slips in.
+        root_logger.handlers = [rich_handler]
+        dedup = _DedupFilter()
+        root_logger.addFilter(dedup)
+
+    renderable = progress
+
+    with Live(renderable, console=_vis_console, refresh_per_second=10, transient=True) as live:
+        try:
+            shared_task_id: Optional[int] = None
+            active_alias: Optional[str] = None
+            pending_starts: list[tuple[str, list[tuple[str, str]]]] = []
+            seen_messages: set[str] = set()
+
+            def _emit_entries(entries: list[tuple[str, str]]) -> None:
+                for level, line in entries:
+                    key = f"{level}:{line}"
+                    if key in seen_messages:
+                        continue
+                    seen_messages.add(key)
+                    if level == "debug":
+                        logger.debug(line)
+                    else:
+                        logger.info(line)
+
+            def _flush_next_start() -> None:
+                nonlocal active_alias
+                if active_alias is not None:
+                    return
+                while pending_starts:
+                    next_alias, entries = pending_starts.pop(0)
+                    if not entries:
+                        continue
+                    active_alias = next_alias
+                    _emit_entries(entries)
+                    break
+
+            def _append_completed(alias: str, text: str):
+                _emit_entries([("info", f"{text} ✔")])
+                nonlocal active_alias
+                if active_alias == alias:
+                    active_alias = None
+                _flush_next_start()
+
+            def _append_started(alias: str, info_lines: list[str], debug_lines: list[str]):
+                nonlocal active_alias
+                entries: list[tuple[str, str]] = []
+                for line in info_lines:
+                    entries.append(("info", f"[{alias}] {line}"))
+                for line in debug_lines:
+                    entries.append(("debug", f"[{alias}] {line}"))
+                if not entries:
+                    entries = [("info", f"[{alias}] Stream starting")]
+                if active_alias is None:
+                    active_alias = alias
+                    _emit_entries(entries)
+                    return
+                pending_starts.append((alias, entries))
+            if verbosity < 2:
+                shared_task_id = progress.add_task("", total=None, text="")
+            for alias, src in originals.items():
+                # Composed/virtual sources (no loader): attach header-only proxy to emit when streamed
+                if getattr(src, "loader", None) is None:
+                    class _ComposedHeaderProxy:
+                        def __init__(self, inner, alias: str):
+                            self._inner = inner
+                            self._alias = alias
+
+                        def stream(self):
+                            detail_entries: Optional[list[str]] = None
+                            try:
+                                spec = getattr(self._inner, "_spec", None)
+                                inputs = getattr(spec, "inputs", None)
+                                if isinstance(inputs, (list, tuple)) and inputs:
+                                    detail_entries = [str(item)
+                                                      for item in inputs]
+                            except Exception:
+                                detail_entries = None
+                            log_combined_stream(self._alias, detail_entries)
+                            yield from self._inner.stream()
+
+                    reg.register(alias, _ComposedHeaderProxy(src, alias))
+                else:
+                    proxy = _RichSourceProxy(inner=src, alias=alias, verbosity=verbosity, progress=progress,
+                                             shared_task_id=shared_task_id, finalize=_append_completed, started=_append_started)
+                    proxies[alias] = proxy
+                    reg.register(alias, proxy)
+            yield
+        finally:
+            # Restore original sources
+            for alias, src in originals.items():
+                reg.register(alias, src)
+    # After Live finishes: restore logging handlers
+    if rich_handler is not None:
+        # Restore original handlers and filters
+        root_logger.handlers = old_handlers
+        root_logger.filters = old_filters
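The windowed ETA column above is self-contained, so it can be exercised outside the pipeline CLI. A minimal sketch, assuming only that rich is installed and that the wheel exposes the module as datapipeline.cli.visuals.sources_rich (per the RECORD listing above):

import time

from rich.progress import BarColumn, Progress, TextColumn

from datapipeline.cli.visuals.sources_rich import AverageTimeRemainingColumn

# Roughly the column arrangement the "bars" style uses, minus the custom text field.
progress = Progress(
    TextColumn("{task.description}"),
    BarColumn(),
    AverageTimeRemainingColumn(window_seconds=60.0),  # ETA from the last 60s of samples
)

with progress:
    task = progress.add_task("demo", total=200)
    for _ in range(200):
        time.sleep(0.01)  # stand-in for per-item work
        progress.advance(task)

Because _recent_seconds_per_item keeps a per-task deque of (time, completed) samples and evicts entries older than window_seconds, the estimate follows recent throughput and only falls back to the lifetime average (elapsed / completed) while the window is still filling.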
datapipeline/config/catalog.py
CHANGED

@@ -1,5 +1,5 @@
 from typing import Dict, Optional, Any, List, Mapping, Union, Literal
-from pydantic import BaseModel, Field, ConfigDict
+from pydantic import BaseModel, Field, ConfigDict, model_validator
 
 
 class EPArgs(BaseModel):
@@ -14,8 +14,20 @@ class SourceConfig(BaseModel):
 
 
 class ContractConfig(BaseModel):
-
-
+    """Unified contract model with explicit kind.
+
+    - kind = 'ingest': exactly one raw source via source alias
+    - kind = 'composed': inputs must reference canonical streams only
+    """
+    kind: Literal['ingest', 'composed']
+    id: str
+
+    # Ingest-only
+    source: Optional[str] = Field(default=None)
+
+    # Composed-only: list of "[alias=]stream_id" (streams only)
+    inputs: Optional[List[str]] = Field(default=None)
+
     mapper: Optional[EPArgs] = None
     partition_by: Optional[Union[str, List[str]]] = Field(default=None)
     sort_batch_size: int = Field(default=100_000)
@@ -24,6 +36,28 @@ class ContractConfig(BaseModel):
     # Optional debug-only transforms (applied after stream transforms)
     debug: Optional[List[Mapping[str, Any]]] = Field(default=None)
 
+    @model_validator(mode='after')
+    def _validate_mode(self):
+        if self.kind == 'ingest':
+            if not self.source:
+                raise ValueError("ingest contract requires 'source'")
+            if self.inputs:
+                raise ValueError("ingest contract cannot define 'inputs'")
+        elif self.kind == 'composed':
+            if not self.inputs or not isinstance(self.inputs, list):
+                raise ValueError("composed contract requires 'inputs' (list of stream ids)")
+            if self.source:
+                raise ValueError("composed contract cannot define 'source'")
+            # Enforce simple grammar: alias=stream_id or stream_id, no stages/prefixes
+            for item in self.inputs:
+                if '@' in item:
+                    raise ValueError("composed inputs may not include '@stage'; streams are aligned by default")
+                # allow alias=ref
+                ref = item.split('=', 1)[1] if '=' in item else item
+                if ':' in ref:
+                    raise ValueError("composed inputs must reference canonical stream ids only")
+        return self
+
 
 class StreamsConfig(BaseModel):
     raw: Dict[str, SourceConfig] = Field(default_factory=dict)
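A sketch of what the new validator accepts and rejects, assuming the fields visible in this diff are the only required ones (the ids and aliases below are illustrative, echoing the skeleton's contract names):

from pydantic import ValidationError

from datapipeline.config.catalog import ContractConfig

# Ingest contracts bind exactly one raw source alias.
ingest = ContractConfig(kind='ingest', id='time.ticks.linear',
                        source='synthetic.ticks')

# Composed contracts reference canonical stream ids, optionally aliased.
composed = ContractConfig(
    kind='composed',
    id='time.ticks.combined',
    inputs=['linear=time.ticks.linear', 'time.ticks.hour_sin'],
)

# Cross-mode fields and the '@stage' / 'prefix:' grammar are rejected.
for bad in (
    dict(kind='ingest', id='x'),                                       # no 'source'
    dict(kind='composed', id='x', inputs=['time.ticks.linear@raw']),   # '@stage'
    dict(kind='composed', id='x', inputs=['source:synthetic.ticks']),  # ':' ref
):
    try:
        ContractConfig(**bad)
    except ValidationError as exc:
        print(exc.errors()[0]['msg'])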
datapipeline/config/context.py
ADDED

@@ -0,0 +1,214 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Sequence
+
+from datapipeline.cli.commands.run_config import RunEntry, iter_runtime_runs
+from datapipeline.config.dataset.dataset import FeatureDatasetConfig
+from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.config.resolution import (
+    LogLevelDecision,
+    VisualSettings,
+    cascade,
+    resolve_log_level,
+    resolve_visuals,
+    workspace_output_defaults,
+)
+from datapipeline.config.workspace import WorkspaceContext
+from datapipeline.io.output import (
+    OutputTarget,
+    resolve_output_target,
+)
+from datapipeline.pipeline.context import PipelineContext
+from datapipeline.runtime import Runtime
+from datapipeline.services.bootstrap import bootstrap
+
+
+def _run_config_value(run_cfg, field: str):
+    """Return a run config field only when it was explicitly provided."""
+    if run_cfg is None:
+        return None
+    fields_set = getattr(run_cfg, "model_fields_set", None)
+    if fields_set is not None and field not in fields_set:
+        return None
+    return getattr(run_cfg, field, None)
+
+
+@dataclass(frozen=True)
+class RunProfile:
+    idx: int
+    total: int
+    entry: RunEntry
+    runtime: Runtime
+    stage: Optional[int]
+    limit: Optional[int]
+    throttle_ms: Optional[float]
+    log_decision: LogLevelDecision
+    visuals: VisualSettings
+    output: OutputTarget
+
+    @property
+    def label(self) -> str:
+        return self.entry.name or f"run{self.idx}"
+
+
+@dataclass(frozen=True)
+class BuildSettings:
+    visuals: str
+    progress: str
+    mode: str
+    force: bool
+
+
+@dataclass(frozen=True)
+class DatasetContext:
+    project: Path
+    dataset: FeatureDatasetConfig
+    runtime: Runtime
+    pipeline_context: PipelineContext
+
+    @property
+    def features(self):
+        return list(self.dataset.features or [])
+
+    @property
+    def targets(self):
+        return list(self.dataset.targets or [])
+
+
+def load_dataset_context(project: Path | str) -> DatasetContext:
+    project_path = Path(project)
+    dataset = load_dataset(project_path, "vectors")
+    runtime = bootstrap(project_path)
+    context = PipelineContext(runtime)
+    return DatasetContext(
+        project=project_path,
+        dataset=dataset,
+        runtime=runtime,
+        pipeline_context=context,
+    )
+
+
+def resolve_build_settings(
+    *,
+    workspace: WorkspaceContext | None,
+    cli_visuals: Optional[str],
+    cli_progress: Optional[str],
+    force_flag: bool,
+) -> BuildSettings:
+    shared = workspace.config.shared if workspace else None
+    build_defaults = workspace.config.build if workspace else None
+    shared_visuals = shared.visuals if shared else None
+    shared_progress = shared.progress if shared else None
+    build_mode_default = (
+        build_defaults.mode.upper() if build_defaults and build_defaults.mode else None
+    )
+    visuals = resolve_visuals(
+        cli_visuals=cli_visuals,
+        config_visuals=None,
+        workspace_visuals=shared_visuals,
+        cli_progress=cli_progress,
+        config_progress=None,
+        workspace_progress=shared_progress,
+    )
+    effective_mode = "FORCE" if force_flag else (
+        cascade(build_mode_default, "AUTO") or "AUTO")
+    effective_mode = effective_mode.upper()
+    force_build = force_flag or effective_mode == "FORCE"
+    return BuildSettings(
+        visuals=visuals.visuals,
+        progress=visuals.progress,
+        mode=effective_mode,
+        force=force_build,
+    )
+
+
+def resolve_run_profiles(
+    project_path: Path,
+    run_entries: Sequence[RunEntry],
+    *,
+    keep: Optional[str],
+    stage: Optional[int],
+    limit: Optional[int],
+    cli_output,
+    cli_payload: Optional[str],
+    workspace: WorkspaceContext | None,
+    cli_log_level: Optional[str],
+    base_log_level: str,
+    cli_visuals: Optional[str],
+    cli_progress: Optional[str],
+    create_run: bool = False,
+) -> list[RunProfile]:
+    shared = workspace.config.shared if workspace else None
+    serve_defaults = workspace.config.serve if workspace else None
+    shared_visuals_default = shared.visuals if shared else None
+    shared_progress_default = shared.progress if shared else None
+    shared_log_level_default = shared.log_level if shared else None
+    serve_log_level_default = serve_defaults.log_level if serve_defaults else None
+    serve_limit_default = serve_defaults.limit if serve_defaults else None
+    serve_stage_default = serve_defaults.stage if serve_defaults else None
+    serve_throttle_default = serve_defaults.throttle_ms if serve_defaults else None
+    workspace_output_cfg = workspace_output_defaults(workspace)
+
+    profiles: list[RunProfile] = []
+    for idx, total_runs, entry, runtime in iter_runtime_runs(
+        project_path, run_entries, keep
+    ):
+        entry_name = entry.name
+        run_cfg = getattr(runtime, "run", None)
+
+        resolved_stage = cascade(stage, _run_config_value(
+            run_cfg, "stage"), serve_stage_default)
+        resolved_limit = cascade(limit, _run_config_value(
+            run_cfg, "limit"), serve_limit_default)
+        throttle_ms = cascade(
+            _run_config_value(run_cfg, "throttle_ms"),
+            serve_throttle_default,
+        )
+        log_decision = resolve_log_level(
+            cli_log_level,
+            _run_config_value(run_cfg, "log_level"),
+            serve_log_level_default,
+            shared_log_level_default,
+            fallback=str(base_log_level).upper(),
+        )
+
+        run_visuals = _run_config_value(run_cfg, "visuals")
+        run_progress = _run_config_value(run_cfg, "progress")
+        visuals = resolve_visuals(
+            cli_visuals=cli_visuals,
+            config_visuals=run_visuals,
+            workspace_visuals=shared_visuals_default,
+            cli_progress=cli_progress,
+            config_progress=run_progress,
+            workspace_progress=shared_progress_default,
+        )
+
+        runtime_output_cfg = workspace_output_cfg.model_copy() if workspace_output_cfg else None
+        target = resolve_output_target(
+            cli_output=cli_output,
+            config_output=getattr(run_cfg, "output", None),
+            default=runtime_output_cfg,
+            base_path=project_path.parent,
+            run_name=entry_name or f"run{idx}",
+            payload_override=cli_payload,
+            stage=resolved_stage,
+            create_run=create_run,
+        )
+
+        profiles.append(
+            RunProfile(
+                idx=idx,
+                total=total_runs,
+                entry=entry,
+                runtime=runtime,
+                stage=resolved_stage,
+                limit=resolved_limit,
+                throttle_ms=throttle_ms,
+                log_decision=log_decision,
+                visuals=visuals,
+                output=target,
+            )
+        )
+    return profiles
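load_dataset_context wires the pieces defined above into a single handle for CLI commands. A usage sketch, assuming a project directory on disk laid out like the plugin skeleton's example/ workspace (the path and the feature objects' shape are assumptions, hence the hedged getattr):

from datapipeline.config.context import load_dataset_context

ctx = load_dataset_context('example')  # hypothetical project path

print('project :', ctx.project)
print('features:', [getattr(f, 'id', f) for f in ctx.features])
print('targets :', [getattr(t, 'id', t) for t in ctx.targets])

# ctx.runtime / ctx.pipeline_context feed the pipeline stages, while
# resolve_run_profiles() expands configured runs into RunProfile entries
# with stage, limit, log level, visuals, and output resolved via cascade().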