jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +5 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +54 -10
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +76 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.1.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.1.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/cli/commands/run.py
@@ -1,274 +1,260 @@
-import time
-from itertools import islice
-from pathlib import Path
-from typing import Iterator, List, Optional, Tuple, Union
-
+import json
 import logging
-
-from datapipeline.cli.visuals import visual_sources
-from datapipeline.config.dataset.dataset import FeatureDatasetConfig
-from datapipeline.config.dataset.feature import FeatureRecordConfig
+from pathlib import Path
+from typing import Optional
+
+from datapipeline.cli.commands.build import run_build_if_needed
+from datapipeline.cli.commands.run_config import (
+    RunEntry,
+    resolve_run_entries,
+)
+from datapipeline.cli.commands.serve_pipeline import serve_with_runtime
+from datapipeline.cli.visuals.runner import run_job
+from datapipeline.cli.visuals.sections import sections_from_path
+from datapipeline.config.context import resolve_run_profiles
 from datapipeline.config.dataset.loader import load_dataset
-from datapipeline.config.run import RunConfig, load_named_run_configs
-from datapipeline.domain.vector import Vector
-from datapipeline.pipeline.context import PipelineContext
-from datapipeline.pipeline.pipelines import build_vector_pipeline
-from datapipeline.pipeline.stages import post_process, split_stage
-from datapipeline.runtime import Runtime
-from datapipeline.services.bootstrap import bootstrap
-from datapipeline.cli.commands.writers import writer_factory, Writer
-from tqdm.contrib.logging import logging_redirect_tqdm
+from datapipeline.config.tasks import ServeOutputConfig
+from datapipeline.io.output import OutputResolutionError
+from datapipeline.pipeline.artifacts import StageDemand, required_artifacts_for

 logger = logging.getLogger(__name__)


-def _coerce_log_level(
-    value: Optional[Union[str, int]],
-    *,
-    default: int = logging.WARNING,
-) -> int:
-    if value is None:
-        return default
-    if isinstance(value, int):
-        return value
-    name = str(value).upper()
-    if name not in logging._nameToLevel:
-        raise ValueError(f"Unsupported log level: {value}")
-    return logging._nameToLevel[name]
-
-
-def _resolve_run_entries(project_path: Path, run_name: Optional[str]) -> List[Tuple[Optional[str], Optional[RunConfig]]]:
-    try:
-        entries = load_named_run_configs(project_path)
-    except FileNotFoundError:
-        entries = []
-    except Exception as exc:
-        logger.error("Failed to load run configs: %s", exc)
-        raise SystemExit(2) from exc
-
-    if entries:
-        if run_name:
-            entries = [entry for entry in entries if entry[0] == run_name]
-            if not entries:
-                logger.error("Unknown run config '%s'", run_name)
-                raise SystemExit(2)
-    else:
-        if run_name:
-            logger.error("Project does not define run configs.")
-            raise SystemExit(2)
-        entries = [(None, None)]
-    return entries
-
-
-def _iter_runtime_runs(
-    project_path: Path,
-    run_name: Optional[str],
-    keep_override: Optional[str],
-) -> Iterator[Tuple[int, int, Optional[str], Runtime]]:
-    run_entries = _resolve_run_entries(project_path, run_name)
-    total_runs = len(run_entries)
-    for idx, (entry_name, run_cfg) in enumerate(run_entries, start=1):
-        runtime = bootstrap(project_path)
-        if run_cfg is not None:
-            runtime.run = run_cfg
-            split_keep = getattr(runtime.split, "keep", None)
-            runtime.split_keep = run_cfg.keep or split_keep
-        if keep_override:
-            runtime.split_keep = keep_override
-        yield idx, total_runs, entry_name, runtime
-
-
-def _limit_items(items: Iterator[Tuple[object, object]], limit: Optional[int]) -> Iterator[Tuple[object, object]]:
-    if limit is None:
-        yield from items
-    else:
-        yield from islice(items, limit)
-
-
-def _throttle_vectors(vectors: Iterator[Tuple[object, Vector]], throttle_ms: Optional[float]) -> Iterator[Tuple[object, Vector]]:
-    if not throttle_ms or throttle_ms <= 0:
-        yield from vectors
-        return
-    delay = throttle_ms / 1000.0
-    for item in vectors:
-        yield item
-        time.sleep(delay)
-
-
-def _normalize(key: object, payload: object) -> dict:
-    return {
-        "key": list(key) if isinstance(key, tuple) else key,
-        "values": getattr(payload, "values", payload),
+def _profile_debug_payload(profile) -> dict[str, object]:
+    entry = profile.entry
+    payload: dict[str, object] = {
+        "label": profile.label,
+        "idx": profile.idx,
+        "total": profile.total,
+        "entry": {
+            "name": entry.name,
+            "path": str(entry.path) if entry.path else None,
+        },
+        "stage": profile.stage,
+        "limit": profile.limit,
+        "throttle_ms": profile.throttle_ms,
+        "log_level": {
+            "name": profile.log_decision.name,
+            "value": profile.log_decision.value,
+        },
+        "visuals": {
+            "provider": profile.visuals.visuals,
+            "progress": profile.visuals.progress,
+        },
+        "output": {
+            "transport": profile.output.transport,
+            "format": profile.output.format,
+            "payload": profile.output.payload,
+            "destination": str(profile.output.destination)
+            if profile.output.destination
+            else None,
+        },
     }
+    cfg = entry.config
+    if cfg is not None:
+        payload["run_config"] = cfg.model_dump(
+            exclude_unset=True, exclude_none=True)
+    return payload


-def _serve(
-    items: Iterator[Tuple[object, object]],
-    limit: Optional[int],
-    *,
-    writer: Writer,
-) -> int:
-    """Iterate, normalize, write, return count. Writers do only I/O."""
-    count = 0
-    try:
-        for key, payload in _limit_items(items, limit):
-            writer.write(_normalize(key, payload))
-            count += 1
-    except KeyboardInterrupt:
-        pass
-    finally:
-        writer.close()
-    return count
-
-
-def _report_end(output: Optional[str], count: int) -> None:
-    mode = (output or "print").lower()
-    if output and output.lower().endswith(".pt"):
-        logger.info("Saved %d items to %s", count, output)
-    elif output and output.lower().endswith(".csv"):
-        logger.info("Saved %d items to %s", count, output)
-    elif output and (output.lower().endswith(".jsonl.gz") or output.lower().endswith(".gz")):
-        logger.info("Saved %d items to %s", count, output)
-    elif mode == "stream":
-        logger.info("(streamed %d items)", count)
-    elif mode == "print":
-        logger.info("(printed %d items to stdout)", count)
-    else:
-        raise ValueError("unreachable: unknown output mode in _report_end")
-
+def _log_profile_start_debug(profile) -> None:
+    if not logger.isEnabledFor(logging.DEBUG):
+        return
+    payload = _profile_debug_payload(profile)
+    logger.debug(
+        "Run profile start (%s/%s):\n%s",
+        profile.idx,
+        profile.total,
+        json.dumps(payload, indent=2, default=str),
+    )

-def _serve_with_runtime(
-    runtime,
-    dataset: FeatureDatasetConfig,
-    limit: Optional[int],
-    output: Optional[str],
-    include_targets: bool,
-    throttle_ms: Optional[float],
-    stage: Optional[int] = None,
-) -> None:
-    context = PipelineContext(runtime)

-    features = list(dataset.features or [])
-    if include_targets:
-        features += list(dataset.targets or [])
+def _entry_sections(run_root: Optional[Path], entry: RunEntry) -> tuple[str, ...]:
+    # Prefix sections with a phase label for visuals; keep path-based detail.
+    path_sections = sections_from_path(run_root, entry.path)
+    return ("Run Tasks",) + tuple(path_sections[1:])

-    if not features:
-        logger.warning("(no features configured; nothing to serve)")
-        return

-    if stage is not None and stage <= 5:
-        for cfg in features:
-            stream = build_vector_pipeline(
-                context,
-                [cfg],
-                dataset.group_by,
-                stage=stage,
-            )
-            items = ((cfg.id, item) for item in stream)
-            writer = writer_factory(output)
-            count = _serve(items, limit, writer=writer)
-            _report_end(output, count)
-        return
+def _build_cli_output_config(
+    transport: Optional[str],
+    fmt: Optional[str],
+    path: Optional[str],
+    payload: Optional[str],
+) -> tuple[ServeOutputConfig | None, Optional[str]]:
+    payload_style = None
+    if payload is not None:
+        payload_style = payload.lower()
+        if payload_style not in {"sample", "vector"}:
+            logger.error("--out-payload must be 'sample' or 'vector'")
+            raise SystemExit(2)

-    vector_stage = 6 if stage in (6, 7) else None
-    vectors = build_vector_pipeline(
-        context,
-        features,
-        dataset.group_by,
-        stage=vector_stage,
+    if transport is None and fmt is None and path is None:
+        return None, payload_style
+
+    if not transport or not fmt:
+        logger.error(
+            "--out-transport and --out-format must be provided together")
+        raise SystemExit(2)
+    transport = transport.lower()
+    fmt = fmt.lower()
+    if transport == "fs":
+        if not path:
+            logger.error(
+                "--out-path is required when --out-transport=fs (directory)")
+            raise SystemExit(2)
+        return (
+            ServeOutputConfig(
+                transport="fs",
+                format=fmt,
+                directory=Path(path),
+                payload=payload_style or "sample",
+            ),
+            None,
+        )
+    if path:
+        logger.error("--out-path is only valid when --out-transport=fs")
+        raise SystemExit(2)
+    return (
+        ServeOutputConfig(
+            transport="stdout",
+            format=fmt,
+            payload=payload_style or "sample",
+        ),
+        None,
     )

-    if stage in (None, 7):
-        vectors = post_process(context, vectors)
-    if stage is None:
-        vectors = split_stage(runtime, vectors)
-    vectors = _throttle_vectors(vectors, throttle_ms)
-
-    writer = writer_factory(output)
-    result_count = _serve(vectors, limit, writer=writer)
-    _report_end(output, result_count)

-
-def _execute_runs(
+def ensure_stage_artifacts(
     project_path: Path,
-    stage: Optional[int],
-    limit: Optional[int],
-    output: Optional[str],
-    include_targets: Optional[bool],
-    keep: Optional[str],
-    run_name: Optional[str],
+    dataset,
+    profiles,
     *,
-    cli_log_level: Optional[str],
-    base_log_level: str,
+    cli_visuals: Optional[str],
+    cli_progress: Optional[str],
+    workspace,
 ) -> None:
-    # Helper for precedence: CLI > config > default
-    def pick(cli_val, cfg_val, default=None):
-        return cli_val if cli_val is not None else (cfg_val if cfg_val is not None else default)
-
-    dataset_name = "vectors" if stage is None else "features"
-    dataset = load_dataset(project_path, dataset_name)
-
-    base_level_name = str(base_log_level).upper()
-    base_level_value = _coerce_log_level(base_level_name)
-
-    for idx, total_runs, entry_name, runtime in _iter_runtime_runs(project_path, run_name, keep):
-        run = getattr(runtime, "run", None)
-
-        # resolving argument hierarchy CLI args > run config > defaults
-        resolved_limit = pick(limit, getattr(run, "limit", None), None)
-        resolved_output = pick(output, getattr(run, "output", None), "print")
-        resolved_include_targets = pick(
-            include_targets, getattr(run, "include_targets", None), False)
-        throttle_ms = getattr(run, "throttle_ms", None)
-        resolved_level_name = pick(
-            cli_log_level.upper() if cli_log_level else None,
-            getattr(run, "log_level", None),
-            base_level_name,
-        )
-        resolved_level_value = _coerce_log_level(
-            resolved_level_name, default=base_level_value)
-
-        root_logger = logging.getLogger()
-        if root_logger.level != resolved_level_value:
-            root_logger.setLevel(resolved_level_value)
-
-        label = entry_name or f"run{idx}"
-        logger.info("Run '%s' (%d/%d)", label, idx, total_runs)
-
-        with visual_sources(runtime, resolved_level_value):
-            with logging_redirect_tqdm():
-                _serve_with_runtime(
-                    runtime,
-                    dataset,
-                    limit=resolved_limit,
-                    output=resolved_output,
-                    include_targets=resolved_include_targets,
-                    throttle_ms=throttle_ms,
-                    stage=stage,
-                )
+    demands = [StageDemand(profile.stage) for profile in profiles]
+    required = required_artifacts_for(dataset, demands)
+    if not required:
+        return
+    run_build_if_needed(
+        project_path,
+        cli_visuals=cli_visuals,
+        cli_progress=cli_progress,
+        workspace=workspace,
+        required_artifacts=required,
+    )


 def handle_serve(
     project: str,
     limit: Optional[int],
-    output: Optional[str],
-    include_targets: Optional[bool] = None,
     keep: Optional[str] = None,
     run_name: Optional[str] = None,
     stage: Optional[int] = None,
+    out_transport: Optional[str] = None,
+    out_format: Optional[str] = None,
+    out_payload: Optional[str] = None,
+    out_path: Optional[str] = None,
+    skip_build: bool = False,
     *,
     cli_log_level: Optional[str],
     base_log_level: str,
+    cli_visuals: Optional[str] = None,
+    cli_progress: Optional[str] = None,
+    workspace=None,
 ) -> None:
     project_path = Path(project)
-    _execute_runs(
-        project_path=project_path,
-        stage=stage,
-        limit=limit,
-        output=output,
-        include_targets=include_targets,
-        keep=keep,
-        run_name=run_name,
-        cli_log_level=cli_log_level,
-        base_log_level=base_log_level,
-    )
+    run_entries, run_root = resolve_run_entries(project_path, run_name)
+
+    cli_output_cfg, payload_override = _build_cli_output_config(
+        out_transport, out_format, out_path, out_payload)
+    try:
+        profiles = resolve_run_profiles(
+            project_path=project_path,
+            run_entries=run_entries,
+            keep=keep,
+            stage=stage,
+            limit=limit,
+            cli_output=cli_output_cfg,
+            cli_payload=payload_override or (
+                out_payload.lower() if out_payload else None),
+            workspace=workspace,
+            cli_log_level=cli_log_level,
+            base_log_level=base_log_level,
+            cli_visuals=cli_visuals,
+            cli_progress=cli_progress,
+            create_run=False,
+        )
+    except OutputResolutionError as exc:
+        logger.error("Invalid output configuration: %s", exc)
+        raise SystemExit(2) from exc
+
+    vector_dataset = load_dataset(project_path, "vectors")
+    skip_reason = None
+    if skip_build:
+        skip_reason = "--skip-build flag provided"
+
+    if not skip_reason:
+        ensure_stage_artifacts(
+            project_path,
+            vector_dataset,
+            profiles,
+            cli_visuals=cli_visuals,
+            cli_progress=cli_progress,
+            workspace=workspace,
+        )
+        profiles = resolve_run_profiles(
+            project_path=project_path,
+            run_entries=run_entries,
+            keep=keep,
+            stage=stage,
+            limit=limit,
+            cli_output=cli_output_cfg,
+            cli_payload=payload_override or (
+                out_payload.lower() if out_payload else None),
+            workspace=workspace,
+            cli_log_level=cli_log_level,
+            base_log_level=base_log_level,
+            cli_visuals=cli_visuals,
+            cli_progress=cli_progress,
+            create_run=True,
+        )
+
+    datasets: dict[str, object] = {}
+    datasets["vectors"] = vector_dataset
+    for profile in profiles:
+        dataset_name = "vectors" if profile.stage is None else "features"
+        dataset = datasets.get(dataset_name)
+        if dataset is None:
+            dataset = load_dataset(project_path, dataset_name)
+            datasets[dataset_name] = dataset
+
+        root_logger = logging.getLogger()
+        if root_logger.level != profile.log_decision.value:
+            root_logger.setLevel(profile.log_decision.value)
+
+        def _work(profile=profile):
+            _log_profile_start_debug(profile)
+            serve_with_runtime(
+                profile.runtime,
+                dataset,
+                limit=profile.limit,
+                target=profile.output,
+                throttle_ms=profile.throttle_ms,
+                stage=profile.stage,
+                visuals=profile.visuals.visuals,
+            )
+
+        sections = _entry_sections(run_root, profile.entry)
+        run_job(
+            sections=sections,
+            label=profile.label,
+            visuals=profile.visuals.visuals or "auto",
+            progress_style=profile.visuals.progress or "auto",
+            level=profile.log_decision.value,
+            runtime=profile.runtime,
+            work=_work,
+            idx=profile.idx,
+            total=profile.total,
+        )
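
As a reading aid, here is a minimal hypothetical sketch (not part of the package) of the precedence the new _build_cli_output_config helper above applies to the serve CLI flags: with no output flags there is no override, --out-transport=fs additionally requires a directory via --out-path, and stdout needs only a format. The result comments assume ServeOutputConfig is constructed exactly as in the hunk.

    # Hypothetical calls against the helper shown above.
    cfg, payload = _build_cli_output_config(None, None, None, "vector")
    # cfg is None (no CLI override); "vector" is still forwarded as the payload style.

    cfg, _ = _build_cli_output_config("fs", "jsonl", "out/exports", None)
    # ServeOutputConfig(transport="fs", format="jsonl", directory=Path("out/exports"), payload="sample")

    cfg, _ = _build_cli_output_config("stdout", "jsonl", None, None)
    # ServeOutputConfig(transport="stdout", format="jsonl", payload="sample")
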
datapipeline/cli/commands/run_config.py (new file)
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Iterator, List, NamedTuple, Optional, Sequence
+
+from datapipeline.config.tasks import ServeTask, serve_tasks
+from datapipeline.runtime import Runtime
+from datapipeline.services.bootstrap import bootstrap
+
+logger = logging.getLogger(__name__)
+
+
+class RunEntry(NamedTuple):
+    name: Optional[str]
+    config: Optional[ServeTask]
+    path: Optional[Path]
+
+
+def resolve_run_entries(project_path: Path, run_name: Optional[str]) -> tuple[List[RunEntry], Optional[Path]]:
+    try:
+        raw_entries = serve_tasks(project_path)
+    except FileNotFoundError:
+        raw_entries = []
+    except Exception as exc:
+        logger.error("Failed to load serve tasks: %s", exc)
+        raise SystemExit(2) from exc
+
+    entries: List[RunEntry] = []
+    root_path: Optional[Path] = None
+
+    if raw_entries:
+        if not run_name:
+            raw_entries = [task for task in raw_entries if task.enabled]
+        if run_name:
+            raw_entries = [
+                task
+                for task in raw_entries
+                if task.effective_name() == run_name
+            ]
+            if not raw_entries:
+                logger.error("Unknown run task '%s'", run_name)
+                raise SystemExit(2)
+        for task in raw_entries:
+            path = getattr(task, "source_path", None)
+            if root_path is None and path is not None:
+                root_path = path.parent
+            entries.append(
+                RunEntry(
+                    name=task.effective_name(),
+                    config=task,
+                    path=path,
+                )
+            )
+    else:
+        if run_name:
+            logger.error("Project does not define serve tasks.")
+            raise SystemExit(2)
+        entries = [RunEntry(name=None, config=None, path=None)]
+    return entries, root_path
+
+
+def iter_runtime_runs(
+    project_path: Path,
+    run_entries: Sequence[RunEntry],
+    keep_override: Optional[str],
+) -> Iterator[tuple[int, int, RunEntry, Runtime]]:
+    total_runs = len(run_entries)
+    for idx, entry in enumerate(run_entries, start=1):
+        run_cfg = entry.config
+        runtime = bootstrap(project_path)
+        if run_cfg is not None:
+            runtime.run = run_cfg
+            split_keep = getattr(runtime.split, "keep", None)
+            runtime.split_keep = run_cfg.keep or split_keep
+        if keep_override:
+            runtime.split_keep = keep_override
+        yield idx, total_runs, entry, runtime
+
+
+def determine_preview_stage(
+    cli_stage: Optional[int],
+    run_entries: Sequence[RunEntry],
+) -> tuple[Optional[int], Optional[str]]:
+    if cli_stage is not None:
+        return cli_stage, "CLI flag"
+
+    stages: List[int] = []
+    for entry in run_entries:
+        run_cfg = entry.config
+        cfg_stage = getattr(run_cfg, "stage", None) if run_cfg else None
+        if cfg_stage is None:
+            return None, None
+        stages.append(cfg_stage)
+
+    if not stages or any(stage > 5 for stage in stages):
+        return None, None
+
+    if len(set(stages)) == 1:
+        return stages[0], "run config"
+    return min(stages), "run configs"
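
Similarly, a small hedged sketch of how determine_preview_stage in the new run_config.py resolves the preview stage: an explicit CLI stage always wins, and otherwise a stage is inferred only when every selected serve task pins one at or below 5.

    from types import SimpleNamespace

    from datapipeline.cli.commands.run_config import RunEntry, determine_preview_stage

    # SimpleNamespace is a hypothetical stand-in for a ServeTask; only its
    # "stage" attribute is read by determine_preview_stage.
    train = RunEntry(name="train", config=SimpleNamespace(stage=3), path=None)
    val = RunEntry(name="val", config=SimpleNamespace(stage=5), path=None)

    determine_preview_stage(7, [train, val])       # -> (7, "CLI flag")
    determine_preview_stage(None, [train, val])    # -> (3, "run configs")
    determine_preview_stage(None, [RunEntry(None, None, None)])  # -> (None, None)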