jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +5 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +54 -10
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +76 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.1.dist-info/METADATA +825 -0
- jerry_thomas-1.0.1.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/cli/commands/run.py
CHANGED
@@ -1,274 +1,260 @@
-import time
-from itertools import islice
-from pathlib import Path
-from typing import Iterator, List, Optional, Tuple, Union
-
+import json
 import logging
-
-from …
-
-from datapipeline.…
+from pathlib import Path
+from typing import Optional
+
+from datapipeline.cli.commands.build import run_build_if_needed
+from datapipeline.cli.commands.run_config import (
+    RunEntry,
+    resolve_run_entries,
+)
+from datapipeline.cli.commands.serve_pipeline import serve_with_runtime
+from datapipeline.cli.visuals.runner import run_job
+from datapipeline.cli.visuals.sections import sections_from_path
+from datapipeline.config.context import resolve_run_profiles
 from datapipeline.config.dataset.loader import load_dataset
-from datapipeline.config.…
-from datapipeline.…
-from datapipeline.pipeline.…
-from datapipeline.pipeline.pipelines import build_vector_pipeline
-from datapipeline.pipeline.stages import post_process, split_stage
-from datapipeline.runtime import Runtime
-from datapipeline.services.bootstrap import bootstrap
-from datapipeline.cli.commands.writers import writer_factory, Writer
-from tqdm.contrib.logging import logging_redirect_tqdm
+from datapipeline.config.tasks import ServeOutputConfig
+from datapipeline.io.output import OutputResolutionError
+from datapipeline.pipeline.artifacts import StageDemand, required_artifacts_for
 
 logger = logging.getLogger(__name__)
 
 
-def _resolve_run_entries(…):
-    …
-        raise SystemExit(2)
-    else:
-        if run_name:
-            logger.error("Project does not define run configs.")
-            raise SystemExit(2)
-        entries = [(None, None)]
-    return entries
-
-
-def _iter_runtime_runs(
-    project_path: Path,
-    run_name: Optional[str],
-    keep_override: Optional[str],
-) -> Iterator[Tuple[int, int, Optional[str], Runtime]]:
-    run_entries = _resolve_run_entries(project_path, run_name)
-    total_runs = len(run_entries)
-    for idx, (entry_name, run_cfg) in enumerate(run_entries, start=1):
-        runtime = bootstrap(project_path)
-        if run_cfg is not None:
-            runtime.run = run_cfg
-            split_keep = getattr(runtime.split, "keep", None)
-            runtime.split_keep = run_cfg.keep or split_keep
-        if keep_override:
-            runtime.split_keep = keep_override
-        yield idx, total_runs, entry_name, runtime
-
-
-def _limit_items(items: Iterator[Tuple[object, object]], limit: Optional[int]) -> Iterator[Tuple[object, object]]:
-    if limit is None:
-        yield from items
-    else:
-        yield from islice(items, limit)
-
-
-def _throttle_vectors(vectors: Iterator[Tuple[object, Vector]], throttle_ms: Optional[float]) -> Iterator[Tuple[object, Vector]]:
-    if not throttle_ms or throttle_ms <= 0:
-        yield from vectors
-        return
-    delay = throttle_ms / 1000.0
-    for item in vectors:
-        yield item
-        time.sleep(delay)
-
-
-def _normalize(key: object, payload: object) -> dict:
-    return {
-        "key": list(key) if isinstance(key, tuple) else key,
-        "values": getattr(payload, "values", payload),
+def _profile_debug_payload(profile) -> dict[str, object]:
+    entry = profile.entry
+    payload: dict[str, object] = {
+        "label": profile.label,
+        "idx": profile.idx,
+        "total": profile.total,
+        "entry": {
+            "name": entry.name,
+            "path": str(entry.path) if entry.path else None,
+        },
+        "stage": profile.stage,
+        "limit": profile.limit,
+        "throttle_ms": profile.throttle_ms,
+        "log_level": {
+            "name": profile.log_decision.name,
+            "value": profile.log_decision.value,
+        },
+        "visuals": {
+            "provider": profile.visuals.visuals,
+            "progress": profile.visuals.progress,
+        },
+        "output": {
+            "transport": profile.output.transport,
+            "format": profile.output.format,
+            "payload": profile.output.payload,
+            "destination": str(profile.output.destination)
+            if profile.output.destination
+            else None,
+        },
     }
+    cfg = entry.config
+    if cfg is not None:
+        payload["run_config"] = cfg.model_dump(
+            exclude_unset=True, exclude_none=True)
+    return payload
 
 
-def _serve(…):
-    …
-            writer.write(_normalize(key, payload))
-            count += 1
-    except KeyboardInterrupt:
-        pass
-    finally:
-        writer.close()
-    return count
-
-
-def _report_end(output: Optional[str], count: int) -> None:
-    mode = (output or "print").lower()
-    if output and output.lower().endswith(".pt"):
-        logger.info("Saved %d items to %s", count, output)
-    elif output and output.lower().endswith(".csv"):
-        logger.info("Saved %d items to %s", count, output)
-    elif output and (output.lower().endswith(".jsonl.gz") or output.lower().endswith(".gz")):
-        logger.info("Saved %d items to %s", count, output)
-    elif mode == "stream":
-        logger.info("(streamed %d items)", count)
-    elif mode == "print":
-        logger.info("(printed %d items to stdout)", count)
-    else:
-        raise ValueError("unreachable: unknown output mode in _report_end")
-
+def _log_profile_start_debug(profile) -> None:
+    if not logger.isEnabledFor(logging.DEBUG):
+        return
+    payload = _profile_debug_payload(profile)
+    logger.debug(
+        "Run profile start (%s/%s):\n%s",
+        profile.idx,
+        profile.total,
+        json.dumps(payload, indent=2, default=str),
+    )
 
-def _serve_with_runtime(
-    runtime,
-    dataset: FeatureDatasetConfig,
-    limit: Optional[int],
-    output: Optional[str],
-    include_targets: bool,
-    throttle_ms: Optional[float],
-    stage: Optional[int] = None,
-) -> None:
-    context = PipelineContext(runtime)
 
-    …
+def _entry_sections(run_root: Optional[Path], entry: RunEntry) -> tuple[str, ...]:
+    # Prefix sections with a phase label for visuals; keep path-based detail.
+    path_sections = sections_from_path(run_root, entry.path)
+    return ("Run Tasks",) + tuple(path_sections[1:])
 
-    if not features:
-        logger.warning("(no features configured; nothing to serve)")
-        return
 
-    …
-        return
+def _build_cli_output_config(
+    transport: Optional[str],
+    fmt: Optional[str],
+    path: Optional[str],
+    payload: Optional[str],
+) -> tuple[ServeOutputConfig | None, Optional[str]]:
+    payload_style = None
+    if payload is not None:
+        payload_style = payload.lower()
+        if payload_style not in {"sample", "vector"}:
+            logger.error("--out-payload must be 'sample' or 'vector'")
+            raise SystemExit(2)
 
-    …
+    if transport is None and fmt is None and path is None:
+        return None, payload_style
+
+    if not transport or not fmt:
+        logger.error(
+            "--out-transport and --out-format must be provided together")
+        raise SystemExit(2)
+    transport = transport.lower()
+    fmt = fmt.lower()
+    if transport == "fs":
+        if not path:
+            logger.error(
+                "--out-path is required when --out-transport=fs (directory)")
+            raise SystemExit(2)
+        return (
+            ServeOutputConfig(
+                transport="fs",
+                format=fmt,
+                directory=Path(path),
+                payload=payload_style or "sample",
+            ),
+            None,
+        )
+    if path:
+        logger.error("--out-path is only valid when --out-transport=fs")
+        raise SystemExit(2)
+    return (
+        ServeOutputConfig(
+            transport="stdout",
+            format=fmt,
+            payload=payload_style or "sample",
+        ),
+        None,
     )
 
-    if stage in (None, 7):
-        vectors = post_process(context, vectors)
-        if stage is None:
-            vectors = split_stage(runtime, vectors)
-    vectors = _throttle_vectors(vectors, throttle_ms)
-
-    writer = writer_factory(output)
-    result_count = _serve(vectors, limit, writer=writer)
-    _report_end(output, result_count)
 
-
-def _execute_runs(
+def ensure_stage_artifacts(
     project_path: Path,
-    …
-    output: Optional[str],
-    include_targets: Optional[bool],
-    keep: Optional[str],
-    run_name: Optional[str],
+    dataset,
+    profiles,
     *,
-    …
+    cli_visuals: Optional[str],
+    cli_progress: Optional[str],
+    workspace,
 ) -> None:
-    …
-    run = getattr(runtime, "run", None)
-
-    # resolving argument hierarchy CLI args > run config > defaults
-    resolved_limit = pick(limit, getattr(run, "limit", None), None)
-    resolved_output = pick(output, getattr(run, "output", None), "print")
-    resolved_include_targets = pick(
-        include_targets, getattr(run, "include_targets", None), False)
-    throttle_ms = getattr(run, "throttle_ms", None)
-    resolved_level_name = pick(
-        cli_log_level.upper() if cli_log_level else None,
-        getattr(run, "log_level", None),
-        base_level_name,
-    )
-    resolved_level_value = _coerce_log_level(
-        resolved_level_name, default=base_level_value)
-
-    root_logger = logging.getLogger()
-    if root_logger.level != resolved_level_value:
-        root_logger.setLevel(resolved_level_value)
-
-    label = entry_name or f"run{idx}"
-    logger.info("Run '%s' (%d/%d)", label, idx, total_runs)
-
-    with visual_sources(runtime, resolved_level_value):
-        with logging_redirect_tqdm():
-            _serve_with_runtime(
-                runtime,
-                dataset,
-                limit=resolved_limit,
-                output=resolved_output,
-                include_targets=resolved_include_targets,
-                throttle_ms=throttle_ms,
-                stage=stage,
-            )
+    demands = [StageDemand(profile.stage) for profile in profiles]
+    required = required_artifacts_for(dataset, demands)
+    if not required:
+        return
+    run_build_if_needed(
+        project_path,
+        cli_visuals=cli_visuals,
+        cli_progress=cli_progress,
+        workspace=workspace,
+        required_artifacts=required,
+    )
 
 
 def handle_serve(
     project: str,
     limit: Optional[int],
-    output: Optional[str],
-    include_targets: Optional[bool] = None,
     keep: Optional[str] = None,
     run_name: Optional[str] = None,
     stage: Optional[int] = None,
+    out_transport: Optional[str] = None,
+    out_format: Optional[str] = None,
+    out_payload: Optional[str] = None,
+    out_path: Optional[str] = None,
+    skip_build: bool = False,
     *,
     cli_log_level: Optional[str],
     base_log_level: str,
+    cli_visuals: Optional[str] = None,
+    cli_progress: Optional[str] = None,
+    workspace=None,
 ) -> None:
     project_path = Path(project)
-    …
+    run_entries, run_root = resolve_run_entries(project_path, run_name)
+
+    cli_output_cfg, payload_override = _build_cli_output_config(
+        out_transport, out_format, out_path, out_payload)
+    try:
+        profiles = resolve_run_profiles(
+            project_path=project_path,
+            run_entries=run_entries,
+            keep=keep,
+            stage=stage,
+            limit=limit,
+            cli_output=cli_output_cfg,
+            cli_payload=payload_override or (
+                out_payload.lower() if out_payload else None),
+            workspace=workspace,
+            cli_log_level=cli_log_level,
+            base_log_level=base_log_level,
+            cli_visuals=cli_visuals,
+            cli_progress=cli_progress,
+            create_run=False,
+        )
+    except OutputResolutionError as exc:
+        logger.error("Invalid output configuration: %s", exc)
+        raise SystemExit(2) from exc
+
+    vector_dataset = load_dataset(project_path, "vectors")
+    skip_reason = None
+    if skip_build:
+        skip_reason = "--skip-build flag provided"
+
+    if not skip_reason:
+        ensure_stage_artifacts(
+            project_path,
+            vector_dataset,
+            profiles,
+            cli_visuals=cli_visuals,
+            cli_progress=cli_progress,
+            workspace=workspace,
+        )
+    profiles = resolve_run_profiles(
+        project_path=project_path,
+        run_entries=run_entries,
+        keep=keep,
+        stage=stage,
+        limit=limit,
+        cli_output=cli_output_cfg,
+        cli_payload=payload_override or (
+            out_payload.lower() if out_payload else None),
+        workspace=workspace,
+        cli_log_level=cli_log_level,
+        base_log_level=base_log_level,
+        cli_visuals=cli_visuals,
+        cli_progress=cli_progress,
+        create_run=True,
+    )
+
+    datasets: dict[str, object] = {}
+    datasets["vectors"] = vector_dataset
+    for profile in profiles:
+        dataset_name = "vectors" if profile.stage is None else "features"
+        dataset = datasets.get(dataset_name)
+        if dataset is None:
+            dataset = load_dataset(project_path, dataset_name)
+            datasets[dataset_name] = dataset
+
+        root_logger = logging.getLogger()
+        if root_logger.level != profile.log_decision.value:
+            root_logger.setLevel(profile.log_decision.value)
+
+        def _work(profile=profile):
+            _log_profile_start_debug(profile)
+            serve_with_runtime(
+                profile.runtime,
+                dataset,
+                limit=profile.limit,
+                target=profile.output,
+                throttle_ms=profile.throttle_ms,
+                stage=profile.stage,
+                visuals=profile.visuals.visuals,
+            )
+
+        sections = _entry_sections(run_root, profile.entry)
+        run_job(
+            sections=sections,
+            label=profile.label,
+            visuals=profile.visuals.visuals or "auto",
+            progress_style=profile.visuals.progress or "auto",
+            level=profile.log_decision.value,
+            runtime=profile.runtime,
+            work=_work,
+            idx=profile.idx,
+            total=profile.total,
+        )
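Note on the new output flags: where 0.3.0's `_report_end` inferred the writer from the output string's file suffix, 1.0.1 resolves `--out-transport`, `--out-format`, `--out-path`, and `--out-payload` into a structured `ServeOutputConfig` before any run starts. A minimal sketch of those resolution rules, with a stand-in dataclass in place of the real `ServeOutputConfig` model (field defaults beyond what the diff shows are assumptions):

from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class OutConfig:
    """Stand-in for datapipeline.config.tasks.ServeOutputConfig."""
    transport: str                     # "fs" or "stdout"
    format: str                        # e.g. "jsonl" or "csv"
    payload: str = "sample"            # "sample" or "vector"
    directory: Optional[Path] = None   # only meaningful for transport="fs"


def resolve_out_flags(
    transport: Optional[str],
    fmt: Optional[str],
    path: Optional[str],
    payload: Optional[str],
) -> Optional[OutConfig]:
    # Same rules as _build_cli_output_config above, minus the logging.
    if payload is not None and payload.lower() not in {"sample", "vector"}:
        raise SystemExit(2)  # --out-payload must be 'sample' or 'vector'
    if transport is None and fmt is None and path is None:
        return None  # no flags: defer to the serve task's own output config
    if not transport or not fmt:
        raise SystemExit(2)  # transport and format must be given together
    if transport.lower() == "fs":
        if not path:
            raise SystemExit(2)  # fs transport needs --out-path (a directory)
        return OutConfig("fs", fmt.lower(), (payload or "sample").lower(), Path(path))
    if path:
        raise SystemExit(2)  # --out-path is only valid for the fs transport
    return OutConfig("stdout", fmt.lower(), (payload or "sample").lower())


# e.g. --out-transport fs --out-format jsonl --out-path ./exports
print(resolve_out_flags("fs", "jsonl", "./exports", None))

Invalid flag combinations now exit with status 2 before a run starts, instead of surfacing mid-run.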
datapipeline/cli/commands/run_config.py
ADDED
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Iterator, List, NamedTuple, Optional, Sequence
+
+from datapipeline.config.tasks import ServeTask, serve_tasks
+from datapipeline.runtime import Runtime
+from datapipeline.services.bootstrap import bootstrap
+
+logger = logging.getLogger(__name__)
+
+
+class RunEntry(NamedTuple):
+    name: Optional[str]
+    config: Optional[ServeTask]
+    path: Optional[Path]
+
+
+def resolve_run_entries(project_path: Path, run_name: Optional[str]) -> tuple[List[RunEntry], Optional[Path]]:
+    try:
+        raw_entries = serve_tasks(project_path)
+    except FileNotFoundError:
+        raw_entries = []
+    except Exception as exc:
+        logger.error("Failed to load serve tasks: %s", exc)
+        raise SystemExit(2) from exc
+
+    entries: List[RunEntry] = []
+    root_path: Optional[Path] = None
+
+    if raw_entries:
+        if not run_name:
+            raw_entries = [task for task in raw_entries if task.enabled]
+        if run_name:
+            raw_entries = [
+                task
+                for task in raw_entries
+                if task.effective_name() == run_name
+            ]
+            if not raw_entries:
+                logger.error("Unknown run task '%s'", run_name)
+                raise SystemExit(2)
+        for task in raw_entries:
+            path = getattr(task, "source_path", None)
+            if root_path is None and path is not None:
+                root_path = path.parent
+            entries.append(
+                RunEntry(
+                    name=task.effective_name(),
+                    config=task,
+                    path=path,
+                )
+            )
+    else:
+        if run_name:
+            logger.error("Project does not define serve tasks.")
+            raise SystemExit(2)
+        entries = [RunEntry(name=None, config=None, path=None)]
+    return entries, root_path
+
+
+def iter_runtime_runs(
+    project_path: Path,
+    run_entries: Sequence[RunEntry],
+    keep_override: Optional[str],
+) -> Iterator[tuple[int, int, RunEntry, Runtime]]:
+    total_runs = len(run_entries)
+    for idx, entry in enumerate(run_entries, start=1):
+        run_cfg = entry.config
+        runtime = bootstrap(project_path)
+        if run_cfg is not None:
+            runtime.run = run_cfg
+            split_keep = getattr(runtime.split, "keep", None)
+            runtime.split_keep = run_cfg.keep or split_keep
+        if keep_override:
+            runtime.split_keep = keep_override
+        yield idx, total_runs, entry, runtime
+
+
+def determine_preview_stage(
+    cli_stage: Optional[int],
+    run_entries: Sequence[RunEntry],
+) -> tuple[Optional[int], Optional[str]]:
+    if cli_stage is not None:
+        return cli_stage, "CLI flag"
+
+    stages: List[int] = []
+    for entry in run_entries:
+        run_cfg = entry.config
+        cfg_stage = getattr(run_cfg, "stage", None) if run_cfg else None
+        if cfg_stage is None:
+            return None, None
+        stages.append(cfg_stage)
+
+    if not stages or any(stage > 5 for stage in stages):
+        return None, None
+
+    if len(set(stages)) == 1:
+        return stages[0], "run config"
+    return min(stages), "run configs"
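Note on `determine_preview_stage`: it only returns a preview stage when the CLI forces one, or when every serve task pins a stage of 5 or lower. A small worked sketch of that decision table, with stand-in types in place of `RunEntry`/`ServeTask` (only the `stage` field matters here):

from typing import List, NamedTuple, Optional, Sequence, Tuple


class Task(NamedTuple):          # stand-in for ServeTask; only `stage` is used
    stage: Optional[int]


class Entry(NamedTuple):         # stand-in for RunEntry
    config: Optional[Task]


def preview_stage(
    cli_stage: Optional[int],
    entries: Sequence[Entry],
) -> Tuple[Optional[int], Optional[str]]:
    # Mirrors determine_preview_stage above: the CLI flag wins outright.
    if cli_stage is not None:
        return cli_stage, "CLI flag"
    stages: List[int] = []
    for entry in entries:
        stage = entry.config.stage if entry.config else None
        if stage is None:            # any task without a stage: full pipeline
            return None, None
        stages.append(stage)
    if not stages or any(s > 5 for s in stages):
        return None, None            # same `stage > 5` guard as the original
    if len(set(stages)) == 1:
        return stages[0], "run config"
    return min(stages), "run configs"


assert preview_stage(3, []) == (3, "CLI flag")
assert preview_stage(None, [Entry(Task(2)), Entry(Task(4))]) == (2, "run configs")
assert preview_stage(None, [Entry(Task(2)), Entry(None)]) == (None, None)
assert preview_stage(None, [Entry(Task(6))]) == (None, None)

When tasks request different stages, `min()` selects the earliest one as the shared preview stage.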