jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +1 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +3 -3
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +74 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.0.dist-info/METADATA +825 -0
- jerry_thomas-1.0.0.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/cli/app.py
CHANGED
@@ -1,17 +1,100 @@
 import argparse
 import logging
+from pathlib import Path
+from typing import Optional, Tuple

 from datapipeline.cli.commands.run import handle_serve
 from datapipeline.cli.commands.plugin import bar as handle_bar
 from datapipeline.cli.commands.source import handle as handle_source
 from datapipeline.cli.commands.domain import handle as handle_domain
-from datapipeline.cli.commands.
+from datapipeline.cli.commands.contract import handle as handle_contract
 from datapipeline.cli.commands.list_ import handle as handle_list
 from datapipeline.cli.commands.filter import handle as handle_filter
 from datapipeline.cli.commands.inspect import (
     report as handle_inspect_report,
 )
 from datapipeline.cli.commands.build import handle as handle_build
+from datapipeline.config.workspace import (
+    WorkspaceContext,
+    load_workspace_context,
+)
+from datapipeline.config.resolution import resolve_visuals
+from datapipeline.utils.rich_compat import suppress_file_proxy_shutdown_errors
+
+suppress_file_proxy_shutdown_errors()
+
+
+def _dataset_to_project_path(
+    dataset: str,
+    workspace: Optional[WorkspaceContext],
+) -> str:
+    """Resolve a dataset selector (alias, folder, or file) into a project.yaml path."""
+    # 1) Alias via jerry.yaml datasets (wins over local folders with same name)
+    if workspace is not None:
+        datasets = getattr(workspace.config, "datasets", {}) or {}
+        raw = datasets.get(dataset)
+        if raw:
+            base = workspace.root
+            candidate = Path(raw)
+            candidate = candidate if candidate.is_absolute() else (base / candidate)
+            if candidate.is_dir():
+                candidate = candidate / "project.yaml"
+            return str(candidate.resolve())
+
+    # 2) Direct file path
+    path = Path(dataset)
+    if path.suffix in {".yaml", ".yml"}:
+        return str(path if path.is_absolute() else (Path.cwd() / path).resolve())
+
+    # 3) Directory: assume project.yaml inside
+    if path.is_dir():
+        candidate = path / "project.yaml"
+        return str(candidate.resolve())
+
+    raise SystemExit(f"Unknown dataset '{dataset}'. Define it under datasets: in jerry.yaml or pass a valid path.")
+
+
+def _resolve_project_from_args(
+    project: Optional[str],
+    dataset: Optional[str],
+    workspace: Optional[WorkspaceContext],
+) -> Tuple[Optional[str], Optional[str]]:
+    """Resolve final project path from --project / --dataset / jerry.yaml defaults.
+
+    Rules:
+    - If both project and dataset are explicitly given (and project != DEFAULT_PROJECT_PATH), error.
+    - If dataset is given, resolve it to a project path (alias, dir, or file).
+    - If neither is given (or project==DEFAULT_PROJECT_PATH), and jerry.yaml declares default_dataset,
+      resolve that alias.
+    - Otherwise fall back to legacy DEFAULT_PROJECT_PATH resolution.
+    """
+    explicit_project = project is not None
+    explicit_dataset = dataset is not None
+
+    if explicit_project and explicit_dataset:
+        raise SystemExit("Cannot use both --project and --dataset; pick one.")
+
+    # Prefer dataset when provided
+    if explicit_dataset:
+        resolved = _dataset_to_project_path(dataset, workspace)
+        return resolved, dataset
+
+    # No explicit dataset; use default_dataset from workspace when project is not explicitly set
+    if not explicit_project and workspace is not None:
+        default_ds = getattr(workspace.config, "default_dataset", None)
+        if default_ds:
+            resolved = _dataset_to_project_path(default_ds, workspace)
+            return resolved, default_ds
+
+    # If project was given explicitly, use it as-is (caller is responsible for validity).
+    if explicit_project:
+        return project, dataset
+
+    # Nothing resolved: require explicit selection.
+    raise SystemExit(
+        "No dataset/project selected. Use --dataset <name|path>, --project <path>, "
+        "or define default_dataset in jerry.yaml."
+    )


 def main() -> None:
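The two helpers above define the new dataset-selection order: a datasets: alias from jerry.yaml wins, then an explicit .yaml/.yml path, then a directory assumed to contain project.yaml. A minimal standalone sketch of that precedence follows; the ws stub and resolve function here are illustrative stand-ins, not the package's API.

from pathlib import Path
from types import SimpleNamespace

# Illustrative stand-in for a loaded jerry.yaml workspace.
ws = SimpleNamespace(
    root=Path("/repo"),
    config=SimpleNamespace(datasets={"example": "your-dataset"}),
)

def resolve(dataset, workspace=None):
    # 1) jerry.yaml alias (beats a local folder with the same name)
    if workspace is not None:
        raw = (getattr(workspace.config, "datasets", {}) or {}).get(dataset)
        if raw:
            cand = Path(raw) if Path(raw).is_absolute() else workspace.root / raw
            return str(cand / "project.yaml" if cand.is_dir() else cand)
    # 2) explicit .yaml/.yml file path
    path = Path(dataset)
    if path.suffix in {".yaml", ".yml"}:
        return str(path)
    # 3) directory assumed to hold a project.yaml
    if path.is_dir():
        return str(path / "project.yaml")
    raise SystemExit(f"Unknown dataset '{dataset}'")

print(resolve("example", ws))        # alias hit: resolves under /repo
print(resolve("conf/project.yaml"))  # direct file: conf/project.yaml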
@@ -37,10 +120,15 @@ def main() -> None:
         help="produce vectors with configurable logging",
         parents=[common],
     )
+    p_serve.add_argument(
+        "--dataset",
+        "-d",
+        help="dataset alias, folder, or project.yaml path",
+    )
     p_serve.add_argument(
         "--project",
         "-p",
-        default=
+        default=None,
         help="path to project.yaml",
     )
     p_serve.add_argument(
@@ -48,22 +136,31 @@ def main() -> None:
         help="optional cap on the number of vectors to emit",
     )
     p_serve.add_argument(
-        "--
-
+        "--out-transport",
+        choices=["stdout", "fs"],
+        help="output transport (stdout or fs) for serve runs",
     )
     p_serve.add_argument(
-        "--
-
-
-
+        "--out-format",
+        choices=["print", "json-lines", "json", "csv", "pickle"],
+        help="output format (print/json-lines/csv/pickle) for serve runs",
+    )
+    p_serve.add_argument(
+        "--out-payload",
+        choices=["sample", "vector"],
+        help="payload structure: full sample (default) or vector-only body",
+    )
+    p_serve.add_argument(
+        "--out-path",
+        help="destination file path when using fs transport",
     )
     p_serve.add_argument(
         "--keep",
-        help="split label to serve; overrides
+        help="split label to serve; overrides serve tasks and project globals",
     )
     p_serve.add_argument(
         "--run",
-        help="select a
+        help="select a serve task by name when project.paths.tasks contains multiple entries",
     )
     p_serve.add_argument(
         "--stage",
@@ -73,6 +170,23 @@ def main() -> None:
         default=None,
         help="preview a specific pipeline stage (0-5 feature stages, 6 assembled vectors, 7 transformed vectors)",
     )
+    p_serve.add_argument(
+        "--visuals",
+        choices=["auto", "tqdm", "rich", "off"],
+        default=None,
+        help="visuals renderer: auto (default), tqdm, rich, or off",
+    )
+    p_serve.add_argument(
+        "--progress",
+        choices=["auto", "spinner", "bars", "off"],
+        default=None,
+        help="progress display: auto (spinner unless DEBUG), spinner, bars, or off",
+    )
+    p_serve.add_argument(
+        "--skip-build",
+        action="store_true",
+        help="skip the automatic build step (useful for quick feature previews)",
+    )

     # build (materialize artifacts)
     p_build = sub.add_parser(
@@ -80,10 +194,15 @@ def main() -> None:
         help="materialize project artifacts (expected ids, hashes, etc.)",
         parents=[common],
     )
+    p_build.add_argument(
+        "--dataset",
+        "-d",
+        help="dataset alias, folder, or project.yaml path",
+    )
     p_build.add_argument(
         "--project",
         "-p",
-        default=
+        default=None,
         help="path to project.yaml",
     )
     p_build.add_argument(
@@ -91,63 +210,99 @@ def main() -> None:
         action="store_true",
         help="rebuild even when the configuration hash matches the last run",
     )
+    p_build.add_argument(
+        "--visuals",
+        choices=["auto", "tqdm", "rich", "off"],
+        default=None,
+        help="visuals renderer: auto (default), tqdm, rich, or off",
+    )
+    p_build.add_argument(
+        "--progress",
+        choices=["auto", "spinner", "bars", "off"],
+        default=None,
+        help="progress display: auto (spinner unless DEBUG), spinner, bars, or off",
+    )

     # source
-
+    p_source = sub.add_parser(
         "source",
         help="add or list raw sources",
         parents=[common],
     )
-
-
+    source_sub = p_source.add_subparsers(dest="source_cmd", required=True)
+    p_source_add = source_sub.add_parser(
         "add",
         help="create a provider+dataset source",
         description=(
             "Scaffold a source using transport + format.\n\n"
+            "Usage:\n"
+            "  jerry source add <provider> <dataset> -t fs -f csv\n"
+            "  jerry source add <provider>.<dataset> -t http -f json\n"
+            "  jerry source add -p <provider> -d <dataset> -t synthetic\n\n"
             "Examples:\n"
             "  fs CSV:    -t fs -f csv\n"
             "  fs NDJSON: -t fs -f json-lines\n"
-            "
+            "  HTTP JSON: -t http -f json\n"
             "  Synthetic: -t synthetic\n\n"
             "Note: set 'glob: true' in the generated YAML if your 'path' contains wildcards."
         ),
     )
-
-
-
+    # Support simple positionals, plus flags for compatibility
+    # Allow either positionals or flags. Use distinct dest names for flags
+    # to avoid ambiguity when both forms are present in some environments.
+    p_source_add.add_argument("provider", nargs="?", help="provider name")
+    p_source_add.add_argument("dataset", nargs="?", help="dataset slug")
+    p_source_add.add_argument("--provider", "-p", dest="provider_opt", metavar="PROVIDER", help="provider name")
+    p_source_add.add_argument("--dataset", "-d", dest="dataset_opt", metavar="DATASET", help="dataset slug")
+    p_source_add.add_argument("--alias", "-a", help="provider.dataset alias")
+    p_source_add.add_argument(
         "--transport", "-t",
-        choices=["fs", "
+        choices=["fs", "http", "synthetic"],
         required=True,
-        help="how data is accessed: fs/
+        help="how data is accessed: fs/http/synthetic",
     )
-
+    p_source_add.add_argument(
         "--format", "-f",
-        choices=["csv", "json", "json-lines"],
-        help="data format for fs/
+        choices=["csv", "json", "json-lines", "pickle"],
+        help="data format for fs/http transports (ignored otherwise)",
+    )
+    p_source_add.add_argument(
+        "--identity",
+        action="store_true",
+        help="use the built-in identity parser (skips DTO/parser scaffolding)",
     )
-
+    source_sub.add_parser("list", help="list known sources")

     # domain
-
+    p_domain = sub.add_parser(
         "domain",
         help="add or list domains",
         parents=[common],
     )
-
-
+    domain_sub = p_domain.add_subparsers(dest="domain_cmd", required=True)
+    p_domain_add = domain_sub.add_parser(
         "add",
         help="create a domain",
         description="Create a time-aware domain package rooted in TemporalRecord.",
     )
-
-
+    # Accept positional name, plus flags for flexibility and consistency.
+    p_domain_add.add_argument("domain", nargs="?", help="domain name")
+    p_domain_add.add_argument(
+        "--name", "-n", dest="domain", help="domain name"
+    )
+    domain_sub.add_parser("list", help="list known domains")

-    # contract (
+    # contract (interactive: ingest or composed)
     p_contract = sub.add_parser(
         "contract",
-        help="
+        help="manage stream contracts (ingest or composed)",
         parents=[common],
     )
+    p_contract.add_argument(
+        "--identity",
+        action="store_true",
+        help="use built-in identity mapper (skip mapper scaffolding)",
+    )

     # plugin (plugin scaffolding)
     p_bar = sub.add_parser(
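This hunk wires source add to accept either positional provider/dataset arguments or the flag forms, routing the flags to distinct dest names (provider_opt, dataset_opt) so neither form can clobber the other; the dispatch code later merges them. A small self-contained sketch of the pattern, assuming nothing beyond the standard library:

import argparse

# The flag gets a distinct dest so it cannot overwrite the positional;
# the two are merged after parsing, with the positional winning.
parser = argparse.ArgumentParser(prog="jerry source add")
parser.add_argument("provider", nargs="?", help="provider name (positional form)")
parser.add_argument("--provider", "-p", dest="provider_opt", help="provider name (flag form)")

args = parser.parse_args(["-p", "synthetic"])
provider = args.provider or args.provider_opt  # flag fills in when positional is absent
assert provider == "synthetic"

args = parser.parse_args(["acme"])
assert (args.provider or args.provider_opt) == "acme"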
@@ -158,7 +313,9 @@ def main() -> None:
     bar_sub = p_bar.add_subparsers(dest="bar_cmd", required=True)
     p_bar_init = bar_sub.add_parser(
         "init", help="create a plugin skeleton")
-
+    # Accept positional name and flag for flexibility
+    p_bar_init.add_argument("name", nargs="?", help="plugin distribution name")
+    p_bar_init.add_argument("--name", "-n", dest="name", help="plugin distribution name")
     p_bar_init.add_argument("--out", "-o", default=".")

     # filter (unchanged helper)
@@ -171,11 +328,31 @@ def main() -> None:
         help="filter entrypoint name and function/module name",
     )

+    # Shared visuals/progress controls for inspect commands
+    inspect_common = argparse.ArgumentParser(add_help=False)
+    inspect_common.add_argument(
+        "--visuals",
+        choices=["auto", "tqdm", "rich", "off"],
+        default=None,
+        help="visuals renderer: auto (default), tqdm, rich, or off",
+    )
+    inspect_common.add_argument(
+        "--progress",
+        choices=["auto", "spinner", "bars", "off"],
+        default=None,
+        help="progress display: auto (spinner unless DEBUG), spinner, bars, or off",
+    )
+    inspect_common.add_argument(
+        "--dataset",
+        "-d",
+        help="dataset alias, folder, or project.yaml path",
+    )
+
     # inspect (metadata helpers)
     p_inspect = sub.add_parser(
         "inspect",
-        help="inspect dataset metadata: report,
-        parents=[common],
+        help="inspect dataset metadata: report, matrix, partitions",
+        parents=[common, inspect_common],
     )
     inspect_sub = p_inspect.add_subparsers(dest="inspect_cmd", required=False)

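The new inspect_common parser is attached via parents= both to inspect itself and to each subcommand below, so the shared flags are accepted on either side of the subcommand name. A compact sketch of why that works: argparse copies parent arguments into each parser, and a subparser does not re-apply a default for an attribute the parent already set on the namespace. This is a standalone illustration, not the package's code.

import argparse

shared = argparse.ArgumentParser(add_help=False)
shared.add_argument("--visuals", choices=["auto", "tqdm", "rich", "off"], default=None)

root = argparse.ArgumentParser(prog="jerry")
sub = root.add_subparsers(dest="cmd")
inspect_p = sub.add_parser("inspect", parents=[shared])
inspect_sub = inspect_p.add_subparsers(dest="inspect_cmd")
inspect_sub.add_parser("report", parents=[shared])

# Both spellings resolve to the same namespace attribute.
assert root.parse_args(["inspect", "--visuals", "rich", "report"]).visuals == "rich"
assert root.parse_args(["inspect", "report", "--visuals", "rich"]).visuals == "rich"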
@@ -183,11 +360,12 @@ def main() -> None:
     p_inspect_report = inspect_sub.add_parser(
         "report",
         help="print a quality report to stdout",
+        parents=[inspect_common],
     )
     p_inspect_report.add_argument(
         "--project",
         "-p",
-        default=
+        default=None,
         help="path to project.yaml",
     )
     p_inspect_report.add_argument(
@@ -210,62 +388,22 @@ def main() -> None:
         help="whether to apply postprocess transforms (final) or skip them (raw)",
     )
     p_inspect_report.add_argument(
-        "--
-
-
-
-
-    # Coverage (JSON file)
-    p_inspect_cov = inspect_sub.add_parser(
-        "coverage",
-        help="write coverage summary JSON",
-    )
-    p_inspect_cov.add_argument(
-        "--project",
-        "-p",
-        default="config/datasets/default/project.yaml",
-        help="path to project.yaml",
-    )
-    p_inspect_cov.add_argument(
-        "--output",
-        "-o",
-        default=None,
-        help="coverage JSON path (defaults to build/coverage.json)",
-    )
-    p_inspect_cov.add_argument(
-        "--threshold",
-        "-t",
-        type=float,
-        default=0.95,
-        help="coverage threshold (0-1) for keep/drop lists",
-    )
-    p_inspect_cov.add_argument(
-        "--match-partition",
-        choices=["base", "full"],
-        default="base",
-        help="match features by base id or full partition id",
-    )
-    p_inspect_cov.add_argument(
-        "--mode",
-        choices=["final", "raw"],
-        default="final",
-        help="whether to apply postprocess transforms (final) or skip them (raw)",
-    )
-    p_inspect_cov.add_argument(
-        "--include-targets",
-        action="store_true",
-        help="include dataset.targets when computing coverage",
+        "--sort",
+        choices=["missing", "nulls"],
+        default="missing",
+        help="feature ranking metric in the report (missing or nulls)",
     )

     # Matrix export
     p_inspect_matrix = inspect_sub.add_parser(
         "matrix",
         help="export availability matrix",
+        parents=[inspect_common],
     )
     p_inspect_matrix.add_argument(
         "--project",
         "-p",
-        default=
+        default=None,
         help="path to project.yaml",
     )
     p_inspect_matrix.add_argument(
@@ -309,21 +447,17 @@ def main() -> None:
         default="final",
         help="whether to apply postprocess transforms (final) or skip them (raw)",
     )
-    p_inspect_matrix.add_argument(
-        "--include-targets",
-        action="store_true",
-        help="include dataset.targets when exporting the matrix",
-    )

     # Partitions manifest subcommand
     p_inspect_parts = inspect_sub.add_parser(
         "partitions",
         help="discover partitions and write a manifest JSON",
+        parents=[inspect_common],
     )
     p_inspect_parts.add_argument(
         "--project",
         "-p",
-        default=
+        default=None,
         help="path to project.yaml",
     )
     p_inspect_parts.add_argument(
@@ -332,21 +466,17 @@ def main() -> None:
         default=None,
         help="partitions manifest path (defaults to build/partitions.json)",
     )
-    p_inspect_parts.add_argument(
-        "--include-targets",
-        action="store_true",
-        help="include dataset.targets when discovering partitions",
-    )

     # Expected IDs (newline list)
     p_inspect_expected = inspect_sub.add_parser(
         "expected",
         help="discover full feature ids and write a newline list",
+        parents=[inspect_common],
     )
     p_inspect_expected.add_argument(
         "--project",
         "-p",
-        default=
+        default=None,
         help="path to project.yaml",
     )
     p_inspect_expected.add_argument(
@@ -355,47 +485,87 @@ def main() -> None:
         default=None,
         help="expected ids output path (defaults to build/datasets/<name>/expected.txt)",
     )
-    p_inspect_expected.add_argument(
-        "--include-targets",
-        action="store_true",
-        help="include dataset.targets when discovering expected ids",
-    )

+    workspace_context = load_workspace_context(Path.cwd())
     args = parser.parse_args()

+    # Resolve dataset/project selection for commands that use a project.
+    if hasattr(args, "project") or hasattr(args, "dataset"):
+        raw_project = getattr(args, "project", None)
+        raw_dataset = getattr(args, "dataset", None)
+        resolved_project, resolved_dataset = _resolve_project_from_args(
+            raw_project,
+            raw_dataset,
+            workspace_context,
+        )
+        if hasattr(args, "project"):
+            args.project = resolved_project
+        if hasattr(args, "dataset"):
+            args.dataset = resolved_dataset
+
     cli_level_arg = getattr(args, "log_level", None)
-
+    shared_defaults = workspace_context.config.shared if workspace_context else None
+    # Default logging level: CLI flag > jerry.yaml shared.log_level > INFO
+    default_level_name = (
+        shared_defaults.log_level.upper()
+        if shared_defaults and shared_defaults.log_level
+        else "INFO"
+    )
+    base_level_name = (cli_level_arg or default_level_name).upper()
     base_level = logging._nameToLevel.get(base_level_name, logging.WARNING)

     logging.basicConfig(level=base_level, format="%(message)s")
+    plugin_root = (
+        workspace_context.resolve_plugin_root() if workspace_context else None
+    )

     if args.cmd == "serve":
         handle_serve(
             project=args.project,
             limit=getattr(args, "limit", None),
-            output=args.output,
-            include_targets=args.include_targets,
             keep=getattr(args, "keep", None),
             run_name=getattr(args, "run", None),
             stage=getattr(args, "stage", None),
+            out_transport=getattr(args, "out_transport", None),
+            out_format=getattr(args, "out_format", None),
+            out_payload=getattr(args, "out_payload", None),
+            out_path=getattr(args, "out_path", None),
+            skip_build=getattr(args, "skip_build", False),
             cli_log_level=cli_level_arg,
             base_log_level=base_level_name,
+            cli_visuals=getattr(args, "visuals", None),
+            cli_progress=getattr(args, "progress", None),
+            workspace=workspace_context,
         )
         return
     if args.cmd == "build":
         handle_build(
             project=args.project,
             force=getattr(args, "force", False),
+            cli_visuals=getattr(args, "visuals", None),
+            cli_progress=getattr(args, "progress", None),
+            workspace=workspace_context,
         )
         return

     if args.cmd == "inspect":
         # Default to 'report' when no subcommand is given
         subcmd = getattr(args, "inspect_cmd", None)
+        shared_visuals_default = shared_defaults.visuals if shared_defaults else None
+        shared_progress_default = shared_defaults.progress if shared_defaults else None
+        inspect_visuals = resolve_visuals(
+            cli_visuals=getattr(args, "visuals", None),
+            config_visuals=None,
+            workspace_visuals=shared_visuals_default,
+            cli_progress=getattr(args, "progress", None),
+            config_progress=None,
+            workspace_progress=shared_progress_default,
+        )
+        inspect_visual_provider = inspect_visuals.visuals or "auto"
+        inspect_progress_style = inspect_visuals.progress or "auto"
         if subcmd in (None, "report"):
             handle_inspect_report(
-                project=
-                "config/datasets/default/project.yaml"),
+                project=args.project,
                 output=None,
                 threshold=getattr(args, "threshold", 0.95),
                 match_partition=getattr(args, "match_partition", "base"),
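The wiring above establishes the precedence chain for logging: an explicit CLI flag beats jerry.yaml's shared.log_level, which beats the INFO fallback, and unrecognized names degrade to WARNING via logging._nameToLevel. A tiny sketch of that chain; effective_level is an illustrative helper, not part of the package:

import logging

# CLI flag > workspace (jerry.yaml shared.log_level) > INFO;
# unknown names fall back to WARNING, mirroring the lookup above.
def effective_level(cli_flag=None, workspace_level=None):
    name = (cli_flag or workspace_level or "INFO").upper()
    return logging._nameToLevel.get(name, logging.WARNING)

assert effective_level("debug", "warning") == logging.DEBUG  # CLI wins
assert effective_level(None, "warning") == logging.WARNING   # jerry.yaml next
assert effective_level() == logging.INFO                     # default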
@@ -406,22 +576,11 @@ def main() -> None:
                 quiet=False,
                 write_coverage=False,
                 apply_postprocess=(getattr(args, "mode", "final") == "final"),
-
-
-
-
-
-                output=getattr(args, "output", None),
-                threshold=getattr(args, "threshold", 0.95),
-                match_partition=getattr(args, "match_partition", "base"),
-                matrix="none",
-                matrix_output=None,
-                rows=20,
-                cols=10,
-                quiet=True,
-                write_coverage=True,
-                apply_postprocess=(getattr(args, "mode", "final") == "final"),
-                include_targets=getattr(args, "include_targets", False),
+                visuals=inspect_visual_provider,
+                progress=inspect_progress_style,
+                log_level=base_level,
+                sort=getattr(args, "sort", "missing"),
+                workspace=workspace_context,
             )
         elif subcmd == "matrix":
             handle_inspect_report(
@@ -436,49 +595,67 @@ def main() -> None:
                 quiet=getattr(args, "quiet", False),
                 write_coverage=False,
                 apply_postprocess=(getattr(args, "mode", "final") == "final"),
-
+                visuals=inspect_visual_provider,
+                progress=inspect_progress_style,
+                log_level=base_level,
+                sort=getattr(args, "sort", "missing"),
+                workspace=workspace_context,
             )
         elif subcmd == "partitions":
             from datapipeline.cli.commands.inspect import partitions as handle_inspect_partitions
             handle_inspect_partitions(
                 project=args.project,
                 output=getattr(args, "output", None),
-
+                visuals=inspect_visual_provider,
+                progress=inspect_progress_style,
+                log_level=base_level,
+                workspace=workspace_context,
             )
         elif subcmd == "expected":
             from datapipeline.cli.commands.inspect import expected as handle_inspect_expected
             handle_inspect_expected(
                 project=args.project,
                 output=getattr(args, "output", None),
-
+                visuals=inspect_visual_provider,
+                progress=inspect_progress_style,
+                log_level=base_level,
+                workspace=workspace_context,
             )
         return

     if args.cmd == "source":
-        if args.
+        if args.source_cmd == "list":
             handle_list(subcmd="sources")
         else:
+            # Merge positionals and flags for provider/dataset
             handle_source(
                 subcmd="add",
-                provider=getattr(args, "provider", None),
-                dataset=getattr(args, "dataset", None),
+                provider=(getattr(args, "provider", None) or getattr(args, "provider_opt", None)),
+                dataset=(getattr(args, "dataset", None) or getattr(args, "dataset_opt", None)),
                 transport=getattr(args, "transport", None),
                 format=getattr(args, "format", None),
+                alias=getattr(args, "alias", None),
+                identity=getattr(args, "identity", False),
+                plugin_root=plugin_root,
             )
         return

     if args.cmd == "domain":
-        if args.
+        if args.domain_cmd == "list":
             handle_list(subcmd="domains")
         else:
             handle_domain(
                 subcmd="add",
                 domain=getattr(args, "domain", None),
+                plugin_root=plugin_root,
             )
         return

     if args.cmd == "contract":
-
+        handle_contract(
+            plugin_root=plugin_root,
+            use_identity=args.identity,
+        )
         return

     if args.cmd == "plugin":
|