jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +5 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +54 -10
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +76 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.1.dist-info/METADATA +825 -0
- jerry_thomas-1.0.1.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
|
@@ -1,16 +1,104 @@
|
|
|
1
1
|
import io
|
|
2
2
|
import json
|
|
3
|
+
import logging
|
|
4
|
+
import sys
|
|
3
5
|
from contextlib import redirect_stdout
|
|
4
6
|
from pathlib import Path
|
|
7
|
+
from typing import Iterable, Iterator, TypeVar
|
|
5
8
|
|
|
6
9
|
from datapipeline.analysis.vector.collector import VectorStatsCollector
|
|
10
|
+
from datapipeline.cli.visuals.runner import run_job
|
|
11
|
+
from datapipeline.config.context import load_dataset_context
|
|
7
12
|
from datapipeline.config.dataset.loader import load_dataset
|
|
8
|
-
from datapipeline.services.bootstrap import bootstrap
|
|
9
13
|
from datapipeline.utils.paths import ensure_parent
|
|
10
14
|
from datapipeline.services.bootstrap import artifacts_root
|
|
11
|
-
from datapipeline.pipeline.context import PipelineContext
|
|
12
15
|
from datapipeline.pipeline.pipelines import build_vector_pipeline
|
|
13
16
|
from datapipeline.pipeline.stages import post_process
|
|
17
|
+
from datapipeline.pipeline.artifacts import StageDemand, required_artifacts_for
|
|
18
|
+
from datapipeline.cli.commands.build import run_build_if_needed
|
|
19
|
+
from tqdm import tqdm
|
|
20
|
+
|
|
21
|
+
T = TypeVar("T")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _prepare_inspect_build(
    project: str | Path,
    *,
    visuals: str | None,
    progress: str | None,
    workspace=None,
) -> None:
    """Run a build for ``project`` when the inspect commands need artifacts.

    Loads the dataset in "vectors" mode, asks ``required_artifacts_for``
    what a single stage-less demand requires, and forwards a non-empty
    answer to ``run_build_if_needed``. Nothing is built when no artifacts
    are required.

    Args:
        project: Path to the project file.
        visuals: Forwarded to the build as ``cli_visuals``.
        progress: Forwarded to the build as ``cli_progress``.
        workspace: Forwarded to the build unchanged.
    """
    root = Path(project)
    needed = required_artifacts_for(
        load_dataset(root, "vectors"),
        [StageDemand(stage=None)],
    )
    if needed:
        run_build_if_needed(
            root,
            required_artifacts=needed,
            cli_visuals=visuals,
            cli_progress=progress,
            workspace=workspace,
        )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _iter_with_progress(
    iterable: Iterable[T],
    *,
    progress_style: str | None,
    label: str,
) -> Iterator[T]:
    """Yield every item of ``iterable``, optionally behind a tqdm display.

    Args:
        iterable: Items to pass through unchanged and in order.
        progress_style: Case-insensitive style name. ``"off"`` yields the
            items with no display at all. ``"spinner"`` shows only the
            label and a running count. ``None`` or ``"auto"`` resolves to
            ``"bars"`` when the root logger has DEBUG enabled and to
            ``"spinner"`` otherwise. Any other value uses tqdm's default
            bar layout.
        label: Description shown next to the counter.

    Yields:
        The items of ``iterable``.
    """
    style = (progress_style or "auto").lower()
    if style == "auto":
        # Default to a light spinner unless DEBUG logging is active.
        debug_on = logging.getLogger().isEnabledFor(logging.DEBUG)
        style = "bars" if debug_on else "spinner"
    if style == "off":
        yield from iterable
        return

    bar_kwargs = {
        "desc": label,
        "unit": "vec",
        "dynamic_ncols": True,
        "mininterval": 0.2,
        "leave": False,
        # tqdm draws on stderr by default, so the bar is suppressed when
        # stderr is not a TTY (e.g. output captured into logs) to avoid
        # noisy multi-line progress.
        "disable": not sys.stderr.isatty(),
    }
    if style == "spinner":
        bar_kwargs["bar_format"] = "{desc} {n_fmt}{unit}"

    # The context manager closes the bar on exhaustion, on error and when
    # the consumer abandons this generator early.
    with tqdm(iterable, **bar_kwargs) as bar:
        yield from bar
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _run_inspect_job(
    project: str,
    *,
    visuals: str | None,
    progress: str | None,
    log_level: int | None,
    label: str,
    section: str,
    work,
) -> None:
    """Load the dataset context for ``project`` and run ``work`` via ``run_job``.

    ``work`` is invoked as ``work(dataset_ctx, progress_style)``. A missing
    ``visuals`` or ``progress`` value falls back to ``"auto"``; a missing
    ``log_level`` falls back to the root logger's effective level. The job
    is registered under the sections ``("inspect", section)``.
    """
    ctx = load_dataset_context(project)

    if log_level is None:
        effective_level = logging.getLogger().getEffectiveLevel()
    else:
        effective_level = log_level

    provider = visuals or "auto"
    style = progress or "auto"

    run_job(
        sections=("inspect", section),
        label=label,
        visuals=provider,
        progress_style=style,
        level=effective_level,
        runtime=ctx.runtime,
        work=lambda: work(ctx, style),
    )
|
|
14
102
|
|
|
15
103
|
|
|
16
104
|
def report(
|
|
@@ -27,7 +115,11 @@ def report(
|
|
|
27
115
|
quiet: bool = False,
|
|
28
116
|
write_coverage: bool = True,
|
|
29
117
|
apply_postprocess: bool = True,
|
|
30
|
-
|
|
118
|
+
visuals: str | None = None,
|
|
119
|
+
progress: str | None = None,
|
|
120
|
+
log_level: int | None = None,
|
|
121
|
+
sort: str = "missing",
|
|
122
|
+
workspace=None,
|
|
31
123
|
) -> None:
|
|
32
124
|
"""Compute a quality report and optionally export coverage JSON and/or a matrix.
|
|
33
125
|
|
|
@@ -36,92 +128,189 @@ def report(
|
|
|
36
128
|
- When matrix != 'none', writes an availability matrix in the requested format.
|
|
37
129
|
"""
|
|
38
130
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
feature_cfgs = list(dataset.features or [])
|
|
45
|
-
if include_targets:
|
|
46
|
-
feature_cfgs += list(dataset.targets or [])
|
|
47
|
-
expected_feature_ids = [cfg.id for cfg in feature_cfgs]
|
|
48
|
-
|
|
49
|
-
# Resolve matrix format and path
|
|
50
|
-
matrix_fmt = (fmt or matrix) if matrix in {"csv", "html"} else None
|
|
51
|
-
if matrix_fmt:
|
|
52
|
-
filename = "matrix.html" if matrix_fmt == "html" else "matrix.csv"
|
|
53
|
-
else:
|
|
54
|
-
filename = None
|
|
55
|
-
base_artifacts = artifacts_root(project_path)
|
|
56
|
-
matrix_path = None
|
|
57
|
-
if matrix_fmt:
|
|
58
|
-
matrix_path = Path(matrix_output) if matrix_output else (base_artifacts / filename)
|
|
59
|
-
|
|
60
|
-
collector = VectorStatsCollector(
|
|
61
|
-
expected_feature_ids or None,
|
|
62
|
-
match_partition=match_partition,
|
|
63
|
-
threshold=threshold,
|
|
64
|
-
show_matrix=False,
|
|
65
|
-
matrix_rows=rows,
|
|
66
|
-
matrix_cols=cols,
|
|
67
|
-
matrix_output=(str(matrix_path) if matrix_path else None),
|
|
68
|
-
matrix_format=(matrix_fmt or "html"),
|
|
131
|
+
_prepare_inspect_build(
|
|
132
|
+
project,
|
|
133
|
+
visuals=visuals,
|
|
134
|
+
progress=progress,
|
|
135
|
+
workspace=workspace,
|
|
69
136
|
)
|
|
137
|
+
coverage_path: Path | None = None
|
|
70
138
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
vectors = post_process(context, vectors) # use global postprocess
|
|
76
|
-
|
|
77
|
-
for group_key, vector in vectors:
|
|
78
|
-
collector.update(group_key, vector.values)
|
|
79
|
-
|
|
80
|
-
buffer = io.StringIO()
|
|
81
|
-
with redirect_stdout(buffer):
|
|
82
|
-
summary = collector.print_report()
|
|
83
|
-
if not quiet:
|
|
84
|
-
report_text = buffer.getvalue()
|
|
85
|
-
if report_text.strip():
|
|
86
|
-
print(report_text, end="")
|
|
87
|
-
|
|
88
|
-
# Optionally write coverage summary JSON to a path
|
|
89
|
-
if write_coverage:
|
|
90
|
-
output_path = Path(output) if output else (base_artifacts / "coverage.json")
|
|
91
|
-
ensure_parent(output_path)
|
|
139
|
+
def _work(dataset_ctx, progress_style):
|
|
140
|
+
project_path = dataset_ctx.project
|
|
141
|
+
context = dataset_ctx.pipeline_context
|
|
142
|
+
dataset = dataset_ctx.dataset
|
|
92
143
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
trimmed = {
|
|
97
|
-
"total_vectors": summary.get("total_vectors", collector.total_vectors),
|
|
98
|
-
"empty_vectors": summary.get("empty_vectors", collector.empty_vectors),
|
|
99
|
-
"threshold": threshold,
|
|
100
|
-
"match_partition": match_partition,
|
|
101
|
-
"features": {
|
|
102
|
-
"keep": summary.get("keep_features", []),
|
|
103
|
-
"below": summary.get("below_features", []),
|
|
104
|
-
"coverage": {stat["id"]: stat["coverage"] for stat in feature_stats},
|
|
105
|
-
},
|
|
106
|
-
"partitions": {
|
|
107
|
-
"keep": summary.get("keep_partitions", []),
|
|
108
|
-
"below": summary.get("below_partitions", []),
|
|
109
|
-
"keep_suffixes": summary.get("keep_suffixes", []),
|
|
110
|
-
"below_suffixes": summary.get("below_suffixes", []),
|
|
111
|
-
"coverage": {stat["id"]: stat["coverage"] for stat in partition_stats},
|
|
112
|
-
},
|
|
113
|
-
}
|
|
144
|
+
feature_cfgs = dataset_ctx.features
|
|
145
|
+
target_cfgs = dataset_ctx.targets
|
|
146
|
+
expected_feature_ids = [cfg.id for cfg in feature_cfgs]
|
|
114
147
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
148
|
+
matrix_fmt = (fmt or matrix) if matrix in {"csv", "html"} else None
|
|
149
|
+
if matrix_fmt:
|
|
150
|
+
filename = "matrix.html" if matrix_fmt == "html" else "matrix.csv"
|
|
151
|
+
else:
|
|
152
|
+
filename = None
|
|
153
|
+
base_artifacts = artifacts_root(project_path)
|
|
154
|
+
matrix_path = None
|
|
155
|
+
if matrix_fmt:
|
|
156
|
+
matrix_path = Path(matrix_output) if matrix_output else (base_artifacts / filename)
|
|
157
|
+
|
|
158
|
+
schema_entries = dataset_ctx.pipeline_context.load_schema(payload="features")
|
|
159
|
+
schema_meta = {entry["id"]: entry for entry in (schema_entries or []) if isinstance(entry.get("id"), str)}
|
|
160
|
+
|
|
161
|
+
collector = VectorStatsCollector(
|
|
162
|
+
expected_feature_ids or None,
|
|
163
|
+
match_partition=match_partition,
|
|
164
|
+
schema_meta=schema_meta,
|
|
165
|
+
threshold=threshold,
|
|
166
|
+
show_matrix=False,
|
|
167
|
+
matrix_rows=rows,
|
|
168
|
+
matrix_cols=cols,
|
|
169
|
+
matrix_output=(str(matrix_path) if matrix_path else None),
|
|
170
|
+
matrix_format=(matrix_fmt or "html"),
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
context.window_bounds(rectangular_required=True)
|
|
174
|
+
vectors = build_vector_pipeline(
|
|
175
|
+
context,
|
|
176
|
+
feature_cfgs,
|
|
177
|
+
dataset.group_by,
|
|
178
|
+
target_configs=target_cfgs,
|
|
179
|
+
rectangular=True,
|
|
180
|
+
)
|
|
181
|
+
if apply_postprocess:
|
|
182
|
+
vectors = post_process(context, vectors)
|
|
183
|
+
|
|
184
|
+
vector_iter = _iter_with_progress(
|
|
185
|
+
vectors,
|
|
186
|
+
progress_style=progress_style,
|
|
187
|
+
label="Processing vectors",
|
|
188
|
+
)
|
|
189
|
+
for sample in vector_iter:
|
|
190
|
+
merged = dict(sample.features.values)
|
|
191
|
+
if sample.targets:
|
|
192
|
+
merged.update(sample.targets.values)
|
|
193
|
+
collector.update(sample.key, merged)
|
|
194
|
+
|
|
195
|
+
buffer = io.StringIO()
|
|
196
|
+
with redirect_stdout(buffer):
|
|
197
|
+
summary = collector.print_report(sort_key=sort)
|
|
198
|
+
if not quiet:
|
|
199
|
+
report_text = buffer.getvalue()
|
|
200
|
+
if report_text.strip():
|
|
201
|
+
print(report_text, end="")
|
|
202
|
+
|
|
203
|
+
if write_coverage:
|
|
204
|
+
output_path = Path(output) if output else (base_artifacts / "coverage.json")
|
|
205
|
+
ensure_parent(output_path)
|
|
206
|
+
|
|
207
|
+
feature_stats = summary.get("feature_stats", [])
|
|
208
|
+
partition_stats = summary.get("partition_stats", [])
|
|
209
|
+
|
|
210
|
+
trimmed = {
|
|
211
|
+
"total_vectors": summary.get("total_vectors", collector.total_vectors),
|
|
212
|
+
"empty_vectors": summary.get("empty_vectors", collector.empty_vectors),
|
|
213
|
+
"threshold": threshold,
|
|
214
|
+
"match_partition": match_partition,
|
|
215
|
+
"features": {
|
|
216
|
+
"keep": summary.get("keep_features", []),
|
|
217
|
+
"below": summary.get("below_features", []),
|
|
218
|
+
"coverage": {stat["id"]: stat["coverage"] for stat in feature_stats},
|
|
219
|
+
"availability": {
|
|
220
|
+
stat["id"]: (
|
|
221
|
+
stat["present"] / stat["opportunities"]
|
|
222
|
+
if stat.get("opportunities")
|
|
223
|
+
else 0
|
|
224
|
+
)
|
|
225
|
+
for stat in feature_stats
|
|
226
|
+
},
|
|
227
|
+
"nulls": {stat["id"]: stat.get("nulls", 0) for stat in feature_stats},
|
|
228
|
+
"null_rate": {
|
|
229
|
+
stat["id"]: (
|
|
230
|
+
stat.get("nulls", 0) / stat["opportunities"]
|
|
231
|
+
if stat.get("opportunities")
|
|
232
|
+
else 0
|
|
233
|
+
)
|
|
234
|
+
for stat in feature_stats
|
|
235
|
+
},
|
|
236
|
+
"cadence_nulls": {
|
|
237
|
+
stat["id"]: stat.get("cadence_nulls")
|
|
238
|
+
for stat in feature_stats
|
|
239
|
+
if stat.get("cadence_opportunities")
|
|
240
|
+
},
|
|
241
|
+
"cadence_opportunities": {
|
|
242
|
+
stat["id"]: stat.get("cadence_opportunities")
|
|
243
|
+
for stat in feature_stats
|
|
244
|
+
if stat.get("cadence_opportunities")
|
|
245
|
+
},
|
|
246
|
+
},
|
|
247
|
+
"partitions": {
|
|
248
|
+
"keep": summary.get("keep_partitions", []),
|
|
249
|
+
"below": summary.get("below_partitions", []),
|
|
250
|
+
"keep_suffixes": summary.get("keep_suffixes", []),
|
|
251
|
+
"below_suffixes": summary.get("below_suffixes", []),
|
|
252
|
+
"keep_values": summary.get("keep_partition_values", []),
|
|
253
|
+
"below_values": summary.get("below_partition_values", []),
|
|
254
|
+
"coverage": {stat["id"]: stat["coverage"] for stat in partition_stats},
|
|
255
|
+
"availability": {
|
|
256
|
+
stat["id"]: (
|
|
257
|
+
stat["present"] / stat["opportunities"]
|
|
258
|
+
if stat.get("opportunities")
|
|
259
|
+
else 0
|
|
260
|
+
)
|
|
261
|
+
for stat in partition_stats
|
|
262
|
+
},
|
|
263
|
+
"nulls": {
|
|
264
|
+
stat["id"]: stat.get("nulls", 0) for stat in partition_stats
|
|
265
|
+
},
|
|
266
|
+
"null_rate": {
|
|
267
|
+
stat["id"]: (
|
|
268
|
+
stat.get("nulls", 0) / stat["opportunities"]
|
|
269
|
+
if stat.get("opportunities")
|
|
270
|
+
else 0
|
|
271
|
+
)
|
|
272
|
+
for stat in partition_stats
|
|
273
|
+
},
|
|
274
|
+
"cadence_nulls": {
|
|
275
|
+
stat["id"]: stat.get("cadence_nulls")
|
|
276
|
+
for stat in partition_stats
|
|
277
|
+
if stat.get("cadence_opportunities")
|
|
278
|
+
},
|
|
279
|
+
"cadence_opportunities": {
|
|
280
|
+
stat["id"]: stat.get("cadence_opportunities")
|
|
281
|
+
for stat in partition_stats
|
|
282
|
+
if stat.get("cadence_opportunities")
|
|
283
|
+
},
|
|
284
|
+
},
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
with output_path.open("w", encoding="utf-8") as fh:
|
|
288
|
+
json.dump(trimmed, fh, indent=2)
|
|
289
|
+
print(f"[write] Saved coverage summary to {output_path}")
|
|
290
|
+
coverage_path = output_path
|
|
291
|
+
|
|
292
|
+
_run_inspect_job(
|
|
293
|
+
project,
|
|
294
|
+
visuals=visuals,
|
|
295
|
+
progress=progress,
|
|
296
|
+
log_level=log_level,
|
|
297
|
+
label="Inspect report",
|
|
298
|
+
section="report",
|
|
299
|
+
work=_work,
|
|
300
|
+
)
|
|
301
|
+
|
|
302
|
+
if write_coverage and coverage_path:
|
|
303
|
+
print(f"[inspect] Coverage summary available at {coverage_path}")
|
|
118
304
|
|
|
119
305
|
|
|
120
306
|
def partitions(
|
|
121
307
|
project: str,
|
|
122
308
|
*,
|
|
123
309
|
output: str | None = None,
|
|
124
|
-
|
|
310
|
+
visuals: str | None = None,
|
|
311
|
+
progress: str | None = None,
|
|
312
|
+
log_level: int | None = None,
|
|
313
|
+
workspace=None,
|
|
125
314
|
) -> None:
|
|
126
315
|
"""Discover observed partitions and write a manifest JSON.
|
|
127
316
|
|
|
@@ -131,90 +320,153 @@ def partitions(
|
|
|
131
320
|
- by_feature: mapping base id -> list of suffixes (empty when none)
|
|
132
321
|
"""
|
|
133
322
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
if include_targets:
|
|
140
|
-
feature_cfgs += list(dataset.targets or [])
|
|
141
|
-
expected_feature_ids = [cfg.id for cfg in feature_cfgs]
|
|
142
|
-
collector = VectorStatsCollector(
|
|
143
|
-
expected_feature_ids or None,
|
|
144
|
-
match_partition="full",
|
|
145
|
-
threshold=None,
|
|
146
|
-
show_matrix=False,
|
|
323
|
+
_prepare_inspect_build(
|
|
324
|
+
project,
|
|
325
|
+
visuals=visuals,
|
|
326
|
+
progress=progress,
|
|
327
|
+
workspace=workspace,
|
|
147
328
|
)
|
|
148
329
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
vectors = post_process(context, vectors) # apply global postprocess
|
|
152
|
-
for group_key, vector in vectors:
|
|
153
|
-
collector.update(group_key, vector.values)
|
|
154
|
-
|
|
155
|
-
base_artifacts = artifacts_root(project_path)
|
|
156
|
-
output_path = Path(output) if output else (base_artifacts / "partitions.json")
|
|
157
|
-
ensure_parent(output_path)
|
|
158
|
-
|
|
159
|
-
parts = sorted(collector.discovered_partitions)
|
|
160
|
-
features = sorted({pid.split("__", 1)[0] for pid in parts})
|
|
161
|
-
by_feature: dict[str, list[str]] = {}
|
|
162
|
-
for pid in parts:
|
|
163
|
-
if "__" in pid:
|
|
164
|
-
base, suffix = pid.split("__", 1)
|
|
165
|
-
else:
|
|
166
|
-
base, suffix = pid, ""
|
|
167
|
-
by_feature.setdefault(base, [])
|
|
168
|
-
if suffix and suffix not in by_feature[base]:
|
|
169
|
-
by_feature[base].append(suffix)
|
|
170
|
-
for k in list(by_feature.keys()):
|
|
171
|
-
by_feature[k] = sorted(by_feature[k])
|
|
172
|
-
|
|
173
|
-
data = {
|
|
174
|
-
"features": features,
|
|
175
|
-
"partitions": parts,
|
|
176
|
-
"by_feature": by_feature,
|
|
177
|
-
}
|
|
330
|
+
def _work(dataset_ctx, progress_style):
    """Collect the observed partitions and write them as a JSON manifest.

    Streams the post-processed vectors through a ``VectorStatsCollector``
    and writes three entries: ``features`` (base ids), ``partitions``
    (full ids) and ``by_feature`` (base id -> sorted suffixes). The
    destination is the enclosing function's ``output`` when given,
    otherwise ``partitions.json`` under the project's artifacts root.
    """
    dataset = dataset_ctx.dataset
    features_cfg = list(dataset.features or [])
    targets_cfg = list(dataset.targets or [])
    wanted_ids = [cfg.id for cfg in features_cfg]

    artifacts_dir = artifacts_root(dataset_ctx.project)
    if output:
        output_path = Path(output)
    else:
        output_path = artifacts_dir / "partitions.json"

    collector = VectorStatsCollector(
        wanted_ids or None,
        match_partition="full",
        threshold=None,
        show_matrix=False,
    )

    context = dataset_ctx.pipeline_context
    context.window_bounds(rectangular_required=True)
    stream = post_process(
        context,
        build_vector_pipeline(
            context,
            features_cfg,
            dataset.group_by,
            target_configs=targets_cfg,
            rectangular=True,
        ),
    )
    for sample in _iter_with_progress(
        stream,
        progress_style=progress_style,
        label="Processing vectors",
    ):
        # Features and targets are tracked together as one value mapping.
        values = dict(sample.features.values)
        if sample.targets:
            values.update(sample.targets.values)
        collector.update(sample.key, values)

    ensure_parent(output_path)

    parts = sorted(collector.discovered_partitions)
    features = sorted({pid.split("__", 1)[0] for pid in parts})

    # Group suffixes under their base id; ids without "__" get an empty list.
    suffixes_by_base: dict[str, set[str]] = {}
    for pid in parts:
        base, _, suffix = pid.partition("__")
        bucket = suffixes_by_base.setdefault(base, set())
        if suffix:
            bucket.add(suffix)
    by_feature: dict[str, list[str]] = {
        base: sorted(found) for base, found in suffixes_by_base.items()
    }

    data = {
        "features": features,
        "partitions": parts,
        "by_feature": by_feature,
    }

    with output_path.open("w", encoding="utf-8") as fh:
        json.dump(data, fh, indent=2)
    print(f"[write] Saved partitions manifest to {output_path}")
|
|
394
|
+
|
|
395
|
+
_run_inspect_job(
|
|
396
|
+
project,
|
|
397
|
+
visuals=visuals,
|
|
398
|
+
progress=progress,
|
|
399
|
+
log_level=log_level,
|
|
400
|
+
label="Inspect partitions",
|
|
401
|
+
section="partitions",
|
|
402
|
+
work=_work,
|
|
403
|
+
)
|
|
182
404
|
|
|
183
405
|
|
|
184
406
|
def expected(
    project: str,
    *,
    output: str | None = None,
    visuals: str | None = None,
    progress: str | None = None,
    log_level: int | None = None,
    workspace=None,
) -> None:
    """Discover complete set of observed full feature IDs and write a list.

    Writes newline-separated ids to `<paths.artifacts>/expected.txt` by default.

    Args:
        project: Path to the project file.
        output: Destination file; overrides the default location.
        visuals: Visuals provider forwarded to the build step and the job runner.
        progress: Progress style forwarded to the build step and the job runner.
        log_level: Log level for the job; the runner helper falls back to
            the root logger's effective level when this is None.
        workspace: Forwarded unchanged to the build step.

    Raises:
        RuntimeError: If the project's artifacts root cannot be resolved.
    """
    _prepare_inspect_build(
        project,
        visuals=visuals,
        progress=progress,
        workspace=workspace,
    )

    def _work(dataset_ctx, progress_style):
        project_path = dataset_ctx.project
        dataset = dataset_ctx.dataset
        feature_cfgs = list(dataset.features or [])
        target_cfgs = list(dataset.targets or [])

        context = dataset_ctx.pipeline_context
        vectors = build_vector_pipeline(
            context,
            feature_cfgs,
            dataset.group_by,
            target_configs=target_cfgs,
        )
        vector_iter = _iter_with_progress(
            vectors,
            progress_style=progress_style,
            label="Processing vectors",
        )
        # Union of every feature and target id seen on any sample.
        ids: set[str] = set()
        for sample in vector_iter:
            ids.update(sample.features.values.keys())
            if sample.targets:
                ids.update(sample.targets.values.keys())

        try:
            default_path = artifacts_root(project_path) / "expected.txt"
        except Exception as e:
            # Chain the original error so the root cause stays attached to
            # the user-facing hint.
            raise RuntimeError(
                f"{e}. Set `paths.artifacts` in your project.yaml to a writable directory."
            ) from e
        output_path = Path(output) if output else default_path
        ensure_parent(output_path)
        with output_path.open("w", encoding="utf-8") as fh:
            for fid in sorted(ids):
                fh.write(f"{fid}\n")
        print(f"[write] Saved expected feature list to {output_path} ({len(ids)} ids)")

    _run_inspect_job(
        project,
        visuals=visuals,
        progress=progress,
        log_level=log_level,
        label="Inspect expected ids",
        section="expected",
        work=_work,
    )
|
|
@@ -1,17 +1,40 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
1
3
|
from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
|
|
2
|
-
from datapipeline.services.
|
|
4
|
+
from datapipeline.services.bootstrap.core import load_streams
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _default_project_path(root_dir: Path) -> Path | None:
|
|
8
|
+
candidate = root_dir / "config" / "project.yaml"
|
|
9
|
+
if candidate.exists():
|
|
10
|
+
return candidate
|
|
11
|
+
default_proj = root_dir / "config" / "datasets" / "default" / "project.yaml"
|
|
12
|
+
if default_proj.exists():
|
|
13
|
+
return default_proj
|
|
14
|
+
datasets_dir = root_dir / "config" / "datasets"
|
|
15
|
+
if datasets_dir.exists():
|
|
16
|
+
for p in sorted(datasets_dir.rglob("project.yaml")):
|
|
17
|
+
if p.is_file():
|
|
18
|
+
return p
|
|
19
|
+
return None
|
|
3
20
|
|
|
4
21
|
|
|
5
22
|
def handle(subcmd: str) -> None:
|
|
6
23
|
root_dir, name, pyproject = pkg_root(None)
|
|
7
24
|
if subcmd == "sources":
|
|
8
25
|
# Discover sources by scanning sources_dir for YAML files
|
|
9
|
-
proj_path = root_dir
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
26
|
+
proj_path = _default_project_path(root_dir)
|
|
27
|
+
if proj_path is None:
|
|
28
|
+
print("[error] No project.yaml found under config/.")
|
|
29
|
+
return
|
|
30
|
+
try:
|
|
31
|
+
streams = load_streams(proj_path)
|
|
32
|
+
except FileNotFoundError as exc:
|
|
33
|
+
print(f"[error] {exc}")
|
|
34
|
+
return
|
|
35
|
+
aliases = sorted(streams.raw.keys())
|
|
36
|
+
for alias in aliases:
|
|
37
|
+
print(alias)
|
|
15
38
|
elif subcmd == "domains":
|
|
16
39
|
base = resolve_base_pkg_dir(root_dir, name)
|
|
17
40
|
dom_dir = base / "domains"
|
|
@@ -1,10 +1,14 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from pathlib import Path
|
|
2
3
|
from datapipeline.services.scaffold.plugin import scaffold_plugin
|
|
3
4
|
|
|
4
5
|
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
5
9
|
def bar(subcmd: str, name: str | None, out: str) -> None:
    """Handle the plugin sub-commands; only ``init`` performs any work.

    For ``init`` a plugin named ``name`` is scaffolded under ``out``. A
    missing name is logged as an error and the process exits with status 2.
    """
    if subcmd != "init":
        return
    if name:
        scaffold_plugin(name, Path(out))
        return
    logger.error("Plugin name is required. Use 'jerry plugin init <name>' or pass -n/--name.")
    raise SystemExit(2)
|