jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +5 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +54 -10
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +76 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.1.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.1.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,104 @@
 import io
 import json
+import logging
+import sys
 from contextlib import redirect_stdout
 from pathlib import Path
+from typing import Iterable, Iterator, TypeVar
 
 from datapipeline.analysis.vector.collector import VectorStatsCollector
+from datapipeline.cli.visuals.runner import run_job
+from datapipeline.config.context import load_dataset_context
 from datapipeline.config.dataset.loader import load_dataset
-from datapipeline.services.bootstrap import bootstrap
 from datapipeline.utils.paths import ensure_parent
 from datapipeline.services.bootstrap import artifacts_root
-from datapipeline.pipeline.context import PipelineContext
 from datapipeline.pipeline.pipelines import build_vector_pipeline
 from datapipeline.pipeline.stages import post_process
+from datapipeline.pipeline.artifacts import StageDemand, required_artifacts_for
+from datapipeline.cli.commands.build import run_build_if_needed
+from tqdm import tqdm
+
+T = TypeVar("T")
+
+
+def _prepare_inspect_build(
+    project: str | Path,
+    *,
+    visuals: str | None,
+    progress: str | None,
+    workspace=None,
+) -> None:
+    project_path = Path(project)
+    dataset = load_dataset(project_path, "vectors")
+    demands = [StageDemand(stage=None)]
+    required = required_artifacts_for(dataset, demands)
+    if not required:
+        return
+    run_build_if_needed(
+        project_path,
+        required_artifacts=required,
+        cli_visuals=visuals,
+        cli_progress=progress,
+        workspace=workspace,
+    )
+
+
+def _iter_with_progress(
+    iterable: Iterable[T],
+    *,
+    progress_style: str | None,
+    label: str,
+) -> Iterator[T]:
+    style = (progress_style or "auto").lower()
+    if style == "auto":
+        # Default to a light spinner unless DEBUG logging is active.
+        style = "bars" if logging.getLogger().isEnabledFor(logging.DEBUG) else "spinner"
+    if style == "off":
+        yield from iterable
+        return
+    bar_kwargs = {
+        "desc": label,
+        "unit": "vec",
+        "dynamic_ncols": True,
+        "mininterval": 0.2,
+        "leave": False,
+        # Avoid noisy multi-line progress when stdout is not a TTY (e.g., logs)
+        "disable": not sys.stderr.isatty(),
+    }
+    if style == "spinner":
+        bar_kwargs["bar_format"] = "{desc} {n_fmt}{unit}"
+    bar = tqdm(iterable, **bar_kwargs)
+    try:
+        for item in bar:
+            yield item
+    finally:
+        bar.close()
+
+
+def _run_inspect_job(
+    project: str,
+    *,
+    visuals: str | None,
+    progress: str | None,
+    log_level: int | None,
+    label: str,
+    section: str,
+    work,
+) -> None:
+    dataset_ctx = load_dataset_context(project)
+    level_value = log_level if log_level is not None else logging.getLogger().getEffectiveLevel()
+    visuals_provider = visuals or "auto"
+    progress_style = progress or "auto"
+
+    run_job(
+        sections=("inspect", section),
+        label=label,
+        visuals=visuals_provider,
+        progress_style=progress_style,
+        level=level_value,
+        runtime=dataset_ctx.runtime,
+        work=lambda: work(dataset_ctx, progress_style),
+    )
 
 
 def report(
@@ -27,7 +115,11 @@ def report(
     quiet: bool = False,
     write_coverage: bool = True,
     apply_postprocess: bool = True,
-    include_targets: bool = False,
+    visuals: str | None = None,
+    progress: str | None = None,
+    log_level: int | None = None,
+    sort: str = "missing",
+    workspace=None,
 ) -> None:
     """Compute a quality report and optionally export coverage JSON and/or a matrix.
 
@@ -36,92 +128,189 @@
     - When matrix != 'none', writes an availability matrix in the requested format.
     """
 
-    project_path = Path(project)
-    dataset = load_dataset(project_path, "vectors")
-    runtime = bootstrap(project_path)
-    context = PipelineContext(runtime)
-
-    feature_cfgs = list(dataset.features or [])
-    if include_targets:
-        feature_cfgs += list(dataset.targets or [])
-    expected_feature_ids = [cfg.id for cfg in feature_cfgs]
-
-    # Resolve matrix format and path
-    matrix_fmt = (fmt or matrix) if matrix in {"csv", "html"} else None
-    if matrix_fmt:
-        filename = "matrix.html" if matrix_fmt == "html" else "matrix.csv"
-    else:
-        filename = None
-    base_artifacts = artifacts_root(project_path)
-    matrix_path = None
-    if matrix_fmt:
-        matrix_path = Path(matrix_output) if matrix_output else (base_artifacts / filename)
-
-    collector = VectorStatsCollector(
-        expected_feature_ids or None,
-        match_partition=match_partition,
-        threshold=threshold,
-        show_matrix=False,
-        matrix_rows=rows,
-        matrix_cols=cols,
-        matrix_output=(str(matrix_path) if matrix_path else None),
-        matrix_format=(matrix_fmt or "html"),
+    _prepare_inspect_build(
+        project,
+        visuals=visuals,
+        progress=progress,
+        workspace=workspace,
     )
+    coverage_path: Path | None = None
 
-    # When applying transforms, let the global postprocess registry provide them (pass None).
-    # When raw, pass an empty list to bypass registry/defaults.
-    vectors = build_vector_pipeline(context, feature_cfgs, dataset.group_by, stage=None)
-    if apply_postprocess:
-        vectors = post_process(context, vectors)  # use global postprocess
-
-    for group_key, vector in vectors:
-        collector.update(group_key, vector.values)
-
-    buffer = io.StringIO()
-    with redirect_stdout(buffer):
-        summary = collector.print_report()
-    if not quiet:
-        report_text = buffer.getvalue()
-        if report_text.strip():
-            print(report_text, end="")
-
-    # Optionally write coverage summary JSON to a path
-    if write_coverage:
-        output_path = Path(output) if output else (base_artifacts / "coverage.json")
-        ensure_parent(output_path)
+    def _work(dataset_ctx, progress_style):
+        project_path = dataset_ctx.project
+        context = dataset_ctx.pipeline_context
+        dataset = dataset_ctx.dataset
 
-        feature_stats = summary.get("feature_stats", [])
-        partition_stats = summary.get("partition_stats", [])
-
-        trimmed = {
-            "total_vectors": summary.get("total_vectors", collector.total_vectors),
-            "empty_vectors": summary.get("empty_vectors", collector.empty_vectors),
-            "threshold": threshold,
-            "match_partition": match_partition,
-            "features": {
-                "keep": summary.get("keep_features", []),
-                "below": summary.get("below_features", []),
-                "coverage": {stat["id"]: stat["coverage"] for stat in feature_stats},
-            },
-            "partitions": {
-                "keep": summary.get("keep_partitions", []),
-                "below": summary.get("below_partitions", []),
-                "keep_suffixes": summary.get("keep_suffixes", []),
-                "below_suffixes": summary.get("below_suffixes", []),
-                "coverage": {stat["id"]: stat["coverage"] for stat in partition_stats},
-            },
-        }
+        feature_cfgs = dataset_ctx.features
+        target_cfgs = dataset_ctx.targets
+        expected_feature_ids = [cfg.id for cfg in feature_cfgs]
 
-        with output_path.open("w", encoding="utf-8") as fh:
-            json.dump(trimmed, fh, indent=2)
-        print(f"[write] Saved coverage summary to {output_path}")
+        matrix_fmt = (fmt or matrix) if matrix in {"csv", "html"} else None
+        if matrix_fmt:
+            filename = "matrix.html" if matrix_fmt == "html" else "matrix.csv"
+        else:
+            filename = None
+        base_artifacts = artifacts_root(project_path)
+        matrix_path = None
+        if matrix_fmt:
+            matrix_path = Path(matrix_output) if matrix_output else (base_artifacts / filename)
+
+        schema_entries = dataset_ctx.pipeline_context.load_schema(payload="features")
+        schema_meta = {entry["id"]: entry for entry in (schema_entries or []) if isinstance(entry.get("id"), str)}
+
+        collector = VectorStatsCollector(
+            expected_feature_ids or None,
+            match_partition=match_partition,
+            schema_meta=schema_meta,
+            threshold=threshold,
+            show_matrix=False,
+            matrix_rows=rows,
+            matrix_cols=cols,
+            matrix_output=(str(matrix_path) if matrix_path else None),
+            matrix_format=(matrix_fmt or "html"),
+        )
+
+        context.window_bounds(rectangular_required=True)
+        vectors = build_vector_pipeline(
+            context,
+            feature_cfgs,
+            dataset.group_by,
+            target_configs=target_cfgs,
+            rectangular=True,
+        )
+        if apply_postprocess:
+            vectors = post_process(context, vectors)
+
+        vector_iter = _iter_with_progress(
+            vectors,
+            progress_style=progress_style,
+            label="Processing vectors",
+        )
+        for sample in vector_iter:
+            merged = dict(sample.features.values)
+            if sample.targets:
+                merged.update(sample.targets.values)
+            collector.update(sample.key, merged)
+
+        buffer = io.StringIO()
+        with redirect_stdout(buffer):
+            summary = collector.print_report(sort_key=sort)
+        if not quiet:
+            report_text = buffer.getvalue()
+            if report_text.strip():
+                print(report_text, end="")
+
+        if write_coverage:
+            output_path = Path(output) if output else (base_artifacts / "coverage.json")
+            ensure_parent(output_path)
+
+            feature_stats = summary.get("feature_stats", [])
+            partition_stats = summary.get("partition_stats", [])
+
+            trimmed = {
+                "total_vectors": summary.get("total_vectors", collector.total_vectors),
+                "empty_vectors": summary.get("empty_vectors", collector.empty_vectors),
+                "threshold": threshold,
+                "match_partition": match_partition,
+                "features": {
+                    "keep": summary.get("keep_features", []),
+                    "below": summary.get("below_features", []),
+                    "coverage": {stat["id"]: stat["coverage"] for stat in feature_stats},
+                    "availability": {
+                        stat["id"]: (
+                            stat["present"] / stat["opportunities"]
+                            if stat.get("opportunities")
+                            else 0
+                        )
+                        for stat in feature_stats
+                    },
+                    "nulls": {stat["id"]: stat.get("nulls", 0) for stat in feature_stats},
+                    "null_rate": {
+                        stat["id"]: (
+                            stat.get("nulls", 0) / stat["opportunities"]
+                            if stat.get("opportunities")
+                            else 0
+                        )
+                        for stat in feature_stats
+                    },
+                    "cadence_nulls": {
+                        stat["id"]: stat.get("cadence_nulls")
+                        for stat in feature_stats
+                        if stat.get("cadence_opportunities")
+                    },
+                    "cadence_opportunities": {
+                        stat["id"]: stat.get("cadence_opportunities")
+                        for stat in feature_stats
+                        if stat.get("cadence_opportunities")
+                    },
+                },
+                "partitions": {
+                    "keep": summary.get("keep_partitions", []),
+                    "below": summary.get("below_partitions", []),
+                    "keep_suffixes": summary.get("keep_suffixes", []),
+                    "below_suffixes": summary.get("below_suffixes", []),
+                    "keep_values": summary.get("keep_partition_values", []),
+                    "below_values": summary.get("below_partition_values", []),
+                    "coverage": {stat["id"]: stat["coverage"] for stat in partition_stats},
+                    "availability": {
+                        stat["id"]: (
+                            stat["present"] / stat["opportunities"]
+                            if stat.get("opportunities")
+                            else 0
+                        )
+                        for stat in partition_stats
+                    },
+                    "nulls": {
+                        stat["id"]: stat.get("nulls", 0) for stat in partition_stats
+                    },
+                    "null_rate": {
+                        stat["id"]: (
+                            stat.get("nulls", 0) / stat["opportunities"]
+                            if stat.get("opportunities")
+                            else 0
+                        )
+                        for stat in partition_stats
+                    },
+                    "cadence_nulls": {
+                        stat["id"]: stat.get("cadence_nulls")
+                        for stat in partition_stats
+                        if stat.get("cadence_opportunities")
+                    },
+                    "cadence_opportunities": {
+                        stat["id"]: stat.get("cadence_opportunities")
+                        for stat in partition_stats
+                        if stat.get("cadence_opportunities")
+                    },
+                },
+            }
+
+            with output_path.open("w", encoding="utf-8") as fh:
+                json.dump(trimmed, fh, indent=2)
+            print(f"[write] Saved coverage summary to {output_path}")
+            coverage_path = output_path
+
+    _run_inspect_job(
+        project,
+        visuals=visuals,
+        progress=progress,
+        log_level=log_level,
+        label="Inspect report",
+        section="report",
+        work=_work,
+    )
+
+    if write_coverage and coverage_path:
+        print(f"[inspect] Coverage summary available at {coverage_path}")
 
 
 def partitions(
     project: str,
     *,
     output: str | None = None,
-    include_targets: bool = False,
+    visuals: str | None = None,
+    progress: str | None = None,
+    log_level: int | None = None,
+    workspace=None,
 ) -> None:
     """Discover observed partitions and write a manifest JSON.
 
@@ -131,90 +320,153 @@ def partitions(
     - by_feature: mapping base id -> list of suffixes (empty when none)
     """
 
-    project_path = Path(project)
-    dataset = load_dataset(project_path, "vectors")
-    runtime = bootstrap(project_path)
-
-    feature_cfgs = list(dataset.features or [])
-    if include_targets:
-        feature_cfgs += list(dataset.targets or [])
-    expected_feature_ids = [cfg.id for cfg in feature_cfgs]
-    collector = VectorStatsCollector(
-        expected_feature_ids or None,
-        match_partition="full",
-        threshold=None,
-        show_matrix=False,
+    _prepare_inspect_build(
+        project,
+        visuals=visuals,
+        progress=progress,
+        workspace=workspace,
     )
 
-    context = PipelineContext(runtime)
-    vectors = build_vector_pipeline(context, feature_cfgs, dataset.group_by, stage=None)
-    vectors = post_process(context, vectors)  # apply global postprocess
-    for group_key, vector in vectors:
-        collector.update(group_key, vector.values)
-
-    base_artifacts = artifacts_root(project_path)
-    output_path = Path(output) if output else (base_artifacts / "partitions.json")
-    ensure_parent(output_path)
-
-    parts = sorted(collector.discovered_partitions)
-    features = sorted({pid.split("__", 1)[0] for pid in parts})
-    by_feature: dict[str, list[str]] = {}
-    for pid in parts:
-        if "__" in pid:
-            base, suffix = pid.split("__", 1)
-        else:
-            base, suffix = pid, ""
-        by_feature.setdefault(base, [])
-        if suffix and suffix not in by_feature[base]:
-            by_feature[base].append(suffix)
-    for k in list(by_feature.keys()):
-        by_feature[k] = sorted(by_feature[k])
-
-    data = {
-        "features": features,
-        "partitions": parts,
-        "by_feature": by_feature,
-    }
+    def _work(dataset_ctx, progress_style):
+        project_path = dataset_ctx.project
 
-    with output_path.open("w", encoding="utf-8") as fh:
-        json.dump(data, fh, indent=2)
-    print(f"[write] Saved partitions manifest to {output_path}")
+        dataset = dataset_ctx.dataset
+        feature_cfgs = list(dataset.features or [])
+        target_cfgs = list(dataset.targets or [])
+        expected_feature_ids = [cfg.id for cfg in feature_cfgs]
+
+        base_artifacts = artifacts_root(project_path)
+        output_path = Path(output) if output else (base_artifacts / "partitions.json")
+
+        collector = VectorStatsCollector(
+            expected_feature_ids or None,
+            match_partition="full",
+            threshold=None,
+            show_matrix=False,
+        )
+
+        context = dataset_ctx.pipeline_context
+        context.window_bounds(rectangular_required=True)
+        vectors = build_vector_pipeline(
+            context,
+            feature_cfgs,
+            dataset.group_by,
+            target_configs=target_cfgs,
+            rectangular=True,
+        )
+        vectors = post_process(context, vectors)
+        vector_iter = _iter_with_progress(
+            vectors,
+            progress_style=progress_style,
+            label="Processing vectors",
+        )
+        for sample in vector_iter:
+            merged = dict(sample.features.values)
+            if sample.targets:
+                merged.update(sample.targets.values)
+            collector.update(sample.key, merged)
+
+        ensure_parent(output_path)
+
+        parts = sorted(collector.discovered_partitions)
+        features = sorted({pid.split("__", 1)[0] for pid in parts})
+        by_feature: dict[str, list[str]] = {}
+        for pid in parts:
+            if "__" in pid:
+                base, suffix = pid.split("__", 1)
+            else:
+                base, suffix = pid, ""
+            by_feature.setdefault(base, [])
+            if suffix and suffix not in by_feature[base]:
+                by_feature[base].append(suffix)
+        for k in list(by_feature.keys()):
+            by_feature[k] = sorted(by_feature[k])
+
+        data = {
+            "features": features,
+            "partitions": parts,
+            "by_feature": by_feature,
+        }
+
+        with output_path.open("w", encoding="utf-8") as fh:
+            json.dump(data, fh, indent=2)
+        print(f"[write] Saved partitions manifest to {output_path}")
+
+    _run_inspect_job(
+        project,
+        visuals=visuals,
+        progress=progress,
+        log_level=log_level,
+        label="Inspect partitions",
+        section="partitions",
+        work=_work,
+    )
 
 
 def expected(
     project: str,
     *,
     output: str | None = None,
-    include_targets: bool = False,
+    visuals: str | None = None,
+    progress: str | None = None,
+    log_level: int | None = None,
+    workspace=None,
 ) -> None:
     """Discover complete set of observed full feature IDs and write a list.
 
     Writes newline-separated ids to `<paths.artifacts>/expected.txt` by default.
     """
 
-    project_path = Path(project)
-    dataset = load_dataset(project_path, "vectors")
-    runtime = bootstrap(project_path)
-
-    feature_cfgs = list(dataset.features or [])
-    if include_targets:
-        feature_cfgs += list(dataset.targets or [])
-
-    context = PipelineContext(runtime)
-    vectors = build_vector_pipeline(context, feature_cfgs, dataset.group_by, stage=None)
-    ids: set[str] = set()
-    for _, vector in vectors:
-        ids.update(vector.values.keys())
+    _prepare_inspect_build(
+        project,
+        visuals=visuals,
+        progress=progress,
+        workspace=workspace,
+    )
 
-    try:
-        default_path = artifacts_root(project_path) / "expected.txt"
-    except Exception as e:
-        raise RuntimeError(
-            f"{e}. Set `paths.artifacts` in your project.yaml to a writable directory."
+    def _work(dataset_ctx, progress_style):
+        project_path = dataset_ctx.project
+        dataset = dataset_ctx.dataset
+        feature_cfgs = list(dataset.features or [])
+        target_cfgs = list(dataset.targets or [])
+
+        context = dataset_ctx.pipeline_context
+        vectors = build_vector_pipeline(
+            context,
+            feature_cfgs,
+            dataset.group_by,
+            target_configs=target_cfgs,
+        )
+        vector_iter = _iter_with_progress(
+            vectors,
+            progress_style=progress_style,
+            label="Processing vectors",
         )
-    output_path = Path(output) if output else default_path
-    ensure_parent(output_path)
-    with output_path.open("w", encoding="utf-8") as fh:
-        for fid in sorted(ids):
-            fh.write(f"{fid}\n")
-    print(f"[write] Saved expected feature list to {output_path} ({len(ids)} ids)")
+        ids: set[str] = set()
+        for sample in vector_iter:
+            ids.update(sample.features.values.keys())
+            if sample.targets:
+                ids.update(sample.targets.values.keys())
+
+        try:
+            default_path = artifacts_root(project_path) / "expected.txt"
+        except Exception as e:
+            raise RuntimeError(
+                f"{e}. Set `paths.artifacts` in your project.yaml to a writable directory."
+            )
+        output_path = Path(output) if output else default_path
+        ensure_parent(output_path)
+        with output_path.open("w", encoding="utf-8") as fh:
+            for fid in sorted(ids):
+                fh.write(f"{fid}\n")
+        print(f"[write] Saved expected feature list to {output_path} ({len(ids)} ids)")
+
+    _run_inspect_job(
+        project,
+        visuals=visuals,
+        progress=progress,
+        log_level=log_level,
+        label="Inspect expected ids",
+        section="expected",
+        work=_work,
+    )
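The hunks above, whose +401/-149 totals line up with the datapipeline/cli/commands/inspect.py entry in the file list (the per-file header is not preserved in this extract), move the body of each inspect command into a local _work closure that _run_inspect_job executes through the visuals runner, after _prepare_inspect_build has triggered any required artifact builds. The following is a hedged sketch of calling the refactored report() directly; the import path is an inference from the file list and the argument values are placeholders, not documented defaults.

# Sketch only: exercises the new report() keyword arguments visible in the diff above.
# Module path inferred from the file list; not stated in the diff itself.
from datapipeline.cli.commands.inspect import report

report(
    "path/to/project",    # project root handed to load_dataset_context()
    visuals="auto",       # resolved by _run_inspect_job -> run_job
    progress="spinner",   # forwarded to _iter_with_progress (tqdm-backed)
    sort="missing",       # becomes collector.print_report(sort_key=...)
    write_coverage=True,  # writes coverage.json under artifacts_root(project)
    quiet=False,          # echo the captured report text to stdout
)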
@@ -1,17 +1,40 @@
+from pathlib import Path
+
 from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
-from datapipeline.services.project_paths import sources_dir as sources_dir_from_project
+from datapipeline.services.bootstrap.core import load_streams
+
+
+def _default_project_path(root_dir: Path) -> Path | None:
+    candidate = root_dir / "config" / "project.yaml"
+    if candidate.exists():
+        return candidate
+    default_proj = root_dir / "config" / "datasets" / "default" / "project.yaml"
+    if default_proj.exists():
+        return default_proj
+    datasets_dir = root_dir / "config" / "datasets"
+    if datasets_dir.exists():
+        for p in sorted(datasets_dir.rglob("project.yaml")):
+            if p.is_file():
+                return p
+    return None
 
 
 def handle(subcmd: str) -> None:
     root_dir, name, pyproject = pkg_root(None)
     if subcmd == "sources":
         # Discover sources by scanning sources_dir for YAML files
-        proj_path = root_dir / "config" / "project.yaml"
-        sources_dir = sources_dir_from_project(proj_path)
-        if sources_dir.exists():
-            aliases = sorted(p.stem for p in sources_dir.glob("*.y*ml"))
-            for a in aliases:
-                print(a)
+        proj_path = _default_project_path(root_dir)
+        if proj_path is None:
+            print("[error] No project.yaml found under config/.")
+            return
+        try:
+            streams = load_streams(proj_path)
+        except FileNotFoundError as exc:
+            print(f"[error] {exc}")
+            return
+        aliases = sorted(streams.raw.keys())
+        for alias in aliases:
+            print(alias)
 
     elif subcmd == "domains":
  dom_dir = base / "domains"
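This hunk, whose +30/-7 totals match the datapipeline/cli/commands/list_.py entry, replaces the old filename scan of sources_dir with stream aliases loaded via load_streams, and adds a fallback search for project.yaml across the old and new config layouts. The snippet below is an illustration only: it reimplements that lookup order as a standalone sketch instead of importing the packaged helper.

# Standalone illustration of the project.yaml lookup order used above;
# the directory names are the ones spelled out in _default_project_path.
from pathlib import Path


def find_project_yaml(root_dir: Path) -> Path | None:
    flat = root_dir / "config" / "project.yaml"
    if flat.exists():
        return flat  # flat config/ layout
    legacy = root_dir / "config" / "datasets" / "default" / "project.yaml"
    if legacy.exists():
        return legacy  # config/datasets/default layout removed from the skeleton in this release
    datasets = root_dir / "config" / "datasets"
    if datasets.exists():
        # Otherwise take the first project.yaml found under any dataset, sorted.
        for candidate in sorted(datasets.rglob("project.yaml")):
            if candidate.is_file():
                return candidate
    return None


if __name__ == "__main__":
    print(find_project_yaml(Path.cwd()))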
@@ -1,10 +1,14 @@
+import logging
 from pathlib import Path
 from datapipeline.services.scaffold.plugin import scaffold_plugin
 
 
+logger = logging.getLogger(__name__)
+
+
 def bar(subcmd: str, name: str | None, out: str) -> None:
     if subcmd == "init":
         if not name:
-            print("[error] --name is required for bar init")
+            logger.error("Plugin name is required. Use 'jerry plugin init <name>' or pass -n/--name.")
             raise SystemExit(2)
         scaffold_plugin(name, Path(out))
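The scaffold error above now goes through the module logger instead of print, so how it is rendered depends on the logging configuration of whatever invokes the command. A small hedged sketch of the error path follows; the import path is inferred from the datapipeline/cli/commands/plugin.py entry (+5/-1) and is not stated in the diff itself.

# Sketch only: drive bar() down the missing-name branch shown above.
import logging

from datapipeline.cli.commands.plugin import bar  # inferred path, see note above

logging.basicConfig(level=logging.INFO)  # give the root logger a handler and format

try:
    bar("init", None, ".")  # no name: logger.error(...) then SystemExit(2)
except SystemExit as exc:
    print(f"scaffold aborted with exit code {exc.code}")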