jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +1 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +3 -3
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +74 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.0.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.0.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0

datapipeline/analysis/vector/collector.py

@@ -27,6 +27,7 @@ class VectorStatsCollector:
         expected_feature_ids: Iterable[str] | None = None,
         *,
         match_partition: Literal["base", "full"] = "base",
+        schema_meta: dict[str, dict[str, Any]] | None = None,
         sample_limit: int = 5,
         threshold: float | None = 0.95,
         show_matrix: bool = False,
@@ -48,6 +49,7 @@ class VectorStatsCollector:
             if expected_feature_ids
             else set()
         )
+        self.schema_meta = schema_meta or {}

         self.discovered_features: set[str] = set()
         self.discovered_partitions: set[str] = set()
@@ -55,9 +57,14 @@ class VectorStatsCollector:
         self.total_vectors = 0
         self.empty_vectors = 0

-        self.present_counts = Counter()
-        self.present_counts_partitions = Counter()
+        self.seen_counts = Counter()
+        self.null_counts_features = Counter()
+        self.seen_counts_partitions = Counter()
         self.null_counts_partitions = Counter()
+        self.cadence_null_counts = Counter()
+        self.cadence_opportunities = Counter()
+        self.cadence_null_counts_partitions = Counter()
+        self.cadence_opportunities_partitions = Counter()

         self.missing_samples = defaultdict(list)
         self.missing_partition_samples = defaultdict(list)
@@ -107,6 +114,8 @@ class VectorStatsCollector:

         present_normalized: set[str] = set()
         seen_partitions: set[str] = set()
+        feature_seen_present: dict[str, bool] = {}
+        feature_seen_null: dict[str, bool] = {}
         for partition_id in present_partitions:
             normalized = self._normalize(partition_id)
             present_normalized.add(normalized)
@@ -122,12 +131,14 @@ class VectorStatsCollector:

             # Capture sub-status for list-valued entries
             sub: list[str] | None = None
+            has_present_element = False
             if isinstance(value, list):
                 sub = []
                 for v in value:
                     if v is None or (isinstance(v, float) and v != v):
                         sub.append("null")
                     else:
+                        has_present_element = True
                         sub.append("present")
                 if sub:
                     self.group_partition_sub[group_key][partition_id] = sub
@@ -135,10 +146,10 @@ class VectorStatsCollector:
                     self.group_feature_sub[group_key].setdefault(
                         normalized, sub)

-            is_null = _is_missing_value(value)
+            is_null = (not has_present_element) if isinstance(value, list) else _is_missing_value(value)
             if is_null:
-                status_features[normalized] = "null"
                 status_partitions[partition_id] = "null"
+                feature_seen_null[normalized] = True
                 self.null_counts_partitions[partition_id] += 1
                 if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
                     self.missing_partition_samples[partition_id].append(
@@ -147,14 +158,30 @@ class VectorStatsCollector:
                 if len(self.missing_samples[normalized]) < self.sample_limit:
                     self.missing_samples[normalized].append(
                         (group_key, "null"))
+            else:
+                feature_seen_present[normalized] = True
+
+            # Cadence-aware null accounting (per schema metadata)
+            meta = self.schema_meta.get(normalized) or self.schema_meta.get(partition_id)
+            expected_len = self._cadence_expected_length(meta) if meta else None
+            if expected_len is not None:
+                self._update_cadence(normalized, expected_len, value, partitions=False)
+                self._update_cadence(partition_id, expected_len, value, partitions=True)

         for normalized in present_normalized:
-            if status_features.get(normalized) == "present":
-                self.present_counts[normalized] += 1
+            if feature_seen_present.get(normalized):
+                status_features[normalized] = "present"
+                # Drop stale null samples when the feature is ultimately present
+                self.missing_samples.pop(normalized, None)
+            elif feature_seen_null.get(normalized):
+                status_features[normalized] = "null"
+                self.null_counts_features[normalized] += 1
+            # Count availability (seen) regardless of value
+            self.seen_counts[normalized] += 1

         for partition_id in seen_partitions:
-            if status_partitions.get(partition_id) == "present":
-                self.present_counts_partitions[partition_id] += 1
+            # Availability regardless of value
+            self.seen_counts_partitions[partition_id] += 1

         tracked_features = (
             self.expected_features if self.expected_features else self.discovered_features
@@ -186,20 +213,16 @@ class VectorStatsCollector:
         self, identifier: str, *, partitions: bool = False
     ) -> tuple[int, int, int]:
         present = (
-            self.present_counts_partitions[identifier]
+            self.seen_counts_partitions[identifier]
             if partitions
-            else self.present_counts[identifier]
+            else self.seen_counts[identifier]
         )
         opportunities = self.total_vectors
         missing = max(opportunities - present, 0)
         return present, missing, opportunities

     def _feature_null_count(self, feature_id: str) -> int:
-        total = 0
-        for partition_id, count in self.null_counts_partitions.items():
-            if self._normalize(partition_id) == feature_id:
-                total += count
-        return total
+        return self.null_counts_features.get(feature_id, 0)

     @staticmethod
     def _format_group_key(group_key: Hashable) -> str:
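
With the switch from present_counts to seen_counts, "present" in _coverage now counts vectors where the feature appeared at all (even if every element was null), while null hits are tallied separately in null_counts_features. A small worked example of the arithmetic, with illustrative numbers:

    total_vectors = 10   # opportunities
    seen = 8             # vectors in which the feature appeared (any value)
    nulls = 3            # of those, vectors where every element was null/invalid

    present, missing = seen, max(total_vectors - seen, 0)   # 8, 2
    coverage = present / total_vectors                      # 0.8, printed as 80.0%
    # The report line would read: present 8/10 (80.0%) | absent 2 | null 3
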
@@ -230,6 +253,86 @@ class VectorStatsCollector:
     def _partition_suffix(partition_id: str) -> str:
         return partition_id.split("__", 1)[1] if "__" in partition_id else partition_id

+    @staticmethod
+    def _partition_values(partition_id: str) -> list[str]:
+        """Return partition values without base id or field names."""
+        suffix = partition_id.split("__", 1)[1] if "__" in partition_id else partition_id
+        if not suffix:
+            return []
+
+        def _components(raw: str) -> list[str]:
+            if raw.startswith("@"):
+                parts = raw.split("_@")
+                return [parts[0]] + [f"@{rest}" for rest in parts[1:]]
+            return [raw]
+
+        values: list[str] = []
+        for component in _components(suffix):
+            field_value = component.lstrip("@")
+            _, _, value = field_value.partition(":")
+            candidate = value or field_value
+            # If no explicit value delimiter, drop leading field name-ish prefixes
+            if not value and "_" in candidate:
+                candidate = candidate.rsplit("_", 1)[-1]
+            values.append(candidate)
+        return values
+
+    @classmethod
+    def _partition_value(cls, partition_id: str) -> str:
+        values = cls._partition_values(partition_id)
+        if not values:
+            return ""
+        return values[0] if len(values) == 1 else "_".join(values)
+
+    @staticmethod
+    def _expected_lengths(meta: dict[str, Any]) -> list[int]:
+        cadence = meta.get("cadence")
+        if isinstance(cadence, dict):
+            target = cadence.get("target")
+            if isinstance(target, (int, float)) and target > 0:
+                return [int(target)]
+        modes = meta.get("list_length", {}).get("modes")
+        if isinstance(modes, (list, tuple)) and modes:
+            ints = [int(m) for m in modes if isinstance(m, (int, float))]
+            if ints:
+                return sorted(ints)
+        expected = meta.get("expected_length")
+        if isinstance(expected, (int, float)):
+            return [int(expected)]
+        max_len = meta.get("list_length", {}).get("max")
+        if isinstance(max_len, (int, float)) and max_len > 0:
+            return [int(max_len)]
+        return []
+
+    @staticmethod
+    def _cadence_expected_length(meta: dict[str, Any]) -> int | None:
+        lengths = VectorStatsCollector._expected_lengths(meta)
+        return max(lengths) if lengths else None
+
+    def _update_cadence(
+        self, identifier: str, expected_len: int | None, value: Any, *, partitions: bool
+    ) -> None:
+        if expected_len is None:
+            return
+        counter_nulls = (
+            self.cadence_null_counts_partitions if partitions else self.cadence_null_counts
+        )
+        counter_opps = (
+            self.cadence_opportunities_partitions
+            if partitions
+            else self.cadence_opportunities
+        )
+
+        present = 0
+        if isinstance(value, list):
+            trimmed = value[:expected_len]
+            present = sum(0 if _is_missing_value(v) else 1 for v in trimmed)
+        else:
+            present = 0 if _is_missing_value(value) else 1
+        missing = max(expected_len - present, 0)
+        counter_opps[identifier] += expected_len
+        counter_nulls[identifier] += missing
+
     def _render_matrix(
         self,
         *,
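
A minimal sketch of how the cadence counters above accumulate for a single list-valued entry, assuming schema metadata that declares a cadence target (the metadata shape mirrors the keys _expected_lengths reads; the partition id, the values, and the is_missing helper are illustrative stand-ins):

    from collections import Counter
    import math

    meta = {"cadence": {"target": 4}}            # hypothetical schema_meta entry
    expected_len = int(meta["cadence"]["target"])

    cadence_nulls = Counter()
    cadence_opportunities = Counter()

    def is_missing(v):
        # stand-in for _is_missing_value: None or NaN counts as missing
        return v is None or (isinstance(v, float) and math.isnan(v))

    value = [1.0, None, 3.0]                     # one null element, one short of cadence
    present = sum(0 if is_missing(v) else 1 for v in value[:expected_len])   # 2
    cadence_opportunities["ticks__@hour:12"] += expected_len                 # +4 opportunities
    cadence_nulls["ticks__@hour:12"] += max(expected_len - present, 0)       # +2 null/invalid elements
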
@@ -246,10 +349,10 @@ class VectorStatsCollector:
             column_width=column_width,
         )

-    def print_report(self) -> dict[str, Any]:
+    def print_report(self, *, sort_key: str = "missing") -> dict[str, Any]:
         from .report import print_report as _print_report

-        return _print_report(self)
+        return _print_report(self, sort_key=sort_key)

     def _export_matrix_data(self) -> None:
         from .matrix import export_matrix_data
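
A rough usage sketch for the new sort_key pass-through; the import path follows the module location, but the constructor call is illustrative and assumes any parameters not shown in the hunks above can be left at their defaults:

    from datapipeline.analysis.vector.collector import VectorStatsCollector

    collector = VectorStatsCollector(
        expected_feature_ids=["time.ticks.hour_sin", "time.ticks.linear"],
        schema_meta={"time.ticks.hour_sin": {"cadence": {"target": 4}}},
    )
    # ... feed vectors into the collector ...
    summary = collector.print_report(sort_key="nulls")   # default remains "missing"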

datapipeline/analysis/vector/matrix.py

@@ -81,11 +81,11 @@ def export_matrix_data(collector: VectorStatsCollector) -> None:
             _write_matrix_html(collector, path)
         else:
             _write_matrix_csv(collector, path)
-        logger.info("\n[write] Saved availability matrix to %s", path)
+        message = f"[write] Saved availability matrix to {path}"
+        logger.info("\n%s", message)
     except OSError as exc:
-        logger.warning(
-            "\n[warn] Failed to write availability matrix to %s: %s", path, exc
-        )
+        warning = f"[warn] Failed to write availability matrix to {path}: {exc}"
+        logger.warning("\n%s", warning)


 def _write_matrix_csv(collector: VectorStatsCollector, path: Path) -> None:
@@ -341,10 +341,11 @@ def _write_matrix_html(collector: VectorStatsCollector, path: Path) -> None:
     .heatmap th,
     .heatmap td {
         border: 1px solid #d0d0d0;
-        padding: 4px 6px;
+        padding: 0 5px;
         text-align: center;
         font-size: 13px;
         line-height: 1.2;
+        vertical-align: middle;
     }
     .heatmap thead th {
         position: sticky;
@@ -369,9 +370,33 @@ def _write_matrix_html(collector: VectorStatsCollector, path: Path) -> None:
     .status-null { background: #f1c40f; color: #000; font-weight: bold; }
     .status-absent { background: #e74c3c; color: #fff; font-weight: bold; }
     .status-missing { background: #bdc3c7; color: #000; font-weight: bold; }
-    .sub { display: flex; gap: 1px; height: 12px; }
-    .sub span { flex: 1; display: block; }
-    .sub span::after { content: ""; display: block; width: 100%; height: 100%; }
+    .sub {
+        display: flex;
+        gap: 5px;
+        height: calc(100% - 2px);
+        min-height: 24px;
+        padding: 0 2px;
+        margin: 1px 0;
+        align-items: stretch;
+        justify-content: center;
+    }
+    .sub span {
+        flex: 1;
+        display: block;
+        position: relative;
+        border-radius: 4px;
+        overflow: hidden;
+        border: 1px solid rgba(0,0,0,0.15);
+        background: #fff;
+        min-width: 12px;
+    }
+    .sub span::after {
+        content: "";
+        position: absolute;
+        inset: 0;
+        display: block;
+        border-radius: 4px;
+    }
     .sub .status-present::after { background: #2ecc71; }
     .sub .status-null::after { background: #f1c40f; }
     .sub .status-absent::after { background: #e74c3c; }

datapipeline/analysis/vector/report.py

@@ -1,9 +1,8 @@
 from __future__ import annotations
-from typing import Any
+from typing import Any, Literal, TYPE_CHECKING
 import logging

 from .matrix import export_matrix_data, render_matrix
-from typing import TYPE_CHECKING

 if TYPE_CHECKING:
     from .collector import VectorStatsCollector
@@ -12,7 +11,11 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)


-def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
+def print_report(
+    collector: VectorStatsCollector,
+    *,
+    sort_key: Literal["missing", "nulls"] = "missing",
+) -> dict[str, Any]:
     tracked_features = (
         collector.expected_features
         if collector.expected_features
@@ -62,15 +65,25 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
         return summary

     feature_stats = []
-    logger.info("\n-> Feature coverage (sorted by missing count):")
+    sort_label = "null" if sort_key == "nulls" else "missing"
+    logger.info("\n-> Feature coverage (sorted by %s count):", sort_label)
+    if sort_key == "nulls":
+        def _feature_sort(fid):
+            return collector._feature_null_count(fid)
+    else:
+        def _feature_sort(fid):
+            return collector._coverage(fid)[1]
+
     for feature_id in sorted(
         tracked_features,
-        key=lambda fid: collector._coverage(fid)[1],
+        key=_feature_sort,
        reverse=True,
     ):
         present, missing, opportunities = collector._coverage(feature_id)
         coverage = present / opportunities if opportunities else 0.0
         nulls = collector._feature_null_count(feature_id)
+        cadence_nulls = collector.cadence_null_counts.get(feature_id, 0)
+        cadence_opps = collector.cadence_opportunities.get(feature_id, 0)
         raw_samples = collector.missing_samples.get(feature_id, [])
         sample_note = collector._format_samples(raw_samples)
         samples = [
@@ -82,7 +95,7 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
         ]
         line = (
             f" - {feature_id}: present {present}/{opportunities}"
-            f" ({coverage:.1%}) | missing {missing} | null {nulls}"
+            f" ({coverage:.1%}) | absent {missing} | null {nulls}"
         )
         if sample_note:
             line += f"; samples: {sample_note}"
@@ -93,6 +106,11 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
                 "present": present,
                 "missing": missing,
                 "nulls": nulls,
+                "cadence_nulls": cadence_nulls,
+                "cadence_opportunities": cadence_opps,
+                "cadence_null_fraction": (
+                    cadence_nulls / cadence_opps if cadence_opps else None
+                ),
                 "coverage": coverage,
                 "opportunities": opportunities,
                 "samples": samples,
@@ -109,6 +127,12 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
         )
         coverage = present / opportunities if opportunities else 0.0
         nulls = collector.null_counts_partitions.get(partition_id, 0)
+        cadence_nulls = collector.cadence_null_counts_partitions.get(
+            partition_id, 0
+        )
+        cadence_opps = collector.cadence_opportunities_partitions.get(
+            partition_id, 0
+        )
         raw_samples = collector.missing_partition_samples.get(
             partition_id, [])
         partition_stats.append(
@@ -118,6 +142,11 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
                 "present": present,
                 "missing": missing,
                 "nulls": nulls,
+                "cadence_nulls": cadence_nulls,
+                "cadence_opportunities": cadence_opps,
+                "cadence_null_fraction": (
+                    cadence_nulls / cadence_opps if cadence_opps else None
+                ),
                 "coverage": coverage,
                 "opportunities": opportunities,
                 "samples": [
@@ -130,13 +159,16 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
             }
         )

-    logger.info("\n-> Partition details (top by missing count):")
+    sort_label_partitions = "null" if sort_key == "nulls" else "absent"
+    logger.info("\n-> Partition details (top by %s count):", sort_label_partitions)
+    def _partition_sort(stats):
+        return stats["nulls"] if sort_key == "nulls" else stats["missing"]
     for stats in sorted(
-        partition_stats, key=lambda s: s["missing"], reverse=True
+        partition_stats, key=_partition_sort, reverse=True
     )[:20]:
         line = (
             f" - {stats['id']} (base: {stats['base']}): present {stats['present']}/{stats['opportunities']}"
-            f" ({stats['coverage']:.1%}) | missing {stats['missing']} | null/invalid {stats['nulls']}"
+            f" ({stats['coverage']:.1%}) | absent {stats['missing']} | null/invalid {stats['nulls']}"
         )
         logger.info(line)

@@ -148,6 +180,10 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
     above_partitions: list[str] = []
     below_suffixes: list[str] = []
     above_suffixes: list[str] = []
+    below_partition_values: list[str] = []
+    above_partition_values: list[str] = []
+    below_partitions_cadence: list[str] = []
+    above_partitions_cadence: list[str] = []

     if collector.threshold is not None:
         thr = collector.threshold
@@ -157,17 +193,6 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
         above_features = [
             stats["id"] for stats in feature_stats if stats["coverage"] >= thr
         ]
-        logger.warning(
-            "\n[low] Features below %.0f%% coverage:\n below_features = %s",
-            thr * 100,
-            below_features,
-        )
-        logger.info(
-            "[high] Features at/above %.0f%% coverage:\n keep_features = %s",
-            thr * 100,
-            above_features,
-        )
-
         if partition_stats:
             below_partitions = [
                 stats["id"] for stats in partition_stats if stats["coverage"] < thr
@@ -185,18 +210,26 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
             ]
             if not above_partitions:
                 above_suffixes = []
-            logger.warning(
-                "\n[low] Partitions below %.0f%% coverage:\n below_partitions = %s",
-                thr * 100,
-                below_partitions,
-            )
-            logger.warning(" below_suffixes = %s", below_suffixes)
-            logger.info(
-                "[high] Partitions at/above %.0f%% coverage:\n keep_partitions = %s",
-                thr * 100,
-                above_partitions,
-            )
-            logger.info(" keep_suffixes = %s", above_suffixes)
+            below_partition_values = [
+                v
+                for pid in below_partitions
+                if "__" in pid and (v := collector._partition_value(pid))
+            ]
+            above_partition_values = [
+                v
+                for pid in above_partitions
+                if "__" in pid and (v := collector._partition_value(pid))
+            ]
+            below_partitions_cadence = [
+                stats["id"]
+                for stats in partition_stats
+                if (stats.get("cadence_null_fraction") or 0) > (1 - thr)
+            ]
+            above_partitions_cadence = [
+                stats["id"]
+                for stats in partition_stats
+                if (stats.get("cadence_null_fraction") or 0) <= (1 - thr)
+            ]

     summary.update(
         {
@@ -216,6 +249,21 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
                 if partition_stats
                 else []
             ),
+            "below_partition_values": below_partition_values,
+            "keep_partition_values": above_partition_values
+            or (
+                [
+                    collector._partition_value(stats["id"])
+                    for stats in partition_stats
+                    if "__" in stats["id"]
+                    and collector._partition_value(stats["id"])
+                ]
+                if partition_stats
+                else []
+            ),
+            "below_partitions_cadence": below_partitions_cadence,
+            "keep_partitions_cadence": above_partitions_cadence
+            or [stats["id"] for stats in partition_stats],
         }
     )

@@ -310,6 +358,88 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
     if collector.matrix_output:
         export_matrix_data(collector)

+    # Record-level (cadence) gaps for list features/partitions
+    partition_cadence = [
+        stats
+        for stats in partition_stats
+        if stats.get("cadence_opportunities")
+    ]
+    if partition_cadence:
+        logger.info("\n-> Record-level gaps (expected cadence; null/invalid elements):")
+        total_missing = sum(s.get("cadence_nulls", 0) or 0 for s in partition_cadence)
+        total_opps = sum(s.get("cadence_opportunities", 0) or 0 for s in partition_cadence)
+        if total_opps:
+            logger.info(
+                " Total null/invalid elements: %d/%d (%.1f%%)",
+                total_missing,
+                total_opps,
+                (total_missing / total_opps) * 100,
+            )
+        logger.info(" Top partitions by null/invalid elements:")
+        for stats in sorted(
+            partition_cadence,
+            key=lambda s: (s.get("cadence_nulls") or 0),
+            reverse=True,
+        )[:20]:
+            missing_elems = stats.get("cadence_nulls") or 0
+            opps = stats.get("cadence_opportunities") or 0
+            frac = (missing_elems / opps) if opps else 0
+            logger.info(
+                " - %s (base: %s): vectors present %d/%d | absent %d | cadence null/invalid %d/%d elements (%.1f%%)",
+                stats["id"],
+                stats.get("base"),
+                stats.get("present", 0),
+                stats.get("opportunities", 0),
+                stats.get("missing", 0),
+                missing_elems,
+                opps,
+                frac * 100,
+            )
+
+    if collector.threshold is not None:
+        thr = collector.threshold
+        logger.warning(
+            "\n[low] Features below %.0f%% coverage:\n below_features = %s",
+            thr * 100,
+            below_features,
+        )
+        logger.info(
+            "[high] Features at/above %.0f%% coverage:\n keep_features = %s",
+            thr * 100,
+            above_features,
+        )
+        if partition_stats:
+            logger.warning(
+                "\n[low] Partitions below %.0f%% coverage:\n below_partitions = %s",
+                thr * 100,
+                below_partitions,
+            )
+            logger.warning(" below_suffixes = %s", below_suffixes)
+            if below_partition_values:
+                logger.warning(" below_partition_values = %s",
+                               below_partition_values)
+            logger.info(
+                "[high] Partitions at/above %.0f%% coverage:\n keep_partitions = %s",
+                thr * 100,
+                above_partitions,
+            )
+            logger.info(" keep_suffixes = %s", above_suffixes)
+            if above_partition_values:
+                logger.info(
+                    " keep_partition_values = %s", above_partition_values)
+            if below_partitions_cadence:
+                logger.warning(
+                    "[low] Partitions below %.0f%% cadence fill:\n below_partitions_cadence = %s",
+                    thr * 100,
+                    below_partitions_cadence,
+                )
+            if above_partitions_cadence:
+                logger.info(
+                    "[high] Partitions at/above %.0f%% cadence fill:\n keep_partitions_cadence = %s",
+                    thr * 100,
+                    above_partitions_cadence,
+                )
+
     return summary
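
The summary returned by print_report now carries cadence-aware lists alongside the existing coverage lists. A rough sketch of reading the new keys from a populated collector (key names come from the hunks above; everything else is illustrative):

    # assumes `collector` has already consumed the dataset's vectors
    summary = collector.print_report()

    # Partitions whose cadence-level fill falls below the configured threshold.
    drop_cadence = summary["below_partitions_cadence"]

    # Partition values judged healthy; falls back to all partition values
    # when nothing clears the threshold, per the summary.update above.
    keep_values = summary["keep_partition_values"]

    # Partition suffix values (e.g. "@hour:12" reduced to "12") flagged as low coverage.
    low_values = summary["below_partition_values"]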

datapipeline/build/tasks/__init__.py (new file)

@@ -0,0 +1,11 @@
+from .config import compute_config_hash
+from .schema import materialize_vector_schema
+from .metadata import materialize_metadata
+from .scaler import materialize_scaler_statistics
+
+__all__ = [
+    "compute_config_hash",
+    "materialize_vector_schema",
+    "materialize_metadata",
+    "materialize_scaler_statistics",
+]

datapipeline/build/tasks/config.py (new file)

@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import hashlib
+from pathlib import Path
+from typing import Iterable
+
+from datapipeline.services.project_paths import read_project
+
+
+def _resolve_relative(project_yaml: Path, value: str) -> Path:
+    path = Path(value)
+    return path if path.is_absolute() else (project_yaml.parent / path)
+
+
+def _normalized_label(path: Path, base_dir: Path) -> str:
+    try:
+        return str(path.resolve().relative_to(base_dir))
+    except ValueError:
+        return str(path.resolve())
+
+
+def _hash_file(hasher, path: Path, base_dir: Path) -> None:
+    hasher.update(_normalized_label(path, base_dir).encode("utf-8"))
+    hasher.update(b"\0")
+    hasher.update(path.read_bytes())
+    hasher.update(b"\0")
+
+
+def _yaml_files(directory: Path) -> Iterable[Path]:
+    if not directory.exists():
+        return []
+    return sorted(p for p in directory.rglob("*.y*ml") if p.is_file())
+
+
+def compute_config_hash(project_yaml: Path, tasks_path: Path) -> str:
+    """Compute a deterministic hash across relevant config inputs."""
+
+    hasher = hashlib.sha256()
+    base_dir = project_yaml.parent.resolve()
+    cfg = read_project(project_yaml)
+
+    required = [
+        project_yaml.resolve(),
+        _resolve_relative(project_yaml, cfg.paths.dataset).resolve(),
+        _resolve_relative(project_yaml, cfg.paths.postprocess).resolve(),
+    ]
+
+    for path in required:
+        if not path.exists():
+            raise FileNotFoundError(f"Expected config file missing: {path}")
+        _hash_file(hasher, path, base_dir)
+
+    if not tasks_path.is_dir():
+        raise TypeError(
+            f"project.paths.tasks must point to a directory, got: {tasks_path}"
+        )
+    hasher.update(
+        f"[dir]{_normalized_label(tasks_path, base_dir)}".encode("utf-8")
+    )
+    for p in _yaml_files(tasks_path):
+        _hash_file(hasher, p, base_dir)
+
+    for dir_value in (cfg.paths.sources, cfg.paths.streams):
+        directory = _resolve_relative(project_yaml, dir_value)
+        hasher.update(
+            f"[dir]{_normalized_label(directory, base_dir)}".encode("utf-8")
+        )
+        if not directory.exists():
+            hasher.update(b"[missing]")
+            continue
+        for path in _yaml_files(directory):
+            _hash_file(hasher, path, base_dir)
+
+    return hasher.hexdigest()
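
A rough usage sketch for the new config-hash task; compute_config_hash is re-exported from datapipeline.build.tasks, and the paths below are illustrative (they mirror the bundled your-dataset template layout):

    from pathlib import Path

    from datapipeline.build.tasks import compute_config_hash

    project_yaml = Path("your-dataset/project.yaml")   # illustrative location
    tasks_dir = project_yaml.parent / "tasks"

    # Raises FileNotFoundError if the dataset/postprocess configs are missing
    # and TypeError if tasks_dir is not a directory.
    digest = compute_config_hash(project_yaml, tasks_dir)
    print(digest)  # stable sha256 hex digest over the labelled YAML contents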