jerry-thomas 0.0.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. datapipeline/analysis/vector_analyzer.py +678 -31
  2. datapipeline/cli/app.py +278 -61
  3. datapipeline/cli/commands/domain.py +2 -2
  4. datapipeline/cli/commands/inspect.py +169 -0
  5. datapipeline/cli/commands/link.py +48 -14
  6. datapipeline/cli/commands/plugin.py +2 -2
  7. datapipeline/cli/commands/run.py +47 -48
  8. datapipeline/cli/visual_source.py +32 -0
  9. datapipeline/cli/visuals.py +4 -31
  10. datapipeline/config/catalog.py +15 -7
  11. datapipeline/config/dataset/dataset.py +4 -7
  12. datapipeline/config/dataset/feature.py +6 -17
  13. datapipeline/config/dataset/loader.py +83 -3
  14. datapipeline/config/dataset/normalize.py +19 -5
  15. datapipeline/config/project.py +0 -2
  16. datapipeline/domain/feature.py +13 -6
  17. datapipeline/domain/record.py +15 -7
  18. datapipeline/domain/vector.py +4 -2
  19. datapipeline/filters/filters.py +0 -1
  20. datapipeline/integrations/__init__.py +19 -0
  21. datapipeline/integrations/ml.py +319 -0
  22. datapipeline/mappers/synthetic/time.py +3 -3
  23. datapipeline/pipeline/pipelines.py +81 -34
  24. datapipeline/pipeline/stages.py +104 -49
  25. datapipeline/pipeline/utils/keygen.py +23 -1
  26. datapipeline/pipeline/utils/memory_sort.py +1 -1
  27. datapipeline/pipeline/utils/ordering.py +0 -2
  28. datapipeline/pipeline/utils/transform_utils.py +14 -79
  29. datapipeline/plugins.py +15 -1
  30. datapipeline/registries/registries.py +15 -0
  31. datapipeline/registries/registry.py +28 -0
  32. datapipeline/services/bootstrap.py +50 -17
  33. datapipeline/services/constants.py +2 -0
  34. datapipeline/services/factories.py +9 -5
  35. datapipeline/services/project_paths.py +41 -1
  36. datapipeline/services/scaffold/domain.py +6 -3
  37. datapipeline/services/scaffold/mappers.py +2 -2
  38. datapipeline/services/scaffold/plugin.py +5 -5
  39. datapipeline/services/scaffold/source.py +15 -25
  40. datapipeline/services/scaffold/templates.py +1 -5
  41. datapipeline/sources/models/__init__.py +1 -3
  42. datapipeline/sources/models/loader.py +1 -12
  43. datapipeline/sources/synthetic/time/parser.py +4 -4
  44. datapipeline/templates/plugin_skeleton/README.md +14 -11
  45. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml +22 -2
  46. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml +22 -3
  47. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +29 -0
  48. datapipeline/templates/plugin_skeleton/config/{project.yaml → datasets/default/project.yaml} +3 -3
  49. datapipeline/templates/plugin_skeleton/config/{distilleries → sources}/time_ticks.yaml +2 -0
  50. datapipeline/templates/plugin_skeleton/pyproject.toml +2 -2
  51. datapipeline/templates/stubs/dto.py.j2 +1 -2
  52. datapipeline/templates/stubs/mapper.py.j2 +3 -4
  53. datapipeline/templates/stubs/record.py.j2 +1 -1
  54. datapipeline/templates/stubs/source.yaml.j2 +4 -0
  55. datapipeline/transforms/debug/identity.py +74 -0
  56. datapipeline/transforms/debug/lint.py +101 -0
  57. datapipeline/transforms/feature/model.py +12 -0
  58. datapipeline/transforms/{transforms.py → feature/scaler.py} +9 -67
  59. datapipeline/transforms/filter.py +57 -0
  60. datapipeline/transforms/record/floor_time.py +17 -0
  61. datapipeline/transforms/record/lag.py +18 -0
  62. datapipeline/transforms/sequence.py +68 -15
  63. datapipeline/transforms/stream/ensure_ticks.py +33 -0
  64. datapipeline/transforms/stream/fill.py +103 -0
  65. datapipeline/transforms/stream/granularity.py +92 -0
  66. datapipeline/transforms/utils.py +10 -0
  67. datapipeline/transforms/vector.py +226 -0
  68. datapipeline/transforms/vector_utils.py +84 -0
  69. datapipeline/utils/load.py +3 -1
  70. datapipeline/utils/paths.py +26 -0
  71. datapipeline/utils/time.py +6 -4
  72. {jerry_thomas-0.0.5.dist-info → jerry_thomas-0.2.0.dist-info}/METADATA +153 -53
  73. jerry_thomas-0.2.0.dist-info/RECORD +112 -0
  74. jerry_thomas-0.2.0.dist-info/entry_points.txt +39 -0
  75. datapipeline/cli/commands/analyze.py +0 -32
  76. datapipeline/cli/openers.py +0 -11
  77. datapipeline/config/dataset/group_by.py +0 -31
  78. datapipeline/streams/canonical.py +0 -28
  79. datapipeline/streams/raw.py +0 -16
  80. datapipeline/templates/plugin_skeleton/config/contracts/time_ticks.yaml +0 -2
  81. datapipeline/templates/plugin_skeleton/config/recipe.yaml +0 -17
  82. jerry_thomas-0.0.5.dist-info/RECORD +0 -99
  83. jerry_thomas-0.0.5.dist-info/entry_points.txt +0 -44
  84. {jerry_thomas-0.0.5.dist-info → jerry_thomas-0.2.0.dist-info}/WHEEL +0 -0
  85. {jerry_thomas-0.0.5.dist-info → jerry_thomas-0.2.0.dist-info}/licenses/LICENSE +0 -0
  86. {jerry_thomas-0.0.5.dist-info → jerry_thomas-0.2.0.dist-info}/top_level.txt +0 -0
datapipeline/analysis/vector_analyzer.py
@@ -1,49 +1,696 @@
-from collections import defaultdict, Counter
-from typing import Any, Hashable, Iterable
+import csv
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any, Hashable, Iterable, Literal
+from datetime import datetime
 
-import numpy as np
+
+def _base_feature_id(feature_id: str) -> str:
+    """Return the base feature id without partition suffix."""
+
+    if "__" in feature_id:
+        return feature_id.split("__", 1)[0]
+    return feature_id
+
+
+def _is_missing_value(value: Any) -> bool:
+    if value is None:
+        return True
+    if isinstance(value, float):
+        return value != value  # NaN without numpy
+    return False
 
 
 class VectorStatsCollector:
-    def __init__(self, expected_feature_ids: Iterable[str]):
-        self.expected_feature_ids = set(expected_feature_ids)
-        self.missing_features = Counter()
-        self.empty_vectors = 0
+    """Collect coverage statistics for feature vectors."""
+
+    def __init__(
+        self,
+        expected_feature_ids: Iterable[str] | None = None,
+        *,
+        match_partition: Literal["base", "full"] = "base",
+        sample_limit: int = 5,
+        threshold: float | None = 0.95,
+        show_matrix: bool = False,
+        matrix_rows: int = 20,
+        matrix_cols: int = 10,
+        matrix_output: str | None = None,
+        matrix_format: str = "csv",
+    ) -> None:
+        self.match_partition = match_partition
+        self.threshold = threshold
+        self.show_matrix = show_matrix
+        self.matrix_rows = matrix_rows if matrix_rows and matrix_rows > 0 else None
+        self.matrix_cols = matrix_cols if matrix_cols and matrix_cols > 0 else None
+        self.matrix_output = Path(matrix_output) if matrix_output else None
+        self.matrix_format = matrix_format
+
+        self.expected_features = (
+            {self._normalize(fid) for fid in expected_feature_ids}
+            if expected_feature_ids
+            else set()
+        )
+
+        self.discovered_features: set[str] = set()
+        self.discovered_partitions: set[str] = set()
+
         self.total_vectors = 0
-        self.per_group_missing = defaultdict(set)
+        self.empty_vectors = 0
 
-    def update(self, group_key: Hashable, feature_vector: dict[str, Any]):
-        self.total_vectors += 1
+        self.present_counts = Counter()
+        self.present_counts_partitions = Counter()
+        self.null_counts_partitions = Counter()
+
+        self.missing_samples = defaultdict(list)
+        self.missing_partition_samples = defaultdict(list)
+        self.sample_limit = sample_limit
+
+        self.group_feature_status = defaultdict(dict)
+        self.group_partition_status = defaultdict(dict)
+        # Optional per-cell sub-status for list-valued entries (finer resolution inside a bucket)
+        self.group_feature_sub: dict[Hashable,
+                                     dict[str, list[str]]] = defaultdict(dict)
+        self.group_partition_sub: dict[Hashable,
+                                       dict[str, list[str]]] = defaultdict(dict)
+
+    @staticmethod
+    def _group_sort_key(g: Hashable):
+        """Stable, chronological sort key for group keys.
+
+        Many pipelines use a 1-tuple containing a datetime as the group key.
+        Sorting by ``str(g)`` can produce lexicographic mis-ordering (e.g.,
+        hours "3" vs "21"). This helper prefers numeric datetime ordering and
+        falls back to string representation only when needed.
+        """
+        def norm(p: Any):
+            if isinstance(p, datetime):
+                # Use POSIX timestamp for monotonic ordering
+                return p.timestamp()
+            return p
 
-        present_features = set(feature_vector.keys())
+        if isinstance(g, (tuple, list)):
+            return tuple(norm(p) for p in g)
+        return norm(g)
 
-        if not present_features:
+    def _normalize(self, feature_id: str) -> str:
+        if self.match_partition == "full":
+            return feature_id
+        return _base_feature_id(feature_id)
+
+    def update(self, group_key: Hashable, feature_vector: dict[str, Any]) -> None:
+        self.total_vectors += 1
+
+        present_partitions = set(feature_vector.keys())
+        if not present_partitions:
             self.empty_vectors += 1
 
-        missing = self.expected_feature_ids - present_features
-        for feature_id in missing:
-            self.missing_features[feature_id] += 1
-            self.per_group_missing[group_key].add(feature_id)
+        status_features = self.group_feature_status[group_key]
+        status_partitions = self.group_partition_status[group_key]
+
+        present_normalized: set[str] = set()
+        seen_partitions: set[str] = set()
+        for partition_id in present_partitions:
+            normalized = self._normalize(partition_id)
+            present_normalized.add(normalized)
+            seen_partitions.add(partition_id)
+
+            value = feature_vector[partition_id]
+
+            status_features.setdefault(normalized, "present")
+            status_partitions.setdefault(partition_id, "present")
+
+            self.discovered_features.add(normalized)
+            self.discovered_partitions.add(partition_id)
+
+            # Capture sub-status for list-valued entries
+            sub: list[str] | None = None
+            if isinstance(value, list):
+                sub = []
+                for v in value:
+                    if v is None or (isinstance(v, float) and v != v):
+                        sub.append("null")
+                    else:
+                        sub.append("present")
+            if sub:
+                self.group_partition_sub[group_key][partition_id] = sub
+                # Only store one sub per normalized id (first seen)
+                self.group_feature_sub[group_key].setdefault(
+                    normalized, sub)
+
+            is_null = _is_missing_value(value)
+            if is_null:
+                status_features[normalized] = "null"
+                status_partitions[partition_id] = "null"
+                self.null_counts_partitions[partition_id] += 1
+                if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
+                    self.missing_partition_samples[partition_id].append(
+                        (group_key, "null")
+                    )
+                if len(self.missing_samples[normalized]) < self.sample_limit:
+                    self.missing_samples[normalized].append(
+                        (group_key, "null"))
+
+        for normalized in present_normalized:
+            if status_features.get(normalized) == "present":
+                self.present_counts[normalized] += 1
+
+        for partition_id in seen_partitions:
+            if status_partitions.get(partition_id) == "present":
+                self.present_counts_partitions[partition_id] += 1
+
+        tracked_features = (
+            self.expected_features if self.expected_features else self.discovered_features
+        )
+        missing_features = tracked_features - present_normalized
+        for feature_id in missing_features:
+            if status_features.get(feature_id) != "null":
+                status_features[feature_id] = "absent"
+            if len(self.missing_samples[feature_id]) < self.sample_limit:
+                self.missing_samples[feature_id].append((group_key, "absent"))
+
+        if self.match_partition == "full":
+            tracked_partitions = (
+                set(self.expected_features) if self.expected_features else self.discovered_partitions
+            )
+        else:
+            tracked_partitions = self.discovered_partitions
+
+        missing_partitions = tracked_partitions - present_partitions
+        for partition_id in missing_partitions:
+            if status_partitions.get(partition_id) != "null":
+                status_partitions[partition_id] = "absent"
+            if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
+                self.missing_partition_samples[partition_id].append(
+                    (group_key, "absent")
+                )
+
+    def _coverage(
+        self, identifier: str, *, partitions: bool = False
+    ) -> tuple[int, int, int]:
+        present = (
+            self.present_counts_partitions[identifier]
+            if partitions
+            else self.present_counts[identifier]
+        )
+        opportunities = self.total_vectors
+        missing = max(opportunities - present, 0)
+        return present, missing, opportunities
+
+    def _feature_null_count(self, feature_id: str) -> int:
+        total = 0
+        for partition_id, count in self.null_counts_partitions.items():
+            if self._normalize(partition_id) == feature_id:
+                total += count
+        return total
+
+    @staticmethod
+    def _format_group_key(group_key: Hashable) -> str:
+        if isinstance(group_key, tuple):
+            return ", ".join(str(part) for part in group_key)
+        return str(group_key)
+
+    @staticmethod
+    def _symbol_for(status: str) -> str:
+        return {
+            "present": "#",
+            "null": "!",
+            "absent": ".",
+        }.get(status, ".")
+
+    @staticmethod
+    def _format_samples(samples: list[tuple[Hashable, str]], limit: int = 3) -> str:
+        if not samples:
+            return ""
+        trimmed = samples[:limit]
+        rendered = ", ".join(
+            f"{reason}@{sample}" for sample, reason in trimmed)
+        if len(samples) > limit:
+            rendered += ", …"
+        return rendered
+
+    @staticmethod
+    def _partition_suffix(partition_id: str) -> str:
+        return partition_id.split("__", 1)[1] if "__" in partition_id else partition_id
+
+    def _render_matrix(
+        self,
+        *,
+        features: list[str],
+        partitions: bool = False,
+        column_width: int = 6,
+    ) -> None:
+        status_map = self.group_partition_status if partitions else self.group_feature_status
+        if not status_map or not features:
+            return
+
+        column_width = max(column_width, min(
+            10, max(len(fid) for fid in features)))
+
+        def status_for(group: Hashable, fid: str) -> str:
+            statuses = status_map.get(group, {})
+            return statuses.get(fid, "absent")
 
-        # 🔍 Check for features present but with missing/invalid values
-        for fid in present_features & self.expected_feature_ids:
-            val = feature_vector[fid]
-            if val is None or (isinstance(val, float) and np.isnan(val)):
-                self.missing_features[fid] += 1
-                self.per_group_missing[group_key].add(fid)
+        sorted_groups = sorted(status_map.keys(), key=self._group_sort_key)
+        focus_groups = [
+            g
+            for g in sorted_groups
+            if any(status_for(g, fid) != "present" for fid in features)
+        ]
+        if not focus_groups:
+            focus_groups = sorted_groups
+        if self.matrix_rows is not None:
+            focus_groups = focus_groups[: self.matrix_rows]
+
+        matrix_label = "Partition" if partitions else "Feature"
+        print(f"\n→ {matrix_label} availability heatmap:")
+
+        header = " " * 20 + " ".join(
+            f"{fid[-column_width:]:>{column_width}}" for fid in features
+        )
+        print(header)
+
+        for group in focus_groups:
+            label = self._format_group_key(group)
+            label = label[:18].ljust(18)
+            cells = " ".join(
+                f"{self._symbol_for(status_for(group, fid)):^{column_width}}"
+                for fid in features
+            )
+            print(f" {label} {cells}")
+
+        print(" Legend: # present | ! null | . missing")
+
+    def print_report(self) -> None:
+        tracked_features = (
+            self.expected_features if self.expected_features else self.discovered_features
+        )
+        tracked_partitions = (
+            set(self.expected_features)
+            if self.match_partition == "full" and self.expected_features
+            else self.discovered_partitions
+        )
+
+        summary: dict[str, Any] = {
+            "total_vectors": self.total_vectors,
+            "empty_vectors": self.empty_vectors,
+            "match_partition": self.match_partition,
+            "tracked_features": sorted(tracked_features),
+            "tracked_partitions": sorted(tracked_partitions),
+            "threshold": self.threshold,
+        }
 
-    def print_report(self):
         print("\n=== Vector Quality Report ===")
         print(f"Total vectors processed: {self.total_vectors}")
         print(f"Empty vectors: {self.empty_vectors}")
         print(
-            f"Features expected: {sorted(self.expected_feature_ids)[:10] }... (total {len(self.expected_feature_ids)})")
+            f"Features tracked ({self.match_partition}): {len(tracked_features)}"
+        )
+        if self.match_partition == "full":
+            print(f"Partitions observed: {len(self.discovered_partitions)}")
+
+        if not self.total_vectors:
+            print("(no vectors analyzed)")
+            summary.update(
+                {
+                    "feature_stats": [],
+                    "partition_stats": [],
+                    "below_features": [],
+                    "keep_features": [],
+                    "below_partitions": [],
+                    "keep_partitions": [],
+                    "below_suffixes": [],
+                    "keep_suffixes": [],
+                }
+            )
+            return summary
+
+        feature_stats = []
+        print("\n→ Feature coverage (sorted by missing count):")
+        for feature_id in sorted(
+            tracked_features,
+            key=lambda fid: self._coverage(fid)[1],
+            reverse=True,
+        ):
+            present, missing, opportunities = self._coverage(feature_id)
+            coverage = present / opportunities if opportunities else 0.0
+            nulls = self._feature_null_count(feature_id)
+            raw_samples = self.missing_samples.get(feature_id, [])
+            sample_note = self._format_samples(raw_samples)
+            samples = [
+                {
+                    "group": self._format_group_key(group_key),
+                    "status": status,
+                }
+                for group_key, status in raw_samples
+            ]
+            line = (
+                f" - {feature_id}: present {present}/{opportunities}"
+                f" ({coverage:.1%}) | missing {missing} | null {nulls}"
+            )
+            if sample_note:
+                line += f"; samples: {sample_note}"
+            print(line)
+            feature_stats.append(
+                {
+                    "id": feature_id,
+                    "present": present,
+                    "missing": missing,
+                    "nulls": nulls,
+                    "coverage": coverage,
+                    "opportunities": opportunities,
+                    "samples": samples,
+                }
+            )
+
+        summary["feature_stats"] = feature_stats
+
+        partition_stats = []
+        if tracked_partitions:
+            for partition_id in tracked_partitions:
+                present, missing, opportunities = self._coverage(
+                    partition_id, partitions=True
+                )
+                coverage = present / opportunities if opportunities else 0.0
+                nulls = self.null_counts_partitions.get(partition_id, 0)
+                raw_samples = self.missing_partition_samples.get(
+                    partition_id, [])
+                partition_stats.append(
+                    {
+                        "id": partition_id,
+                        "base": _base_feature_id(partition_id),
+                        "present": present,
+                        "missing": missing,
+                        "nulls": nulls,
+                        "coverage": coverage,
+                        "opportunities": opportunities,
+                        "samples": [
+                            {
+                                "group": self._format_group_key(group_key),
+                                "status": status,
+                            }
+                            for group_key, status in raw_samples
+                        ],
+                    }
+                )
+
+            print("\n→ Partition details (top by missing count):")
+            for stats in sorted(partition_stats, key=lambda s: s["missing"], reverse=True)[
+                :20
+            ]:
+                line = (
+                    f" - {stats['id']} (base: {stats['base']}): present {stats['present']}/{stats['opportunities']}"
+                    f" ({stats['coverage']:.1%}) | missing {stats['missing']} | null/invalid {stats['nulls']}"
+                )
+                print(line)
+
+        summary["partition_stats"] = partition_stats
+
+        below_features: list[str] = []
+        above_features: list[str] = []
+        below_partitions: list[str] = []
+        above_partitions: list[str] = []
+
+        if self.threshold is not None:
+            thr = self.threshold
+            below_features = [
+                stats["id"] for stats in feature_stats if stats["coverage"] < thr
+            ]
+            above_features = [
+                stats["id"] for stats in feature_stats if stats["coverage"] >= thr
+            ]
+            print(
+                f"\n📉 Features below {thr:.0%} coverage:\n below_features = {below_features}"
+            )
+            print(
+                f"📈 Features at/above {thr:.0%} coverage:\n keep_features = {above_features}"
+            )
+
+            if partition_stats:
+                below_partitions = [
+                    stats["id"]
+                    for stats in partition_stats
+                    if stats["coverage"] < thr
+                ]
+                above_partitions = [
+                    stats["id"]
+                    for stats in partition_stats
+                    if stats["coverage"] >= thr
+                ]
+                below_suffixes = [
+                    self._partition_suffix(pid)
+                    for pid in below_partitions
+                ]
+                above_suffixes = [
+                    self._partition_suffix(pid)
+                    for pid in above_partitions
+                    if self._partition_suffix(pid) != pid
+                ]
+                if not above_partitions:
+                    above_suffixes = []
+                print(
+                    f"\n📉 Partitions below {thr:.0%} coverage:\n below_partitions = {below_partitions}"
+                )
+                print(f" below_suffixes = {below_suffixes}")
+                print(
+                    f"📈 Partitions at/above {thr:.0%} coverage:\n keep_partitions = {above_partitions}"
+                )
+                print(f" keep_suffixes = {above_suffixes}")
+
+            summary.update(
+                {
+                    "below_features": below_features,
+                    "keep_features": above_features,
+                    "below_partitions": below_partitions,
+                    "keep_partitions": above_partitions,
+                    "below_suffixes": below_suffixes,
+                    "keep_suffixes": above_suffixes,
+                }
+            )
+        else:
+            summary.update(
+                {
+                    "below_features": [],
+                    "keep_features": [stats["id"] for stats in feature_stats],
+                    "below_partitions": [],
+                    "keep_partitions": [stats["id"] for stats in partition_stats],
+                    "below_suffixes": [],
+                    "keep_suffixes": [
+                        self._partition_suffix(stats["id"])
+                        for stats in partition_stats
+                        if self._partition_suffix(stats["id"]) != stats["id"]
+                    ]
+                    if partition_stats
+                    else [],
+                }
+            )
+
+        if self.show_matrix:
+            feature_candidates = (
+                below_features
+                or [stats["id"] for stats in feature_stats if stats["missing"] > 0]
+                or [stats["id"] for stats in feature_stats]
+            )
+            selected_features = (
+                feature_candidates
+                if self.matrix_cols is None
+                else feature_candidates[: self.matrix_cols]
+            )
+            if selected_features:
+                self._render_matrix(features=selected_features)
+
+            if partition_stats:
+                partition_candidates = (
+                    below_partitions
+                    or [stats["id"] for stats in partition_stats if stats["missing"] > 0]
+                    or [stats["id"] for stats in partition_stats]
+                )
+                selected_partitions = (
+                    partition_candidates
+                    if self.matrix_cols is None
+                    else partition_candidates[: self.matrix_cols]
+                )
+                if selected_partitions:
+                    self._render_matrix(
+                        features=selected_partitions, partitions=True)
+
+        group_missing = [
+            (
+                group,
+                sum(
+                    1
+                    for fid in tracked_features
+                    if self.group_feature_status[group].get(fid, "absent") != "present"
+                ),
+            )
+            for group in self.group_feature_status
+        ]
+        group_missing = [item for item in group_missing if item[1] > 0]
+        if group_missing:
+            print("\n→ Time buckets with missing features:")
+            for group, count in sorted(group_missing, key=lambda item: item[1], reverse=True)[:10]:
+                print(
+                    f" - {self._format_group_key(group)}: {count} features missing")
+
+        if partition_stats:
+            partition_missing = [
+                (
+                    group,
+                    sum(
+                        1
+                        for pid in self.group_partition_status[group]
+                        if self.group_partition_status[group].get(pid, "absent") != "present"
+                    ),
+                )
+                for group in self.group_partition_status
+            ]
+            partition_missing = [
+                item for item in partition_missing if item[1] > 0]
+            if partition_missing:
+                print("\n→ Time buckets with missing partitions:")
+                for group, count in sorted(
+                    partition_missing, key=lambda item: item[1], reverse=True
+                )[:10]:
+                    print(
+                        f" - {self._format_group_key(group)}: {count} partitions missing")
+
+        if self.matrix_output:
+            self._export_matrix_data()
+
+        return summary
+
+    def _export_matrix_data(self) -> None:
+        if not self.matrix_output:
+            return
+
+        path = self.matrix_output
+        path.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            if self.matrix_format == "html":
+                self._write_matrix_html(path)
+            else:
+                self._write_matrix_csv(path)
+            print(f"\n📝 Saved availability matrix to {path}")
+        except OSError as exc:
+            print(f"\n⚠️ Failed to write availability matrix to {path}: {exc}")
+
+    def _collect_feature_ids(self) -> list[str]:
+        feature_ids: set[str] = set()
+        for statuses in self.group_feature_status.values():
+            feature_ids.update(statuses.keys())
+        return sorted(feature_ids)
+
+    def _collect_partition_ids(self) -> list[str]:
+        partition_ids: set[str] = set()
+        for statuses in self.group_partition_status.values():
+            partition_ids.update(statuses.keys())
+        return sorted(partition_ids)
+
+    def _collect_group_keys(self) -> list[Hashable]:
+        keys = set(self.group_feature_status.keys()) | set(
+            self.group_partition_status.keys()
+        )
+        return sorted(keys, key=self._group_sort_key)
+
+    def _write_matrix_csv(self, path: Path) -> None:
+        rows: list[tuple[str, str, str, str]] = []
+        for group, statuses in self.group_feature_status.items():
+            group_key = self._format_group_key(group)
+            for fid, status in statuses.items():
+                rows.append(("feature", fid, group_key, status))
+
+        for group, statuses in self.group_partition_status.items():
+            group_key = self._format_group_key(group)
+            for pid, status in statuses.items():
+                rows.append(("partition", pid, group_key, status))
+
+        with path.open("w", newline="", encoding="utf-8") as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(["kind", "identifier", "group_key", "status"])
+            writer.writerows(rows)
+
+    def _write_matrix_html(self, path: Path) -> None:
+        feature_ids = self._collect_feature_ids()
+        partition_ids = self._collect_partition_ids()
+        group_keys = self._collect_group_keys()
+
+        def render_table(title: str, identifiers: list[str], status_map: dict, sub_map: dict) -> str:
+            if not identifiers:
+                return f"<h2>{title}</h2><p>No data.</p>"
+
+            cell_class = {
+                "present": "status-present",
+                "null": "status-null",
+                "absent": "status-absent",
+            }
+
+            header = "".join(
+                f"<th>{id_}</th>" for id_ in identifiers
+            )
+            rows_html = []
+            for group in group_keys:
+                key_str = self._format_group_key(group)
+                statuses = status_map.get(group, {})
+                cells = []
+                for identifier in identifiers:
+                    status = statuses.get(identifier, "absent")
+                    cls = cell_class.get(status, "status-absent")
+                    # Render sub-cells when available to convey partial availability
+                    sub = sub_map.get(group, {}).get(identifier)
+                    if sub:
+                        parts = "".join(
+                            f"<span class='{cell_class.get(s, 'status-absent')}' title='{s}'>&nbsp;</span>"
+                            for s in sub
+                        )
+                        cells.append(
+                            f"<td title='{status}'><div class='sub'>{parts}</div></td>")
+                    else:
+                        symbol = self._symbol_for(status)
+                        cells.append(
+                            f"<td class='{cls}' title='{status}'>{symbol}</td>")
+                rows_html.append(
+                    f"<tr><th>{key_str}</th>{''.join(cells)}</tr>"
+                )
+
+            return (
+                f"<h2>{title}</h2>"
+                "<table class='heatmap'>"
+                f"<tr><th>Group</th>{header}</tr>"
+                f"{''.join(rows_html)}"
+                "</table>"
+            )
+
+        feature_table = render_table(
+            "Feature Availability",
+            feature_ids,
+            self.group_feature_status,
+            self.group_feature_sub,
+        )
+        partition_table = render_table(
+            "Partition Availability",
+            partition_ids,
+            self.group_partition_status,
+            self.group_partition_sub,
+        )
+
+        style = """
+        body { font-family: Arial, sans-serif; }
+        table.heatmap { border-collapse: collapse; margin-bottom: 2rem; }
+        .heatmap th, .heatmap td { border: 1px solid #ccc; padding: 4px 6px; }
+        .heatmap th { background: #f0f0f0; position: sticky; top: 0; }
+        .status-present { background: #2ecc71; color: #fff; }
+        .status-null { background: #f1c40f; color: #000; }
+        .status-absent { background: #e74c3c; color: #fff; }
+        .sub { display: flex; gap: 1px; height: 12px; }
+        .sub span { flex: 1; display: block; }
+        """
 
-        print("\n→ Missing feature counts:")
-        for fid, count in sorted(self.missing_features.items(), key=lambda x: -x[1]):
-            print(f" - {fid}: missing in {count} vectors")
+        html = (
+            "<html><head><meta charset='utf-8'>"
+            f"<style>{style}</style>"
+            "<title>Feature Availability</title></head><body>"
+            f"<h1>Availability Matrix</h1>{feature_table}{partition_table}"
+            "</body></html>"
+        )
 
-        if self.per_group_missing:
-            print("\n→ Groups with missing features (sample):")
-            for group_key, missing in list(self.per_group_missing.items())[:5]:
-                print(f" - Group {group_key}: {sorted(missing)}")
+        with path.open("w", encoding="utf-8") as fh:
+            fh.write(html)
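
The diff above replaces the old missing-feature counter with a full coverage reporter: construct the collector, feed it one feature vector per group, then print the report. Below is a minimal usage sketch based on the API as it appears in this diff; the import path follows the file's location in the wheel, while the feature ids, group keys, values, and output path are invented for illustration.

from datetime import datetime

from datapipeline.analysis.vector_analyzer import VectorStatsCollector

# Partitioned ids carry a double-underscore suffix (e.g. "temp__zone1"); with
# the default match_partition="base" they normalize to the base id "temp".
collector = VectorStatsCollector(
    expected_feature_ids=["temp", "humidity"],  # hypothetical feature ids
    threshold=0.95,
    show_matrix=True,
    matrix_output="reports/availability.html",  # hypothetical output path
    matrix_format="html",
)

# One update per grouped vector; group keys are often 1-tuples of datetimes.
collector.update((datetime(2024, 1, 1, 0),), {"temp__zone1": 21.5, "humidity": 0.4})
collector.update((datetime(2024, 1, 1, 1),), {"temp__zone1": float("nan")})  # counted as null
collector.update((datetime(2024, 1, 1, 2),), {})  # counted as an empty vector

# Prints the report, writes the HTML availability matrix, and (despite the
# "-> None" annotation in the diff) returns the summary dict it assembles.
summary = collector.print_report()
print(summary["below_features"])  # features under the 95% coverage threshold

The returned summary mirrors what print_report assembles above (feature_stats, partition_stats, and the below_/keep_ lists), so the same data the report prints can also be consumed programmatically.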