jerry-thomas 0.0.5__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
- datapipeline/analysis/vector_analyzer.py +678 -31
- datapipeline/cli/app.py +278 -61
- datapipeline/cli/commands/domain.py +2 -2
- datapipeline/cli/commands/inspect.py +169 -0
- datapipeline/cli/commands/link.py +48 -14
- datapipeline/cli/commands/plugin.py +2 -2
- datapipeline/cli/commands/run.py +47 -48
- datapipeline/cli/visual_source.py +32 -0
- datapipeline/cli/visuals.py +4 -31
- datapipeline/config/catalog.py +15 -7
- datapipeline/config/dataset/dataset.py +4 -7
- datapipeline/config/dataset/feature.py +6 -17
- datapipeline/config/dataset/loader.py +83 -3
- datapipeline/config/dataset/normalize.py +19 -5
- datapipeline/config/project.py +0 -2
- datapipeline/domain/feature.py +13 -6
- datapipeline/domain/record.py +15 -7
- datapipeline/domain/vector.py +4 -2
- datapipeline/filters/filters.py +0 -1
- datapipeline/integrations/__init__.py +19 -0
- datapipeline/integrations/ml.py +319 -0
- datapipeline/mappers/synthetic/time.py +3 -3
- datapipeline/pipeline/pipelines.py +81 -34
- datapipeline/pipeline/stages.py +104 -49
- datapipeline/pipeline/utils/keygen.py +23 -1
- datapipeline/pipeline/utils/memory_sort.py +1 -1
- datapipeline/pipeline/utils/ordering.py +0 -2
- datapipeline/pipeline/utils/transform_utils.py +14 -79
- datapipeline/plugins.py +15 -1
- datapipeline/registries/registries.py +15 -0
- datapipeline/registries/registry.py +28 -0
- datapipeline/services/bootstrap.py +50 -17
- datapipeline/services/constants.py +2 -0
- datapipeline/services/factories.py +9 -5
- datapipeline/services/project_paths.py +41 -1
- datapipeline/services/scaffold/domain.py +6 -3
- datapipeline/services/scaffold/mappers.py +2 -2
- datapipeline/services/scaffold/plugin.py +5 -5
- datapipeline/services/scaffold/source.py +15 -25
- datapipeline/services/scaffold/templates.py +1 -5
- datapipeline/sources/models/__init__.py +1 -3
- datapipeline/sources/models/loader.py +1 -12
- datapipeline/sources/synthetic/time/parser.py +4 -4
- datapipeline/templates/plugin_skeleton/README.md +14 -11
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml +22 -2
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml +22 -3
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +29 -0
- datapipeline/templates/plugin_skeleton/config/{project.yaml → datasets/default/project.yaml} +3 -3
- datapipeline/templates/plugin_skeleton/config/{distilleries → sources}/time_ticks.yaml +2 -0
- datapipeline/templates/plugin_skeleton/pyproject.toml +2 -2
- datapipeline/templates/stubs/dto.py.j2 +1 -2
- datapipeline/templates/stubs/mapper.py.j2 +3 -4
- datapipeline/templates/stubs/record.py.j2 +1 -1
- datapipeline/templates/stubs/source.yaml.j2 +4 -0
- datapipeline/transforms/debug/identity.py +74 -0
- datapipeline/transforms/debug/lint.py +101 -0
- datapipeline/transforms/feature/model.py +12 -0
- datapipeline/transforms/{transforms.py → feature/scaler.py} +9 -67
- datapipeline/transforms/filter.py +57 -0
- datapipeline/transforms/record/floor_time.py +17 -0
- datapipeline/transforms/record/lag.py +18 -0
- datapipeline/transforms/sequence.py +68 -15
- datapipeline/transforms/stream/ensure_ticks.py +33 -0
- datapipeline/transforms/stream/fill.py +103 -0
- datapipeline/transforms/stream/granularity.py +92 -0
- datapipeline/transforms/utils.py +10 -0
- datapipeline/transforms/vector.py +226 -0
- datapipeline/transforms/vector_utils.py +84 -0
- datapipeline/utils/load.py +3 -1
- datapipeline/utils/paths.py +26 -0
- datapipeline/utils/time.py +6 -4
- {jerry_thomas-0.0.5.dist-info → jerry_thomas-0.2.0.dist-info}/METADATA +153 -53
- jerry_thomas-0.2.0.dist-info/RECORD +112 -0
- jerry_thomas-0.2.0.dist-info/entry_points.txt +39 -0
- datapipeline/cli/commands/analyze.py +0 -32
- datapipeline/cli/openers.py +0 -11
- datapipeline/config/dataset/group_by.py +0 -31
- datapipeline/streams/canonical.py +0 -28
- datapipeline/streams/raw.py +0 -16
- datapipeline/templates/plugin_skeleton/config/contracts/time_ticks.yaml +0 -2
- datapipeline/templates/plugin_skeleton/config/recipe.yaml +0 -17
- jerry_thomas-0.0.5.dist-info/RECORD +0 -99
- jerry_thomas-0.0.5.dist-info/entry_points.txt +0 -44
- {jerry_thomas-0.0.5.dist-info → jerry_thomas-0.2.0.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.0.5.dist-info → jerry_thomas-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.0.5.dist-info → jerry_thomas-0.2.0.dist-info}/top_level.txt +0 -0
datapipeline/analysis/vector_analyzer.py (+678 -31; removed lines are shown as the viewer truncated them)

```diff
@@ -1,49 +1,696 @@
-
-from
+import csv
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any, Hashable, Iterable, Literal
+from datetime import datetime
 
-
+
+def _base_feature_id(feature_id: str) -> str:
+    """Return the base feature id without partition suffix."""
+
+    if "__" in feature_id:
+        return feature_id.split("__", 1)[0]
+    return feature_id
+
+
+def _is_missing_value(value: Any) -> bool:
+    if value is None:
+        return True
+    if isinstance(value, float):
+        return value != value  # NaN without numpy
+    return False
 
 
 class VectorStatsCollector:
-
-
-
-        self
+    """Collect coverage statistics for feature vectors."""
+
+    def __init__(
+        self,
+        expected_feature_ids: Iterable[str] | None = None,
+        *,
+        match_partition: Literal["base", "full"] = "base",
+        sample_limit: int = 5,
+        threshold: float | None = 0.95,
+        show_matrix: bool = False,
+        matrix_rows: int = 20,
+        matrix_cols: int = 10,
+        matrix_output: str | None = None,
+        matrix_format: str = "csv",
+    ) -> None:
+        self.match_partition = match_partition
+        self.threshold = threshold
+        self.show_matrix = show_matrix
+        self.matrix_rows = matrix_rows if matrix_rows and matrix_rows > 0 else None
+        self.matrix_cols = matrix_cols if matrix_cols and matrix_cols > 0 else None
+        self.matrix_output = Path(matrix_output) if matrix_output else None
+        self.matrix_format = matrix_format
+
+        self.expected_features = (
+            {self._normalize(fid) for fid in expected_feature_ids}
+            if expected_feature_ids
+            else set()
+        )
+
+        self.discovered_features: set[str] = set()
+        self.discovered_partitions: set[str] = set()
+
         self.total_vectors = 0
-        self.
+        self.empty_vectors = 0
 
-
-        self.
+        self.present_counts = Counter()
+        self.present_counts_partitions = Counter()
+        self.null_counts_partitions = Counter()
+
+        self.missing_samples = defaultdict(list)
+        self.missing_partition_samples = defaultdict(list)
+        self.sample_limit = sample_limit
+
+        self.group_feature_status = defaultdict(dict)
+        self.group_partition_status = defaultdict(dict)
+        # Optional per-cell sub-status for list-valued entries (finer resolution inside a bucket)
+        self.group_feature_sub: dict[Hashable,
+                                     dict[str, list[str]]] = defaultdict(dict)
+        self.group_partition_sub: dict[Hashable,
+                                       dict[str, list[str]]] = defaultdict(dict)
+
+    @staticmethod
+    def _group_sort_key(g: Hashable):
+        """Stable, chronological sort key for group keys.
+
+        Many pipelines use a 1-tuple containing a datetime as the group key.
+        Sorting by ``str(g)`` can produce lexicographic mis-ordering (e.g.,
+        hours "3" vs "21"). This helper prefers numeric datetime ordering and
+        falls back to string representation only when needed.
+        """
+        def norm(p: Any):
+            if isinstance(p, datetime):
+                # Use POSIX timestamp for monotonic ordering
+                return p.timestamp()
+            return p
 
-
+        if isinstance(g, (tuple, list)):
+            return tuple(norm(p) for p in g)
+        return norm(g)
 
-
+    def _normalize(self, feature_id: str) -> str:
+        if self.match_partition == "full":
+            return feature_id
+        return _base_feature_id(feature_id)
+
+    def update(self, group_key: Hashable, feature_vector: dict[str, Any]) -> None:
+        self.total_vectors += 1
+
+        present_partitions = set(feature_vector.keys())
+        if not present_partitions:
             self.empty_vectors += 1
 
-
-
-
-
+        status_features = self.group_feature_status[group_key]
+        status_partitions = self.group_partition_status[group_key]
+
+        present_normalized: set[str] = set()
+        seen_partitions: set[str] = set()
+        for partition_id in present_partitions:
+            normalized = self._normalize(partition_id)
+            present_normalized.add(normalized)
+            seen_partitions.add(partition_id)
+
+            value = feature_vector[partition_id]
+
+            status_features.setdefault(normalized, "present")
+            status_partitions.setdefault(partition_id, "present")
+
+            self.discovered_features.add(normalized)
+            self.discovered_partitions.add(partition_id)
+
+            # Capture sub-status for list-valued entries
+            sub: list[str] | None = None
+            if isinstance(value, list):
+                sub = []
+                for v in value:
+                    if v is None or (isinstance(v, float) and v != v):
+                        sub.append("null")
+                    else:
+                        sub.append("present")
+            if sub:
+                self.group_partition_sub[group_key][partition_id] = sub
+                # Only store one sub per normalized id (first seen)
+                self.group_feature_sub[group_key].setdefault(
+                    normalized, sub)
+
+            is_null = _is_missing_value(value)
+            if is_null:
+                status_features[normalized] = "null"
+                status_partitions[partition_id] = "null"
+                self.null_counts_partitions[partition_id] += 1
+                if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
+                    self.missing_partition_samples[partition_id].append(
+                        (group_key, "null")
+                    )
+                if len(self.missing_samples[normalized]) < self.sample_limit:
+                    self.missing_samples[normalized].append(
+                        (group_key, "null"))
+
+        for normalized in present_normalized:
+            if status_features.get(normalized) == "present":
+                self.present_counts[normalized] += 1
+
+        for partition_id in seen_partitions:
+            if status_partitions.get(partition_id) == "present":
+                self.present_counts_partitions[partition_id] += 1
+
+        tracked_features = (
+            self.expected_features if self.expected_features else self.discovered_features
+        )
+        missing_features = tracked_features - present_normalized
+        for feature_id in missing_features:
+            if status_features.get(feature_id) != "null":
+                status_features[feature_id] = "absent"
+            if len(self.missing_samples[feature_id]) < self.sample_limit:
+                self.missing_samples[feature_id].append((group_key, "absent"))
+
+        if self.match_partition == "full":
+            tracked_partitions = (
+                set(self.expected_features) if self.expected_features else self.discovered_partitions
+            )
+        else:
+            tracked_partitions = self.discovered_partitions
+
+        missing_partitions = tracked_partitions - present_partitions
+        for partition_id in missing_partitions:
+            if status_partitions.get(partition_id) != "null":
+                status_partitions[partition_id] = "absent"
+            if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
+                self.missing_partition_samples[partition_id].append(
+                    (group_key, "absent")
+                )
+
+    def _coverage(
+        self, identifier: str, *, partitions: bool = False
+    ) -> tuple[int, int, int]:
+        present = (
+            self.present_counts_partitions[identifier]
+            if partitions
+            else self.present_counts[identifier]
+        )
+        opportunities = self.total_vectors
+        missing = max(opportunities - present, 0)
+        return present, missing, opportunities
+
+    def _feature_null_count(self, feature_id: str) -> int:
+        total = 0
+        for partition_id, count in self.null_counts_partitions.items():
+            if self._normalize(partition_id) == feature_id:
+                total += count
+        return total
+
+    @staticmethod
+    def _format_group_key(group_key: Hashable) -> str:
+        if isinstance(group_key, tuple):
+            return ", ".join(str(part) for part in group_key)
+        return str(group_key)
+
+    @staticmethod
+    def _symbol_for(status: str) -> str:
+        return {
+            "present": "#",
+            "null": "!",
+            "absent": ".",
+        }.get(status, ".")
+
+    @staticmethod
+    def _format_samples(samples: list[tuple[Hashable, str]], limit: int = 3) -> str:
+        if not samples:
+            return ""
+        trimmed = samples[:limit]
+        rendered = ", ".join(
+            f"{reason}@{sample}" for sample, reason in trimmed)
+        if len(samples) > limit:
+            rendered += ", …"
+        return rendered
+
+    @staticmethod
+    def _partition_suffix(partition_id: str) -> str:
+        return partition_id.split("__", 1)[1] if "__" in partition_id else partition_id
+
+    def _render_matrix(
+        self,
+        *,
+        features: list[str],
+        partitions: bool = False,
+        column_width: int = 6,
+    ) -> None:
+        status_map = self.group_partition_status if partitions else self.group_feature_status
+        if not status_map or not features:
+            return
+
+        column_width = max(column_width, min(
+            10, max(len(fid) for fid in features)))
+
+        def status_for(group: Hashable, fid: str) -> str:
+            statuses = status_map.get(group, {})
+            return statuses.get(fid, "absent")
 
-
-
-
-
-
-
+        sorted_groups = sorted(status_map.keys(), key=self._group_sort_key)
+        focus_groups = [
+            g
+            for g in sorted_groups
+            if any(status_for(g, fid) != "present" for fid in features)
+        ]
+        if not focus_groups:
+            focus_groups = sorted_groups
+        if self.matrix_rows is not None:
+            focus_groups = focus_groups[: self.matrix_rows]
+
+        matrix_label = "Partition" if partitions else "Feature"
+        print(f"\n→ {matrix_label} availability heatmap:")
+
+        header = " " * 20 + " ".join(
+            f"{fid[-column_width:]:>{column_width}}" for fid in features
+        )
+        print(header)
+
+        for group in focus_groups:
+            label = self._format_group_key(group)
+            label = label[:18].ljust(18)
+            cells = " ".join(
+                f"{self._symbol_for(status_for(group, fid)):^{column_width}}"
+                for fid in features
+            )
+            print(f" {label} {cells}")
+
+        print(" Legend: # present | ! null | . missing")
+
+    def print_report(self) -> None:
+        tracked_features = (
+            self.expected_features if self.expected_features else self.discovered_features
+        )
+        tracked_partitions = (
+            set(self.expected_features)
+            if self.match_partition == "full" and self.expected_features
+            else self.discovered_partitions
+        )
+
+        summary: dict[str, Any] = {
+            "total_vectors": self.total_vectors,
+            "empty_vectors": self.empty_vectors,
+            "match_partition": self.match_partition,
+            "tracked_features": sorted(tracked_features),
+            "tracked_partitions": sorted(tracked_partitions),
+            "threshold": self.threshold,
+        }
 
-    def print_report(self):
         print("\n=== Vector Quality Report ===")
         print(f"Total vectors processed: {self.total_vectors}")
         print(f"Empty vectors: {self.empty_vectors}")
         print(
-            f"Features
+            f"Features tracked ({self.match_partition}): {len(tracked_features)}"
+        )
+        if self.match_partition == "full":
+            print(f"Partitions observed: {len(self.discovered_partitions)}")
+
+        if not self.total_vectors:
+            print("(no vectors analyzed)")
+            summary.update(
+                {
+                    "feature_stats": [],
+                    "partition_stats": [],
+                    "below_features": [],
+                    "keep_features": [],
+                    "below_partitions": [],
+                    "keep_partitions": [],
+                    "below_suffixes": [],
+                    "keep_suffixes": [],
+                }
+            )
+            return summary
+
+        feature_stats = []
+        print("\n→ Feature coverage (sorted by missing count):")
+        for feature_id in sorted(
+            tracked_features,
+            key=lambda fid: self._coverage(fid)[1],
+            reverse=True,
+        ):
+            present, missing, opportunities = self._coverage(feature_id)
+            coverage = present / opportunities if opportunities else 0.0
+            nulls = self._feature_null_count(feature_id)
+            raw_samples = self.missing_samples.get(feature_id, [])
+            sample_note = self._format_samples(raw_samples)
+            samples = [
+                {
+                    "group": self._format_group_key(group_key),
+                    "status": status,
+                }
+                for group_key, status in raw_samples
+            ]
+            line = (
+                f" - {feature_id}: present {present}/{opportunities}"
+                f" ({coverage:.1%}) | missing {missing} | null {nulls}"
+            )
+            if sample_note:
+                line += f"; samples: {sample_note}"
+            print(line)
+            feature_stats.append(
+                {
+                    "id": feature_id,
+                    "present": present,
+                    "missing": missing,
+                    "nulls": nulls,
+                    "coverage": coverage,
+                    "opportunities": opportunities,
+                    "samples": samples,
+                }
+            )
+
+        summary["feature_stats"] = feature_stats
+
+        partition_stats = []
+        if tracked_partitions:
+            for partition_id in tracked_partitions:
+                present, missing, opportunities = self._coverage(
+                    partition_id, partitions=True
+                )
+                coverage = present / opportunities if opportunities else 0.0
+                nulls = self.null_counts_partitions.get(partition_id, 0)
+                raw_samples = self.missing_partition_samples.get(
+                    partition_id, [])
+                partition_stats.append(
+                    {
+                        "id": partition_id,
+                        "base": _base_feature_id(partition_id),
+                        "present": present,
+                        "missing": missing,
+                        "nulls": nulls,
+                        "coverage": coverage,
+                        "opportunities": opportunities,
+                        "samples": [
+                            {
+                                "group": self._format_group_key(group_key),
+                                "status": status,
+                            }
+                            for group_key, status in raw_samples
+                        ],
+                    }
+                )
+
+            print("\n→ Partition details (top by missing count):")
+            for stats in sorted(partition_stats, key=lambda s: s["missing"], reverse=True)[
+                :20
+            ]:
+                line = (
+                    f" - {stats['id']} (base: {stats['base']}): present {stats['present']}/{stats['opportunities']}"
+                    f" ({stats['coverage']:.1%}) | missing {stats['missing']} | null/invalid {stats['nulls']}"
+                )
+                print(line)
+
+        summary["partition_stats"] = partition_stats
+
+        below_features: list[str] = []
+        above_features: list[str] = []
+        below_partitions: list[str] = []
+        above_partitions: list[str] = []
+
+        if self.threshold is not None:
+            thr = self.threshold
+            below_features = [
+                stats["id"] for stats in feature_stats if stats["coverage"] < thr
+            ]
+            above_features = [
+                stats["id"] for stats in feature_stats if stats["coverage"] >= thr
+            ]
+            print(
+                f"\n📉 Features below {thr:.0%} coverage:\n below_features = {below_features}"
+            )
+            print(
+                f"📈 Features at/above {thr:.0%} coverage:\n keep_features = {above_features}"
+            )
+
+            if partition_stats:
+                below_partitions = [
+                    stats["id"]
+                    for stats in partition_stats
+                    if stats["coverage"] < thr
+                ]
+                above_partitions = [
+                    stats["id"]
+                    for stats in partition_stats
+                    if stats["coverage"] >= thr
+                ]
+                below_suffixes = [
+                    self._partition_suffix(pid)
+                    for pid in below_partitions
+                ]
+                above_suffixes = [
+                    self._partition_suffix(pid)
+                    for pid in above_partitions
+                    if self._partition_suffix(pid) != pid
+                ]
+                if not above_partitions:
+                    above_suffixes = []
+                print(
+                    f"\n📉 Partitions below {thr:.0%} coverage:\n below_partitions = {below_partitions}"
+                )
+                print(f" below_suffixes = {below_suffixes}")
+                print(
+                    f"📈 Partitions at/above {thr:.0%} coverage:\n keep_partitions = {above_partitions}"
+                )
+                print(f" keep_suffixes = {above_suffixes}")
+
+            summary.update(
+                {
+                    "below_features": below_features,
+                    "keep_features": above_features,
+                    "below_partitions": below_partitions,
+                    "keep_partitions": above_partitions,
+                    "below_suffixes": below_suffixes,
+                    "keep_suffixes": above_suffixes,
+                }
+            )
+        else:
+            summary.update(
+                {
+                    "below_features": [],
+                    "keep_features": [stats["id"] for stats in feature_stats],
+                    "below_partitions": [],
+                    "keep_partitions": [stats["id"] for stats in partition_stats],
+                    "below_suffixes": [],
+                    "keep_suffixes": [
+                        self._partition_suffix(stats["id"])
+                        for stats in partition_stats
+                        if self._partition_suffix(stats["id"]) != stats["id"]
+                    ]
+                    if partition_stats
+                    else [],
+                }
+            )
+
+        if self.show_matrix:
+            feature_candidates = (
+                below_features
+                or [stats["id"] for stats in feature_stats if stats["missing"] > 0]
+                or [stats["id"] for stats in feature_stats]
+            )
+            selected_features = (
+                feature_candidates
+                if self.matrix_cols is None
+                else feature_candidates[: self.matrix_cols]
+            )
+            if selected_features:
+                self._render_matrix(features=selected_features)
+
+            if partition_stats:
+                partition_candidates = (
+                    below_partitions
+                    or [stats["id"] for stats in partition_stats if stats["missing"] > 0]
+                    or [stats["id"] for stats in partition_stats]
+                )
+                selected_partitions = (
+                    partition_candidates
+                    if self.matrix_cols is None
+                    else partition_candidates[: self.matrix_cols]
+                )
+                if selected_partitions:
+                    self._render_matrix(
+                        features=selected_partitions, partitions=True)
+
+        group_missing = [
+            (
+                group,
+                sum(
+                    1
+                    for fid in tracked_features
+                    if self.group_feature_status[group].get(fid, "absent") != "present"
+                ),
+            )
+            for group in self.group_feature_status
+        ]
+        group_missing = [item for item in group_missing if item[1] > 0]
+        if group_missing:
+            print("\n→ Time buckets with missing features:")
+            for group, count in sorted(group_missing, key=lambda item: item[1], reverse=True)[:10]:
+                print(
+                    f" - {self._format_group_key(group)}: {count} features missing")
+
+        if partition_stats:
+            partition_missing = [
+                (
+                    group,
+                    sum(
+                        1
+                        for pid in self.group_partition_status[group]
+                        if self.group_partition_status[group].get(pid, "absent") != "present"
+                    ),
+                )
+                for group in self.group_partition_status
+            ]
+            partition_missing = [
+                item for item in partition_missing if item[1] > 0]
+            if partition_missing:
+                print("\n→ Time buckets with missing partitions:")
+                for group, count in sorted(
+                    partition_missing, key=lambda item: item[1], reverse=True
+                )[:10]:
+                    print(
+                        f" - {self._format_group_key(group)}: {count} partitions missing")
+
+        if self.matrix_output:
+            self._export_matrix_data()
+
+        return summary
+
+    def _export_matrix_data(self) -> None:
+        if not self.matrix_output:
+            return
+
+        path = self.matrix_output
+        path.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            if self.matrix_format == "html":
+                self._write_matrix_html(path)
+            else:
+                self._write_matrix_csv(path)
+            print(f"\n📝 Saved availability matrix to {path}")
+        except OSError as exc:
+            print(f"\n⚠️ Failed to write availability matrix to {path}: {exc}")
+
+    def _collect_feature_ids(self) -> list[str]:
+        feature_ids: set[str] = set()
+        for statuses in self.group_feature_status.values():
+            feature_ids.update(statuses.keys())
+        return sorted(feature_ids)
+
+    def _collect_partition_ids(self) -> list[str]:
+        partition_ids: set[str] = set()
+        for statuses in self.group_partition_status.values():
+            partition_ids.update(statuses.keys())
+        return sorted(partition_ids)
+
+    def _collect_group_keys(self) -> list[Hashable]:
+        keys = set(self.group_feature_status.keys()) | set(
+            self.group_partition_status.keys()
+        )
+        return sorted(keys, key=self._group_sort_key)
+
+    def _write_matrix_csv(self, path: Path) -> None:
+        rows: list[tuple[str, str, str, str]] = []
+        for group, statuses in self.group_feature_status.items():
+            group_key = self._format_group_key(group)
+            for fid, status in statuses.items():
+                rows.append(("feature", fid, group_key, status))
+
+        for group, statuses in self.group_partition_status.items():
+            group_key = self._format_group_key(group)
+            for pid, status in statuses.items():
+                rows.append(("partition", pid, group_key, status))
+
+        with path.open("w", newline="", encoding="utf-8") as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(["kind", "identifier", "group_key", "status"])
+            writer.writerows(rows)
+
+    def _write_matrix_html(self, path: Path) -> None:
+        feature_ids = self._collect_feature_ids()
+        partition_ids = self._collect_partition_ids()
+        group_keys = self._collect_group_keys()
+
+        def render_table(title: str, identifiers: list[str], status_map: dict, sub_map: dict) -> str:
+            if not identifiers:
+                return f"<h2>{title}</h2><p>No data.</p>"
+
+            cell_class = {
+                "present": "status-present",
+                "null": "status-null",
+                "absent": "status-absent",
+            }
+
+            header = "".join(
+                f"<th>{id_}</th>" for id_ in identifiers
+            )
+            rows_html = []
+            for group in group_keys:
+                key_str = self._format_group_key(group)
+                statuses = status_map.get(group, {})
+                cells = []
+                for identifier in identifiers:
+                    status = statuses.get(identifier, "absent")
+                    cls = cell_class.get(status, "status-absent")
+                    # Render sub-cells when available to convey partial availability
+                    sub = sub_map.get(group, {}).get(identifier)
+                    if sub:
+                        parts = "".join(
+                            f"<span class='{cell_class.get(s, 'status-absent')}' title='{s}'> </span>"
+                            for s in sub
+                        )
+                        cells.append(
+                            f"<td title='{status}'><div class='sub'>{parts}</div></td>")
+                    else:
+                        symbol = self._symbol_for(status)
+                        cells.append(
+                            f"<td class='{cls}' title='{status}'>{symbol}</td>")
+                rows_html.append(
+                    f"<tr><th>{key_str}</th>{''.join(cells)}</tr>"
+                )
+
+            return (
+                f"<h2>{title}</h2>"
+                "<table class='heatmap'>"
+                f"<tr><th>Group</th>{header}</tr>"
+                f"{''.join(rows_html)}"
+                "</table>"
+            )
+
+        feature_table = render_table(
+            "Feature Availability",
+            feature_ids,
+            self.group_feature_status,
+            self.group_feature_sub,
+        )
+        partition_table = render_table(
+            "Partition Availability",
+            partition_ids,
+            self.group_partition_status,
+            self.group_partition_sub,
+        )
+
+        style = """
+        body { font-family: Arial, sans-serif; }
+        table.heatmap { border-collapse: collapse; margin-bottom: 2rem; }
+        .heatmap th, .heatmap td { border: 1px solid #ccc; padding: 4px 6px; }
+        .heatmap th { background: #f0f0f0; position: sticky; top: 0; }
+        .status-present { background: #2ecc71; color: #fff; }
+        .status-null { background: #f1c40f; color: #000; }
+        .status-absent { background: #e74c3c; color: #fff; }
+        .sub { display: flex; gap: 1px; height: 12px; }
+        .sub span { flex: 1; display: block; }
+        """
 
-
-
-
+        html = (
+            "<html><head><meta charset='utf-8'>"
+            f"<style>{style}</style>"
+            "<title>Feature Availability</title></head><body>"
+            f"<h1>Availability Matrix</h1>{feature_table}{partition_table}"
+            "</body></html>"
+        )
 
-
-
-        for group_key, missing in list(self.per_group_missing.items())[:5]:
-            print(f"  - Group {group_key}: {sorted(missing)}")
+        with path.open("w", encoding="utf-8") as fh:
+            fh.write(html)
```