jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +1 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +3 -3
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +74 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.0.dist-info/METADATA +825 -0
- jerry_thomas-1.0.0.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
--- a/datapipeline/analysis/vector/collector.py
+++ b/datapipeline/analysis/vector/collector.py
@@ -27,6 +27,7 @@ class VectorStatsCollector:
         expected_feature_ids: Iterable[str] | None = None,
         *,
         match_partition: Literal["base", "full"] = "base",
+        schema_meta: dict[str, dict[str, Any]] | None = None,
         sample_limit: int = 5,
         threshold: float | None = 0.95,
         show_matrix: bool = False,
@@ -48,6 +49,7 @@ class VectorStatsCollector:
             if expected_feature_ids
             else set()
         )
+        self.schema_meta = schema_meta or {}

         self.discovered_features: set[str] = set()
         self.discovered_partitions: set[str] = set()
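The new `schema_meta` mapping is what drives the cadence accounting added below: judging from the `_expected_lengths` helper later in this diff, entries are keyed by feature or partition id and may carry `cadence.target`, `list_length.modes`/`list_length.max`, or `expected_length`. A minimal construction sketch (hypothetical ids and metadata; the remaining constructor arguments are assumed to keep their defaults):

    from datapipeline.analysis.vector.collector import VectorStatsCollector

    schema_meta = {
        # "price" is a list feature expected to carry 4 elements per vector
        "price": {"cadence": {"target": 4}},
        # "volume" is a plain scalar
        "volume": {"expected_length": 1},
    }
    collector = VectorStatsCollector(
        expected_feature_ids=["price", "volume"],
        schema_meta=schema_meta,
    )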
@@ -55,9 +57,14 @@ class VectorStatsCollector:
         self.total_vectors = 0
         self.empty_vectors = 0

-        self.
-        self.
+        self.seen_counts = Counter()
+        self.null_counts_features = Counter()
+        self.seen_counts_partitions = Counter()
         self.null_counts_partitions = Counter()
+        self.cadence_null_counts = Counter()
+        self.cadence_opportunities = Counter()
+        self.cadence_null_counts_partitions = Counter()
+        self.cadence_opportunities_partitions = Counter()

         self.missing_samples = defaultdict(list)
         self.missing_partition_samples = defaultdict(list)
@@ -107,6 +114,8 @@ class VectorStatsCollector:

         present_normalized: set[str] = set()
         seen_partitions: set[str] = set()
+        feature_seen_present: dict[str, bool] = {}
+        feature_seen_null: dict[str, bool] = {}
         for partition_id in present_partitions:
             normalized = self._normalize(partition_id)
             present_normalized.add(normalized)
@@ -122,12 +131,14 @@ class VectorStatsCollector:

             # Capture sub-status for list-valued entries
             sub: list[str] | None = None
+            has_present_element = False
             if isinstance(value, list):
                 sub = []
                 for v in value:
                     if v is None or (isinstance(v, float) and v != v):
                         sub.append("null")
                     else:
+                        has_present_element = True
                         sub.append("present")
             if sub:
                 self.group_partition_sub[group_key][partition_id] = sub
@@ -135,10 +146,10 @@ class VectorStatsCollector:
                 self.group_feature_sub[group_key].setdefault(
                     normalized, sub)

-            is_null = _is_missing_value(value)
+            is_null = (not has_present_element) if isinstance(value, list) else _is_missing_value(value)
             if is_null:
-                status_features[normalized] = "null"
                 status_partitions[partition_id] = "null"
+                feature_seen_null[normalized] = True
                 self.null_counts_partitions[partition_id] += 1
                 if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
                     self.missing_partition_samples[partition_id].append(
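With `has_present_element`, a list now counts as null only when every element is missing, where the element test above treats `None` and NaN as missing; previously `_is_missing_value(value)` judged the list as a whole. A standalone re-implementation of the new predicate, for illustration only:

    import math

    def list_is_null(value: list) -> bool:
        # Null only if no element is present (mirrors has_present_element).
        return not any(
            v is not None and not (isinstance(v, float) and math.isnan(v))
            for v in value
        )

    assert list_is_null([None, float("nan")]) is True
    assert list_is_null([None, 1.0]) is False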
@@ -147,14 +158,30 @@ class VectorStatsCollector:
                 if len(self.missing_samples[normalized]) < self.sample_limit:
                     self.missing_samples[normalized].append(
                         (group_key, "null"))
+            else:
+                feature_seen_present[normalized] = True
+
+            # Cadence-aware null accounting (per schema metadata)
+            meta = self.schema_meta.get(normalized) or self.schema_meta.get(partition_id)
+            expected_len = self._cadence_expected_length(meta) if meta else None
+            if expected_len is not None:
+                self._update_cadence(normalized, expected_len, value, partitions=False)
+                self._update_cadence(partition_id, expected_len, value, partitions=True)

         for normalized in present_normalized:
-            if
-
+            if feature_seen_present.get(normalized):
+                status_features[normalized] = "present"
+                # Drop stale null samples when the feature is ultimately present
+                self.missing_samples.pop(normalized, None)
+            elif feature_seen_null.get(normalized):
+                status_features[normalized] = "null"
+                self.null_counts_features[normalized] += 1
+            # Count availability (seen) regardless of value
+            self.seen_counts[normalized] += 1

         for partition_id in seen_partitions:
-
-
+            # Availability regardless of value
+            self.seen_counts_partitions[partition_id] += 1

         tracked_features = (
             self.expected_features if self.expected_features else self.discovered_features
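The `_update_cadence` calls wired in here (the method itself is added further down this diff) count, per vector, how many of the `expected_len` element slots actually hold a value. A worked sketch of the arithmetic, assuming `_is_missing_value` treats `None` and NaN as missing, consistent with the element test above:

    expected_len = 4
    value = [1.0, None, float("nan"), 2.0]

    present = sum(
        0 if (v is None or (isinstance(v, float) and v != v)) else 1
        for v in value[:expected_len]
    )
    missing = max(expected_len - present, 0)
    assert (present, missing) == (2, 2)
    # cadence_opportunities[id] += 4; cadence_null_counts[id] += 2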
@@ -186,20 +213,16 @@ class VectorStatsCollector:
         self, identifier: str, *, partitions: bool = False
     ) -> tuple[int, int, int]:
         present = (
-            self.
+            self.seen_counts_partitions[identifier]
             if partitions
-            else self.
+            else self.seen_counts[identifier]
         )
         opportunities = self.total_vectors
         missing = max(opportunities - present, 0)
         return present, missing, opportunities

     def _feature_null_count(self, feature_id: str) -> int:
-        total = 0
-        for partition_id, count in self.null_counts_partitions.items():
-            if self._normalize(partition_id) == feature_id:
-                total += count
-        return total
+        return self.null_counts_features.get(feature_id, 0)

     @staticmethod
     def _format_group_key(group_key: Hashable) -> str:
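`_feature_null_count` no longer re-scans `null_counts_partitions` on every call; it reads the `null_counts_features` counter maintained in `update`, so the lookup is O(1) and never-seen features naturally report zero:

    from collections import Counter

    null_counts_features = Counter({"price": 3})
    assert null_counts_features.get("price", 0) == 3
    assert null_counts_features.get("never_seen", 0) == 0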
@@ -230,6 +253,86 @@ class VectorStatsCollector:
     def _partition_suffix(partition_id: str) -> str:
         return partition_id.split("__", 1)[1] if "__" in partition_id else partition_id

+    @staticmethod
+    def _partition_values(partition_id: str) -> list[str]:
+        """Return partition values without base id or field names."""
+        suffix = partition_id.split("__", 1)[1] if "__" in partition_id else partition_id
+        if not suffix:
+            return []
+
+        def _components(raw: str) -> list[str]:
+            if raw.startswith("@"):
+                parts = raw.split("_@")
+                return [parts[0]] + [f"@{rest}" for rest in parts[1:]]
+            return [raw]
+
+        values: list[str] = []
+        for component in _components(suffix):
+            field_value = component.lstrip("@")
+            _, _, value = field_value.partition(":")
+            candidate = value or field_value
+            # If no explicit value delimiter, drop leading field name-ish prefixes
+            if not value and "_" in candidate:
+                candidate = candidate.rsplit("_", 1)[-1]
+            values.append(candidate)
+        return values
+
+    @classmethod
+    def _partition_value(cls, partition_id: str) -> str:
+        values = cls._partition_values(partition_id)
+        if not values:
+            return ""
+        return values[0] if len(values) == 1 else "_".join(values)
+
+    @staticmethod
+    def _expected_lengths(meta: dict[str, Any]) -> list[int]:
+        cadence = meta.get("cadence")
+        if isinstance(cadence, dict):
+            target = cadence.get("target")
+            if isinstance(target, (int, float)) and target > 0:
+                return [int(target)]
+        modes = meta.get("list_length", {}).get("modes")
+        if isinstance(modes, (list, tuple)) and modes:
+            ints = [int(m) for m in modes if isinstance(m, (int, float))]
+            if ints:
+                return sorted(ints)
+        expected = meta.get("expected_length")
+        if isinstance(expected, (int, float)):
+            return [int(expected)]
+        max_len = meta.get("list_length", {}).get("max")
+        if isinstance(max_len, (int, float)) and max_len > 0:
+            return [int(max_len)]
+        return []
+
+    @staticmethod
+    def _cadence_expected_length(meta: dict[str, Any]) -> int | None:
+        lengths = VectorStatsCollector._expected_lengths(meta)
+        return max(lengths) if lengths else None
+
+    def _update_cadence(
+        self, identifier: str, expected_len: int | None, value: Any, *, partitions: bool
+    ) -> None:
+        if expected_len is None:
+            return
+        counter_nulls = (
+            self.cadence_null_counts_partitions if partitions else self.cadence_null_counts
+        )
+        counter_opps = (
+            self.cadence_opportunities_partitions
+            if partitions
+            else self.cadence_opportunities
+        )
+
+        present = 0
+        if isinstance(value, list):
+            trimmed = value[:expected_len]
+            present = sum(0 if _is_missing_value(v) else 1 for v in trimmed)
+        else:
+            present = 0 if _is_missing_value(value) else 1
+        missing = max(expected_len - present, 0)
+        counter_opps[identifier] += expected_len
+        counter_nulls[identifier] += missing
+
     def _render_matrix(
         self,
         *,
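Tracing the new partition-id helpers by hand (the ids below are hypothetical; the `base__suffix` and `@field:value` shapes follow the parsing above). Note also that `_expected_lengths` resolves in precedence order: `cadence.target`, then `list_length.modes`, then `expected_length`, then `list_length.max`:

    # "__" separates the base id from the partition suffix; "@field:value"
    # components are split on "_@" and reduced to their values.
    VectorStatsCollector._partition_value("price__@exchange:nyse_@currency:usd")
    # -> "nyse_usd"

    # Without a ":" delimiter, the trailing "_" segment is kept, dropping
    # field-name-ish prefixes.
    VectorStatsCollector._partition_value("ticks__hour_sin")
    # -> "sin"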
@@ -246,10 +349,10 @@ class VectorStatsCollector:
             column_width=column_width,
         )

-    def print_report(self) -> dict[str, Any]:
+    def print_report(self, *, sort_key: str = "missing") -> dict[str, Any]:
         from .report import print_report as _print_report

-        return _print_report(self)
+        return _print_report(self, sort_key=sort_key)

     def _export_matrix_data(self) -> None:
         from .matrix import export_matrix_data
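`print_report` now forwards a `sort_key`; per the `Literal` added in report.py below, the accepted values are "missing" (the default, ordering by absent count) and "nulls". A usage sketch, assuming other constructor arguments keep their defaults and vectors have already been fed through the collector's update path:

    collector = VectorStatsCollector(expected_feature_ids=["price"])
    # ... feed vectors ...
    summary = collector.print_report(sort_key="nulls")  # order tables by null count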
--- a/datapipeline/analysis/vector/matrix.py
+++ b/datapipeline/analysis/vector/matrix.py
@@ -81,11 +81,11 @@ def export_matrix_data(collector: VectorStatsCollector) -> None:
             _write_matrix_html(collector, path)
         else:
             _write_matrix_csv(collector, path)
-
+        message = f"[write] Saved availability matrix to {path}"
+        logger.info("\n%s", message)
     except OSError as exc:
-
-
-        )
+        warning = f"[warn] Failed to write availability matrix to {path}: {exc}"
+        logger.warning("\n%s", warning)


 def _write_matrix_csv(collector: VectorStatsCollector, path: Path) -> None:
@@ -341,10 +341,11 @@ def _write_matrix_html(collector: VectorStatsCollector, path: Path) -> None:
     .heatmap th,
     .heatmap td {
       border: 1px solid #d0d0d0;
-      padding:
+      padding: 0 5px;
       text-align: center;
       font-size: 13px;
       line-height: 1.2;
+      vertical-align: middle;
     }
     .heatmap thead th {
       position: sticky;
@@ -369,9 +370,33 @@ def _write_matrix_html(collector: VectorStatsCollector, path: Path) -> None:
     .status-null { background: #f1c40f; color: #000; font-weight: bold; }
     .status-absent { background: #e74c3c; color: #fff; font-weight: bold; }
     .status-missing { background: #bdc3c7; color: #000; font-weight: bold; }
-    .sub {
-
-
+    .sub {
+      display: flex;
+      gap: 5px;
+      height: calc(100% - 2px);
+      min-height: 24px;
+      padding: 0 2px;
+      margin: 1px 0;
+      align-items: stretch;
+      justify-content: center;
+    }
+    .sub span {
+      flex: 1;
+      display: block;
+      position: relative;
+      border-radius: 4px;
+      overflow: hidden;
+      border: 1px solid rgba(0,0,0,0.15);
+      background: #fff;
+      min-width: 12px;
+    }
+    .sub span::after {
+      content: "";
+      position: absolute;
+      inset: 0;
+      display: block;
+      border-radius: 4px;
+    }
     .sub .status-present::after { background: #2ecc71; }
     .sub .status-null::after { background: #f1c40f; }
     .sub .status-absent::after { background: #e74c3c; }
--- a/datapipeline/analysis/vector/report.py
+++ b/datapipeline/analysis/vector/report.py
@@ -1,9 +1,8 @@
 from __future__ import annotations
-from typing import Any
+from typing import Any, Literal, TYPE_CHECKING
 import logging

 from .matrix import export_matrix_data, render_matrix
-from typing import TYPE_CHECKING

 if TYPE_CHECKING:
     from .collector import VectorStatsCollector
@@ -12,7 +11,11 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)


-def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
+def print_report(
+    collector: VectorStatsCollector,
+    *,
+    sort_key: Literal["missing", "nulls"] = "missing",
+) -> dict[str, Any]:
     tracked_features = (
         collector.expected_features
         if collector.expected_features
@@ -62,15 +65,25 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
         return summary

     feature_stats = []
-
+    sort_label = "null" if sort_key == "nulls" else "missing"
+    logger.info("\n-> Feature coverage (sorted by %s count):", sort_label)
+    if sort_key == "nulls":
+        def _feature_sort(fid):
+            return collector._feature_null_count(fid)
+    else:
+        def _feature_sort(fid):
+            return collector._coverage(fid)[1]
+
     for feature_id in sorted(
         tracked_features,
-        key=
+        key=_feature_sort,
         reverse=True,
     ):
         present, missing, opportunities = collector._coverage(feature_id)
         coverage = present / opportunities if opportunities else 0.0
         nulls = collector._feature_null_count(feature_id)
+        cadence_nulls = collector.cadence_null_counts.get(feature_id, 0)
+        cadence_opps = collector.cadence_opportunities.get(feature_id, 0)
         raw_samples = collector.missing_samples.get(feature_id, [])
         sample_note = collector._format_samples(raw_samples)
         samples = [
@@ -82,7 +95,7 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
         ]
         line = (
             f" - {feature_id}: present {present}/{opportunities}"
-            f" ({coverage:.1%}) |
+            f" ({coverage:.1%}) | absent {missing} | null {nulls}"
         )
         if sample_note:
             line += f"; samples: {sample_note}"
@@ -93,6 +106,11 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
                 "present": present,
                 "missing": missing,
                 "nulls": nulls,
+                "cadence_nulls": cadence_nulls,
+                "cadence_opportunities": cadence_opps,
+                "cadence_null_fraction": (
+                    cadence_nulls / cadence_opps if cadence_opps else None
+                ),
                 "coverage": coverage,
                 "opportunities": opportunities,
                 "samples": samples,
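`cadence_null_fraction` is the share of expected element slots that were null or invalid, guarded against zero opportunities. For example, 12 null elements across 96 expected slots:

    cadence_nulls, cadence_opps = 12, 96
    fraction = cadence_nulls / cadence_opps if cadence_opps else None
    assert fraction == 0.125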
@@ -109,6 +127,12 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
         )
         coverage = present / opportunities if opportunities else 0.0
         nulls = collector.null_counts_partitions.get(partition_id, 0)
+        cadence_nulls = collector.cadence_null_counts_partitions.get(
+            partition_id, 0
+        )
+        cadence_opps = collector.cadence_opportunities_partitions.get(
+            partition_id, 0
+        )
         raw_samples = collector.missing_partition_samples.get(
             partition_id, [])
         partition_stats.append(
@@ -118,6 +142,11 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
                 "present": present,
                 "missing": missing,
                 "nulls": nulls,
+                "cadence_nulls": cadence_nulls,
+                "cadence_opportunities": cadence_opps,
+                "cadence_null_fraction": (
+                    cadence_nulls / cadence_opps if cadence_opps else None
+                ),
                 "coverage": coverage,
                 "opportunities": opportunities,
                 "samples": [
@@ -130,13 +159,16 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
             }
         )

-
+    sort_label_partitions = "null" if sort_key == "nulls" else "absent"
+    logger.info("\n-> Partition details (top by %s count):", sort_label_partitions)
+    def _partition_sort(stats):
+        return stats["nulls"] if sort_key == "nulls" else stats["missing"]
     for stats in sorted(
-        partition_stats, key=
+        partition_stats, key=_partition_sort, reverse=True
     )[:20]:
         line = (
             f" - {stats['id']} (base: {stats['base']}): present {stats['present']}/{stats['opportunities']}"
-            f" ({stats['coverage']:.1%}) |
+            f" ({stats['coverage']:.1%}) | absent {stats['missing']} | null/invalid {stats['nulls']}"
         )
         logger.info(line)

@@ -148,6 +180,10 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
     above_partitions: list[str] = []
     below_suffixes: list[str] = []
     above_suffixes: list[str] = []
+    below_partition_values: list[str] = []
+    above_partition_values: list[str] = []
+    below_partitions_cadence: list[str] = []
+    above_partitions_cadence: list[str] = []

     if collector.threshold is not None:
         thr = collector.threshold
@@ -157,17 +193,6 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
         above_features = [
             stats["id"] for stats in feature_stats if stats["coverage"] >= thr
         ]
-        logger.warning(
-            "\n[low] Features below %.0f%% coverage:\n below_features = %s",
-            thr * 100,
-            below_features,
-        )
-        logger.info(
-            "[high] Features at/above %.0f%% coverage:\n keep_features = %s",
-            thr * 100,
-            above_features,
-        )
-
         if partition_stats:
             below_partitions = [
                 stats["id"] for stats in partition_stats if stats["coverage"] < thr
@@ -185,18 +210,26 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
             ]
             if not above_partitions:
                 above_suffixes = []
-
-
-
-
-
-
-
-
-
-
-
-
+            below_partition_values = [
+                v
+                for pid in below_partitions
+                if "__" in pid and (v := collector._partition_value(pid))
+            ]
+            above_partition_values = [
+                v
+                for pid in above_partitions
+                if "__" in pid and (v := collector._partition_value(pid))
+            ]
+            below_partitions_cadence = [
+                stats["id"]
+                for stats in partition_stats
+                if (stats.get("cadence_null_fraction") or 0) > (1 - thr)
+            ]
+            above_partitions_cadence = [
+                stats["id"]
+                for stats in partition_stats
+                if (stats.get("cadence_null_fraction") or 0) <= (1 - thr)
+            ]

     summary.update(
         {
@@ -216,6 +249,21 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
                 if partition_stats
                 else []
             ),
+            "below_partition_values": below_partition_values,
+            "keep_partition_values": above_partition_values
+            or (
+                [
+                    collector._partition_value(stats["id"])
+                    for stats in partition_stats
+                    if "__" in stats["id"]
+                    and collector._partition_value(stats["id"])
+                ]
+                if partition_stats
+                else []
+            ),
+            "below_partitions_cadence": below_partitions_cadence,
+            "keep_partitions_cadence": above_partitions_cadence
+            or [stats["id"] for stats in partition_stats],
         }
     )

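Note the `or` fallback on the keep lists: if nothing clears the threshold, `keep_partition_values` degrades to every parsed partition value (and `keep_partitions_cadence` to every partition id) instead of an empty list, so a downstream filter built from the summary never keeps nothing. A compressed illustration with hypothetical values:

    above_partition_values = []            # nothing met the threshold
    all_values = ["nyse", "lse"]           # parsed from all partitions
    keep = above_partition_values or all_values
    assert keep == ["nyse", "lse"]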
@@ -310,6 +358,88 @@ def print_report(collector: VectorStatsCollector) -> dict[str, Any]:
     if collector.matrix_output:
         export_matrix_data(collector)

+    # Record-level (cadence) gaps for list features/partitions
+    partition_cadence = [
+        stats
+        for stats in partition_stats
+        if stats.get("cadence_opportunities")
+    ]
+    if partition_cadence:
+        logger.info("\n-> Record-level gaps (expected cadence; null/invalid elements):")
+        total_missing = sum(s.get("cadence_nulls", 0) or 0 for s in partition_cadence)
+        total_opps = sum(s.get("cadence_opportunities", 0) or 0 for s in partition_cadence)
+        if total_opps:
+            logger.info(
+                " Total null/invalid elements: %d/%d (%.1f%%)",
+                total_missing,
+                total_opps,
+                (total_missing / total_opps) * 100,
+            )
+        logger.info(" Top partitions by null/invalid elements:")
+        for stats in sorted(
+            partition_cadence,
+            key=lambda s: (s.get("cadence_nulls") or 0),
+            reverse=True,
+        )[:20]:
+            missing_elems = stats.get("cadence_nulls") or 0
+            opps = stats.get("cadence_opportunities") or 0
+            frac = (missing_elems / opps) if opps else 0
+            logger.info(
+                " - %s (base: %s): vectors present %d/%d | absent %d | cadence null/invalid %d/%d elements (%.1f%%)",
+                stats["id"],
+                stats.get("base"),
+                stats.get("present", 0),
+                stats.get("opportunities", 0),
+                stats.get("missing", 0),
+                missing_elems,
+                opps,
+                frac * 100,
+            )
+
+    if collector.threshold is not None:
+        thr = collector.threshold
+        logger.warning(
+            "\n[low] Features below %.0f%% coverage:\n below_features = %s",
+            thr * 100,
+            below_features,
+        )
+        logger.info(
+            "[high] Features at/above %.0f%% coverage:\n keep_features = %s",
+            thr * 100,
+            above_features,
+        )
+        if partition_stats:
+            logger.warning(
+                "\n[low] Partitions below %.0f%% coverage:\n below_partitions = %s",
+                thr * 100,
+                below_partitions,
+            )
+            logger.warning(" below_suffixes = %s", below_suffixes)
+            if below_partition_values:
+                logger.warning(" below_partition_values = %s",
+                               below_partition_values)
+            logger.info(
+                "[high] Partitions at/above %.0f%% coverage:\n keep_partitions = %s",
+                thr * 100,
+                above_partitions,
+            )
+            logger.info(" keep_suffixes = %s", above_suffixes)
+            if above_partition_values:
+                logger.info(
+                    " keep_partition_values = %s", above_partition_values)
+            if below_partitions_cadence:
+                logger.warning(
+                    "[low] Partitions below %.0f%% cadence fill:\n below_partitions_cadence = %s",
+                    thr * 100,
+                    below_partitions_cadence,
+                )
+            if above_partitions_cadence:
+                logger.info(
+                    "[high] Partitions at/above %.0f%% cadence fill:\n keep_partitions_cadence = %s",
+                    thr * 100,
+                    above_partitions_cadence,
+                )
+
     return summary

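The threshold logging moves here, after the new record-level (cadence) section, and the summary exposes the same lists programmatically. A consumption sketch using the keys added in this hunk and earlier ones:

    summary = collector.print_report()
    # Partitions too sparse under their expected cadence:
    prune = set(summary.get("below_partitions_cadence", []))
    # Parsed values for partitions worth keeping, e.g. ["nyse", "lse"]:
    keep_values = summary.get("keep_partition_values", [])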
--- /dev/null
+++ b/datapipeline/build/tasks/__init__.py
@@ -0,0 +1,11 @@
+from .config import compute_config_hash
+from .schema import materialize_vector_schema
+from .metadata import materialize_metadata
+from .scaler import materialize_scaler_statistics
+
+__all__ = [
+    "compute_config_hash",
+    "materialize_vector_schema",
+    "materialize_metadata",
+    "materialize_scaler_statistics",
+]
--- /dev/null
+++ b/datapipeline/build/tasks/config.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import hashlib
+from pathlib import Path
+from typing import Iterable
+
+from datapipeline.services.project_paths import read_project
+
+
+def _resolve_relative(project_yaml: Path, value: str) -> Path:
+    path = Path(value)
+    return path if path.is_absolute() else (project_yaml.parent / path)
+
+
+def _normalized_label(path: Path, base_dir: Path) -> str:
+    try:
+        return str(path.resolve().relative_to(base_dir))
+    except ValueError:
+        return str(path.resolve())
+
+
+def _hash_file(hasher, path: Path, base_dir: Path) -> None:
+    hasher.update(_normalized_label(path, base_dir).encode("utf-8"))
+    hasher.update(b"\0")
+    hasher.update(path.read_bytes())
+    hasher.update(b"\0")
+
+
+def _yaml_files(directory: Path) -> Iterable[Path]:
+    if not directory.exists():
+        return []
+    return sorted(p for p in directory.rglob("*.y*ml") if p.is_file())
+
+
+def compute_config_hash(project_yaml: Path, tasks_path: Path) -> str:
+    """Compute a deterministic hash across relevant config inputs."""
+
+    hasher = hashlib.sha256()
+    base_dir = project_yaml.parent.resolve()
+    cfg = read_project(project_yaml)
+
+    required = [
+        project_yaml.resolve(),
+        _resolve_relative(project_yaml, cfg.paths.dataset).resolve(),
+        _resolve_relative(project_yaml, cfg.paths.postprocess).resolve(),
+    ]
+
+    for path in required:
+        if not path.exists():
+            raise FileNotFoundError(f"Expected config file missing: {path}")
+        _hash_file(hasher, path, base_dir)
+
+    if not tasks_path.is_dir():
+        raise TypeError(
+            f"project.paths.tasks must point to a directory, got: {tasks_path}"
+        )
+    hasher.update(
+        f"[dir]{_normalized_label(tasks_path, base_dir)}".encode("utf-8")
+    )
+    for p in _yaml_files(tasks_path):
+        _hash_file(hasher, p, base_dir)
+
+    for dir_value in (cfg.paths.sources, cfg.paths.streams):
+        directory = _resolve_relative(project_yaml, dir_value)
+        hasher.update(
+            f"[dir]{_normalized_label(directory, base_dir)}".encode("utf-8")
+        )
+        if not directory.exists():
+            hasher.update(b"[missing]")
+            continue
+        for path in _yaml_files(directory):
+            _hash_file(hasher, path, base_dir)
+
+    return hasher.hexdigest()