jerry-thomas 0.2.0-py3-none-any.whl → 1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. datapipeline/analysis/vector/collector.py +378 -0
  2. datapipeline/analysis/vector/matrix.py +552 -0
  3. datapipeline/analysis/vector/report.py +447 -0
  4. datapipeline/analysis/vector_analyzer.py +3 -694
  5. datapipeline/build/__init__.py +6 -0
  6. datapipeline/build/state.py +52 -0
  7. datapipeline/build/tasks/__init__.py +11 -0
  8. datapipeline/build/tasks/config.py +74 -0
  9. datapipeline/build/tasks/metadata.py +170 -0
  10. datapipeline/build/tasks/scaler.py +73 -0
  11. datapipeline/build/tasks/schema.py +60 -0
  12. datapipeline/build/tasks/utils.py +169 -0
  13. datapipeline/cli/app.py +383 -137
  14. datapipeline/cli/commands/build.py +263 -0
  15. datapipeline/cli/commands/contract.py +367 -0
  16. datapipeline/cli/commands/domain.py +8 -3
  17. datapipeline/cli/commands/filter.py +1 -2
  18. datapipeline/cli/commands/inspect.py +421 -118
  19. datapipeline/cli/commands/list_.py +30 -7
  20. datapipeline/cli/commands/plugin.py +1 -1
  21. datapipeline/cli/commands/run.py +237 -127
  22. datapipeline/cli/commands/run_config.py +101 -0
  23. datapipeline/cli/commands/serve_pipeline.py +156 -0
  24. datapipeline/cli/commands/source.py +44 -8
  25. datapipeline/cli/visuals/__init__.py +16 -0
  26. datapipeline/cli/visuals/common.py +239 -0
  27. datapipeline/cli/visuals/labels.py +75 -0
  28. datapipeline/cli/visuals/runner.py +66 -0
  29. datapipeline/cli/visuals/sections.py +20 -0
  30. datapipeline/cli/visuals/sources.py +151 -0
  31. datapipeline/cli/visuals/sources_basic.py +260 -0
  32. datapipeline/cli/visuals/sources_off.py +76 -0
  33. datapipeline/cli/visuals/sources_rich.py +414 -0
  34. datapipeline/config/catalog.py +37 -3
  35. datapipeline/config/context.py +214 -0
  36. datapipeline/config/dataset/dataset.py +1 -2
  37. datapipeline/config/dataset/loader.py +20 -83
  38. datapipeline/config/dataset/normalize.py +4 -4
  39. datapipeline/config/metadata.py +43 -0
  40. datapipeline/config/postprocess.py +14 -0
  41. datapipeline/config/project.py +14 -1
  42. datapipeline/config/resolution.py +129 -0
  43. datapipeline/config/split.py +35 -0
  44. datapipeline/config/tasks.py +309 -0
  45. datapipeline/config/workspace.py +155 -0
  46. datapipeline/domain/__init__.py +12 -0
  47. datapipeline/domain/record.py +11 -0
  48. datapipeline/domain/sample.py +54 -0
  49. datapipeline/domain/vector.py +0 -9
  50. datapipeline/filters/filters.py +1 -1
  51. datapipeline/integrations/ml/__init__.py +16 -0
  52. datapipeline/integrations/ml/adapter.py +134 -0
  53. datapipeline/integrations/ml/pandas_support.py +44 -0
  54. datapipeline/integrations/ml/rows.py +77 -0
  55. datapipeline/integrations/ml/torch_support.py +92 -0
  56. datapipeline/io/factory.py +112 -0
  57. datapipeline/io/output.py +132 -0
  58. datapipeline/io/protocols.py +21 -0
  59. datapipeline/io/serializers.py +219 -0
  60. datapipeline/io/sinks/__init__.py +23 -0
  61. datapipeline/io/sinks/base.py +2 -0
  62. datapipeline/io/sinks/files.py +79 -0
  63. datapipeline/io/sinks/rich.py +57 -0
  64. datapipeline/io/sinks/stdout.py +18 -0
  65. datapipeline/io/writers/__init__.py +14 -0
  66. datapipeline/io/writers/base.py +28 -0
  67. datapipeline/io/writers/csv_writer.py +25 -0
  68. datapipeline/io/writers/jsonl.py +52 -0
  69. datapipeline/io/writers/pickle_writer.py +30 -0
  70. datapipeline/pipeline/artifacts.py +58 -0
  71. datapipeline/pipeline/context.py +128 -0
  72. datapipeline/pipeline/observability.py +65 -0
  73. datapipeline/pipeline/pipelines.py +76 -26
  74. datapipeline/pipeline/split.py +172 -0
  75. datapipeline/pipeline/stages.py +175 -25
  76. datapipeline/pipeline/utils/keygen.py +22 -9
  77. datapipeline/pipeline/utils/memory_sort.py +22 -10
  78. datapipeline/pipeline/utils/transform_utils.py +86 -23
  79. datapipeline/plugins.py +1 -1
  80. datapipeline/runtime.py +76 -0
  81. datapipeline/services/artifacts.py +102 -0
  82. datapipeline/services/bootstrap/__init__.py +12 -0
  83. datapipeline/services/bootstrap/config.py +166 -0
  84. datapipeline/services/bootstrap/core.py +201 -0
  85. datapipeline/services/constants.py +9 -3
  86. datapipeline/services/entrypoints.py +1 -1
  87. datapipeline/services/factories.py +128 -3
  88. datapipeline/services/paths.py +1 -1
  89. datapipeline/services/project_paths.py +50 -2
  90. datapipeline/services/runs.py +208 -0
  91. datapipeline/services/scaffold/domain.py +3 -3
  92. datapipeline/services/scaffold/filter.py +3 -3
  93. datapipeline/services/scaffold/mappers.py +9 -6
  94. datapipeline/services/scaffold/plugin.py +32 -6
  95. datapipeline/services/scaffold/source.py +93 -58
  96. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  97. datapipeline/sources/decoders.py +83 -18
  98. datapipeline/sources/factory.py +26 -16
  99. datapipeline/sources/models/__init__.py +2 -2
  100. datapipeline/sources/models/generator.py +0 -3
  101. datapipeline/sources/models/loader.py +3 -6
  102. datapipeline/sources/models/parsing_error.py +24 -0
  103. datapipeline/sources/models/source.py +6 -6
  104. datapipeline/sources/models/synthetic.py +1 -1
  105. datapipeline/sources/synthetic/time/loader.py +21 -1
  106. datapipeline/sources/transports.py +74 -37
  107. datapipeline/templates/plugin_skeleton/README.md +107 -18
  108. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  109. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  110. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  111. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  112. datapipeline/templates/plugin_skeleton/example/project.yaml +23 -0
  113. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  114. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  115. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  116. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  117. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  118. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  119. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  120. datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
  121. datapipeline/templates/plugin_skeleton/pyproject.toml +2 -2
  122. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  123. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  124. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  125. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  126. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  127. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  128. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  129. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  130. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  131. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  132. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  133. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  134. datapipeline/templates/stubs/dto.py.j2 +4 -0
  135. datapipeline/templates/stubs/mapper.py.j2 +10 -7
  136. datapipeline/templates/stubs/parser.py.j2 +3 -0
  137. datapipeline/templates/stubs/record.py.j2 +2 -0
  138. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  139. datapipeline/transforms/debug/lint.py +26 -41
  140. datapipeline/transforms/feature/scaler.py +203 -62
  141. datapipeline/transforms/filter.py +5 -2
  142. datapipeline/transforms/record/floor_time.py +4 -4
  143. datapipeline/transforms/sequence.py +2 -35
  144. datapipeline/transforms/stream/dedupe.py +24 -0
  145. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  146. datapipeline/transforms/stream/fill.py +3 -25
  147. datapipeline/transforms/utils.py +16 -0
  148. datapipeline/transforms/vector/__init__.py +5 -0
  149. datapipeline/transforms/vector/common.py +98 -0
  150. datapipeline/transforms/vector/drop/__init__.py +4 -0
  151. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  152. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  153. datapipeline/transforms/vector/drop/vertical.py +182 -0
  154. datapipeline/transforms/vector/ensure_schema.py +184 -0
  155. datapipeline/transforms/vector/fill.py +87 -0
  156. datapipeline/transforms/vector/replace.py +62 -0
  157. datapipeline/transforms/vector_utils.py +19 -67
  158. datapipeline/utils/load.py +26 -5
  159. datapipeline/utils/pickle_model.py +30 -0
  160. datapipeline/utils/placeholders.py +35 -0
  161. datapipeline/utils/rich_compat.py +38 -0
  162. datapipeline/utils/window.py +76 -0
  163. jerry_thomas-1.0.0.dist-info/METADATA +825 -0
  164. jerry_thomas-1.0.0.dist-info/RECORD +199 -0
  165. {jerry_thomas-0.2.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
  166. datapipeline/cli/commands/link.py +0 -129
  167. datapipeline/cli/visual_source.py +0 -32
  168. datapipeline/cli/visuals.py +0 -64
  169. datapipeline/common/__init__.py +0 -0
  170. datapipeline/common/geo.py +0 -13
  171. datapipeline/integrations/ml.py +0 -319
  172. datapipeline/registries/registries.py +0 -15
  173. datapipeline/services/bootstrap.py +0 -191
  174. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml +0 -24
  175. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml +0 -23
  176. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -29
  177. datapipeline/templates/plugin_skeleton/config/datasets/default/project.yaml +0 -8
  178. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  179. datapipeline/transforms/vector.py +0 -226
  180. jerry_thomas-0.2.0.dist-info/METADATA +0 -402
  181. jerry_thomas-0.2.0.dist-info/RECORD +0 -112
  182. {jerry_thomas-0.2.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
  183. {jerry_thomas-0.2.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
  184. {jerry_thomas-0.2.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/analysis/vector/collector.py
@@ -0,0 +1,378 @@
+ from __future__ import annotations
+ from collections import Counter, defaultdict
+ from typing import Any, Hashable, Iterable, Literal
+ from datapipeline.transforms.vector_utils import base_id as _base_id
+ from datetime import datetime
+ from pathlib import Path
+
+
+ def _base_feature_id(feature_id: str) -> str:
+     """Return the base feature id without partition suffix."""
+     return _base_id(feature_id)
+
+
+ def _is_missing_value(value: Any) -> bool:
+     if value is None:
+         return True
+     if isinstance(value, float):
+         return value != value  # NaN without numpy
+     return False
+
+
+ class VectorStatsCollector:
+     """Collect coverage statistics for feature vectors."""
+
+     def __init__(
+         self,
+         expected_feature_ids: Iterable[str] | None = None,
+         *,
+         match_partition: Literal["base", "full"] = "base",
+         schema_meta: dict[str, dict[str, Any]] | None = None,
+         sample_limit: int = 5,
+         threshold: float | None = 0.95,
+         show_matrix: bool = False,
+         matrix_rows: int = 20,
+         matrix_cols: int = 10,
+         matrix_output: str | None = None,
+         matrix_format: str = "html",
+     ) -> None:
+         self.match_partition = match_partition
+         self.threshold = threshold
+         self.show_matrix = show_matrix
+         self.matrix_rows = matrix_rows if matrix_rows and matrix_rows > 0 else None
+         self.matrix_cols = matrix_cols if matrix_cols and matrix_cols > 0 else None
+         self.matrix_output = Path(matrix_output) if matrix_output else None
+         self.matrix_format = matrix_format
+
+         self.expected_features = (
+             {self._normalize(fid) for fid in expected_feature_ids}
+             if expected_feature_ids
+             else set()
+         )
+         self.schema_meta = schema_meta or {}
+
+         self.discovered_features: set[str] = set()
+         self.discovered_partitions: set[str] = set()
+
+         self.total_vectors = 0
+         self.empty_vectors = 0
+
+         self.seen_counts = Counter()
+         self.null_counts_features = Counter()
+         self.seen_counts_partitions = Counter()
+         self.null_counts_partitions = Counter()
+         self.cadence_null_counts = Counter()
+         self.cadence_opportunities = Counter()
+         self.cadence_null_counts_partitions = Counter()
+         self.cadence_opportunities_partitions = Counter()
+
+         self.missing_samples = defaultdict(list)
+         self.missing_partition_samples = defaultdict(list)
+         self.sample_limit = sample_limit
+
+         self.group_feature_status = defaultdict(dict)
+         self.group_partition_status = defaultdict(dict)
+         # Optional per-cell sub-status for list-valued entries (finer resolution inside a bucket)
+         self.group_feature_sub: dict[Hashable,
+                                      dict[str, list[str]]] = defaultdict(dict)
+         self.group_partition_sub: dict[Hashable,
+                                        dict[str, list[str]]] = defaultdict(dict)
+
+     @staticmethod
+     def _group_sort_key(g: Hashable):
+         """Stable, chronological sort key for group keys.
+
+         Many pipelines use a 1-tuple containing a datetime as the group key.
+         Sorting by ``str(g)`` can produce lexicographic mis-ordering (e.g.,
+         hours "3" vs "21"). This helper prefers numeric datetime ordering and
+         falls back to string representation only when needed.
+         """
+         def norm(p: Any):
+             if isinstance(p, datetime):
+                 # Use POSIX timestamp for monotonic ordering
+                 return p.timestamp()
+             return p
+
+         if isinstance(g, (tuple, list)):
+             return tuple(norm(p) for p in g)
+         return norm(g)
+
+     def _normalize(self, feature_id: str) -> str:
+         if self.match_partition == "full":
+             return feature_id
+         return _base_feature_id(feature_id)
+
+     def update(self, group_key: Hashable, feature_vector: dict[str, Any]) -> None:
+         self.total_vectors += 1
+
+         present_partitions = set(feature_vector.keys())
+         if not present_partitions:
+             self.empty_vectors += 1
+
+         status_features = self.group_feature_status[group_key]
+         status_partitions = self.group_partition_status[group_key]
+
+         present_normalized: set[str] = set()
+         seen_partitions: set[str] = set()
+         feature_seen_present: dict[str, bool] = {}
+         feature_seen_null: dict[str, bool] = {}
+         for partition_id in present_partitions:
+             normalized = self._normalize(partition_id)
+             present_normalized.add(normalized)
+             seen_partitions.add(partition_id)
+
+             value = feature_vector[partition_id]
+
+             status_features.setdefault(normalized, "present")
+             status_partitions.setdefault(partition_id, "present")
+
+             self.discovered_features.add(normalized)
+             self.discovered_partitions.add(partition_id)
+
+             # Capture sub-status for list-valued entries
+             sub: list[str] | None = None
+             has_present_element = False
+             if isinstance(value, list):
+                 sub = []
+                 for v in value:
+                     if v is None or (isinstance(v, float) and v != v):
+                         sub.append("null")
+                     else:
+                         has_present_element = True
+                         sub.append("present")
+                 if sub:
+                     self.group_partition_sub[group_key][partition_id] = sub
+                     # Only store one sub per normalized id (first seen)
+                     self.group_feature_sub[group_key].setdefault(
+                         normalized, sub)
+
+             is_null = (not has_present_element) if isinstance(value, list) else _is_missing_value(value)
+             if is_null:
+                 status_partitions[partition_id] = "null"
+                 feature_seen_null[normalized] = True
+                 self.null_counts_partitions[partition_id] += 1
+                 if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
+                     self.missing_partition_samples[partition_id].append(
+                         (group_key, "null")
+                     )
+                 if len(self.missing_samples[normalized]) < self.sample_limit:
+                     self.missing_samples[normalized].append(
+                         (group_key, "null"))
+             else:
+                 feature_seen_present[normalized] = True
+
+             # Cadence-aware null accounting (per schema metadata)
+             meta = self.schema_meta.get(normalized) or self.schema_meta.get(partition_id)
+             expected_len = self._cadence_expected_length(meta) if meta else None
+             if expected_len is not None:
+                 self._update_cadence(normalized, expected_len, value, partitions=False)
+                 self._update_cadence(partition_id, expected_len, value, partitions=True)
+
+         for normalized in present_normalized:
+             if feature_seen_present.get(normalized):
+                 status_features[normalized] = "present"
+                 # Drop stale null samples when the feature is ultimately present
+                 self.missing_samples.pop(normalized, None)
+             elif feature_seen_null.get(normalized):
+                 status_features[normalized] = "null"
+                 self.null_counts_features[normalized] += 1
+             # Count availability (seen) regardless of value
+             self.seen_counts[normalized] += 1
+
+         for partition_id in seen_partitions:
+             # Availability regardless of value
+             self.seen_counts_partitions[partition_id] += 1
+
+         tracked_features = (
+             self.expected_features if self.expected_features else self.discovered_features
+         )
+         missing_features = tracked_features - present_normalized
+         for feature_id in missing_features:
+             if status_features.get(feature_id) != "null":
+                 status_features[feature_id] = "absent"
+             if len(self.missing_samples[feature_id]) < self.sample_limit:
+                 self.missing_samples[feature_id].append((group_key, "absent"))
+
+         if self.match_partition == "full":
+             tracked_partitions = (
+                 set(self.expected_features) if self.expected_features else self.discovered_partitions
+             )
+         else:
+             tracked_partitions = self.discovered_partitions
+
+         missing_partitions = tracked_partitions - present_partitions
+         for partition_id in missing_partitions:
+             if status_partitions.get(partition_id) != "null":
+                 status_partitions[partition_id] = "absent"
+             if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
+                 self.missing_partition_samples[partition_id].append(
+                     (group_key, "absent")
+                 )
+
+     def _coverage(
+         self, identifier: str, *, partitions: bool = False
+     ) -> tuple[int, int, int]:
+         present = (
+             self.seen_counts_partitions[identifier]
+             if partitions
+             else self.seen_counts[identifier]
+         )
+         opportunities = self.total_vectors
+         missing = max(opportunities - present, 0)
+         return present, missing, opportunities
+
+     def _feature_null_count(self, feature_id: str) -> int:
+         return self.null_counts_features.get(feature_id, 0)
+
+     @staticmethod
+     def _format_group_key(group_key: Hashable) -> str:
+         if isinstance(group_key, tuple):
+             return ", ".join(str(part) for part in group_key)
+         return str(group_key)
+
+     @staticmethod
+     def _symbol_for(status: str) -> str:
+         return {
+             "present": "#",
+             "null": "!",
+             "absent": ".",
+         }.get(status, ".")
+
+     @staticmethod
+     def _format_samples(samples: list[tuple[Hashable, str]], limit: int = 3) -> str:
+         if not samples:
+             return ""
+         trimmed = samples[:limit]
+         rendered = ", ".join(
+             f"{reason}@{sample}" for sample, reason in trimmed)
+         if len(samples) > limit:
+             rendered += ", ..."
+         return rendered
+
+     @staticmethod
+     def _partition_suffix(partition_id: str) -> str:
+         return partition_id.split("__", 1)[1] if "__" in partition_id else partition_id
+
+     @staticmethod
+     def _partition_values(partition_id: str) -> list[str]:
+         """Return partition values without base id or field names."""
+         suffix = partition_id.split("__", 1)[1] if "__" in partition_id else partition_id
+         if not suffix:
+             return []
+
+         def _components(raw: str) -> list[str]:
+             if raw.startswith("@"):
+                 parts = raw.split("_@")
+                 return [parts[0]] + [f"@{rest}" for rest in parts[1:]]
+             return [raw]
+
+         values: list[str] = []
+         for component in _components(suffix):
+             field_value = component.lstrip("@")
+             _, _, value = field_value.partition(":")
+             candidate = value or field_value
+             # If no explicit value delimiter, drop leading field name-ish prefixes
+             if not value and "_" in candidate:
+                 candidate = candidate.rsplit("_", 1)[-1]
+             values.append(candidate)
+         return values
+
+     @classmethod
+     def _partition_value(cls, partition_id: str) -> str:
+         values = cls._partition_values(partition_id)
+         if not values:
+             return ""
+         return values[0] if len(values) == 1 else "_".join(values)
+
+     @staticmethod
+     def _expected_lengths(meta: dict[str, Any]) -> list[int]:
+         cadence = meta.get("cadence")
+         if isinstance(cadence, dict):
+             target = cadence.get("target")
+             if isinstance(target, (int, float)) and target > 0:
+                 return [int(target)]
+         modes = meta.get("list_length", {}).get("modes")
+         if isinstance(modes, (list, tuple)) and modes:
+             ints = [int(m) for m in modes if isinstance(m, (int, float))]
+             if ints:
+                 return sorted(ints)
+         expected = meta.get("expected_length")
+         if isinstance(expected, (int, float)):
+             return [int(expected)]
+         max_len = meta.get("list_length", {}).get("max")
+         if isinstance(max_len, (int, float)) and max_len > 0:
+             return [int(max_len)]
+         return []
+
+     @staticmethod
+     def _cadence_expected_length(meta: dict[str, Any]) -> int | None:
+         lengths = VectorStatsCollector._expected_lengths(meta)
+         return max(lengths) if lengths else None
+
+     def _update_cadence(
+         self, identifier: str, expected_len: int | None, value: Any, *, partitions: bool
+     ) -> None:
+         if expected_len is None:
+             return
+         counter_nulls = (
+             self.cadence_null_counts_partitions if partitions else self.cadence_null_counts
+         )
+         counter_opps = (
+             self.cadence_opportunities_partitions
+             if partitions
+             else self.cadence_opportunities
+         )
+
+         present = 0
+         if isinstance(value, list):
+             trimmed = value[:expected_len]
+             present = sum(0 if _is_missing_value(v) else 1 for v in trimmed)
+         else:
+             present = 0 if _is_missing_value(value) else 1
+         missing = max(expected_len - present, 0)
+         counter_opps[identifier] += expected_len
+         counter_nulls[identifier] += missing
+
+     def _render_matrix(
+         self,
+         *,
+         features: list[str],
+         partitions: bool = False,
+         column_width: int = 6,
+     ) -> None:
+         from .matrix import render_matrix
+
+         render_matrix(
+             self,
+             features=features,
+             partitions=partitions,
+             column_width=column_width,
+         )
+
+     def print_report(self, *, sort_key: str = "missing") -> dict[str, Any]:
+         from .report import print_report as _print_report
+
+         return _print_report(self, sort_key=sort_key)
+
+     def _export_matrix_data(self) -> None:
+         from .matrix import export_matrix_data
+
+         export_matrix_data(self)
+
+     def _collect_feature_ids(self) -> list[str]:
+         feature_ids: set[str] = set()
+         for statuses in self.group_feature_status.values():
+             feature_ids.update(statuses.keys())
+         return sorted(feature_ids)
+
+     def _collect_partition_ids(self) -> list[str]:
+         partition_ids: set[str] = set()
+         for statuses in self.group_partition_status.values():
+             partition_ids.update(statuses.keys())
+         return sorted(partition_ids)
+
+     def _collect_group_keys(self) -> list[Hashable]:
+         keys = set(self.group_feature_status.keys()) | set(
+             self.group_partition_status.keys()
+         )
+         return sorted(keys, key=self._group_sort_key)
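
For context, a minimal usage sketch of the new collector follows. It is hypothetical rather than taken from the package documentation: the `base__@field:value` partition-id convention, the 1-tuple datetime group key, and the `cadence.target` metadata shape are assumptions inferred from the helpers above, and `print_report` delegates to the sibling report module added in this release (file 3 in the list).

# Hypothetical usage sketch of VectorStatsCollector; id formats and metadata
# shapes are assumptions inferred from the helpers in collector.py.
from datetime import datetime, timedelta

from datapipeline.analysis.vector.collector import VectorStatsCollector

collector = VectorStatsCollector(
    expected_feature_ids=["temperature", "wind_speed"],
    match_partition="base",  # fold partitioned ids back onto their base feature id
    schema_meta={"temperature": {"cadence": {"target": 3}}},  # cadence-aware null accounting
)

start = datetime(2024, 1, 1)
for hour in range(3):
    group_key = (start + timedelta(hours=hour),)  # 1-tuple group key, as in _group_sort_key's docstring
    collector.update(
        group_key,
        {
            "temperature__@station:a": [1.0, None, 3.0],  # list values get per-element sub-status
            "wind_speed__@station:a": None,               # counted as "null" for this group
            # a feature missing from the vector entirely would be recorded as "absent"
        },
    )

summary = collector.print_report(sort_key="missing")  # delegates to .report.print_report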