jerry-thomas 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +275 -0
- datapipeline/analysis/vector/matrix.py +527 -0
- datapipeline/analysis/vector/report.py +317 -0
- datapipeline/analysis/vector_analyzer.py +3 -694
- datapipeline/build/__init__.py +6 -0
- datapipeline/build/state.py +52 -0
- datapipeline/build/tasks.py +186 -0
- datapipeline/cli/app.py +125 -56
- datapipeline/cli/commands/build.py +39 -0
- datapipeline/cli/commands/domain.py +1 -1
- datapipeline/cli/commands/filter.py +1 -2
- datapipeline/cli/commands/inspect.py +77 -26
- datapipeline/cli/commands/link.py +11 -12
- datapipeline/cli/commands/plugin.py +1 -1
- datapipeline/cli/commands/run.py +234 -110
- datapipeline/cli/commands/source.py +3 -3
- datapipeline/cli/commands/writers.py +138 -0
- datapipeline/cli/visuals/__init__.py +14 -0
- datapipeline/cli/{visuals.py → visuals/labels.py} +35 -24
- datapipeline/cli/visuals/sources.py +138 -0
- datapipeline/config/build.py +64 -0
- datapipeline/config/dataset/dataset.py +1 -2
- datapipeline/config/dataset/loader.py +1 -81
- datapipeline/config/postprocess.py +14 -0
- datapipeline/config/project.py +13 -1
- datapipeline/config/run.py +116 -0
- datapipeline/config/split.py +35 -0
- datapipeline/domain/vector.py +0 -9
- datapipeline/filters/filters.py +1 -1
- datapipeline/integrations/ml/__init__.py +16 -0
- datapipeline/integrations/ml/adapter.py +120 -0
- datapipeline/integrations/ml/pandas_support.py +46 -0
- datapipeline/integrations/ml/rows.py +82 -0
- datapipeline/integrations/ml/torch_support.py +94 -0
- datapipeline/pipeline/context.py +69 -0
- datapipeline/pipeline/pipelines.py +21 -23
- datapipeline/pipeline/split.py +171 -0
- datapipeline/pipeline/stages.py +54 -15
- datapipeline/pipeline/utils/keygen.py +2 -2
- datapipeline/pipeline/utils/transform_utils.py +64 -23
- datapipeline/plugins.py +1 -1
- datapipeline/runtime.py +73 -0
- datapipeline/services/artifacts.py +96 -0
- datapipeline/services/bootstrap/__init__.py +12 -0
- datapipeline/services/bootstrap/config.py +141 -0
- datapipeline/services/bootstrap/core.py +186 -0
- datapipeline/services/constants.py +5 -0
- datapipeline/services/entrypoints.py +1 -1
- datapipeline/services/factories.py +5 -2
- datapipeline/services/paths.py +1 -1
- datapipeline/services/project_paths.py +21 -0
- datapipeline/services/scaffold/domain.py +1 -2
- datapipeline/services/scaffold/filter.py +1 -2
- datapipeline/services/scaffold/mappers.py +1 -1
- datapipeline/services/scaffold/plugin.py +31 -5
- datapipeline/services/scaffold/source.py +2 -4
- datapipeline/sources/models/generator.py +6 -2
- datapipeline/sources/models/loader.py +0 -3
- datapipeline/sources/models/synthetic.py +1 -1
- datapipeline/sources/synthetic/time/loader.py +10 -2
- datapipeline/templates/plugin_skeleton/README.md +52 -7
- datapipeline/templates/plugin_skeleton/config/contracts/{time_hour_sin.yaml → time_hour_sin.synthetic.yaml} +3 -3
- datapipeline/templates/plugin_skeleton/config/contracts/{time_linear.yaml → time_linear.synthetic.yaml} +3 -3
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +9 -0
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +3 -18
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +13 -0
- datapipeline/templates/plugin_skeleton/config/datasets/default/project.yaml +12 -0
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +10 -0
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +10 -0
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +10 -0
- datapipeline/templates/plugin_skeleton/pyproject.toml +2 -2
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -3
- datapipeline/templates/stubs/parser.py.j2 +1 -0
- datapipeline/transforms/feature/scaler.py +127 -62
- datapipeline/transforms/filter.py +5 -2
- datapipeline/transforms/stream/fill.py +3 -25
- datapipeline/transforms/utils.py +16 -0
- datapipeline/transforms/vector.py +62 -78
- datapipeline/transforms/vector_utils.py +19 -67
- datapipeline/utils/load.py +2 -2
- datapipeline/utils/pickle_model.py +30 -0
- datapipeline/utils/placeholders.py +35 -0
- jerry_thomas-0.3.0.dist-info/METADATA +502 -0
- jerry_thomas-0.3.0.dist-info/RECORD +139 -0
- datapipeline/cli/visual_source.py +0 -32
- datapipeline/common/__init__.py +0 -0
- datapipeline/common/geo.py +0 -13
- datapipeline/integrations/ml.py +0 -319
- datapipeline/registries/registries.py +0 -15
- datapipeline/services/bootstrap.py +0 -191
- jerry_thomas-0.2.0.dist-info/METADATA +0 -402
- jerry_thomas-0.2.0.dist-info/RECORD +0 -112
- {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/entry_points.txt +0 -0
- {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/top_level.txt +0 -0
datapipeline/analysis/vector/collector.py (new file)
@@ -0,0 +1,275 @@
from __future__ import annotations

from collections import Counter, defaultdict
from typing import Any, Hashable, Iterable, Literal

from datapipeline.transforms.vector_utils import base_id as _base_id
from datetime import datetime
from pathlib import Path


def _base_feature_id(feature_id: str) -> str:
    """Return the base feature id without partition suffix."""
    return _base_id(feature_id)


def _is_missing_value(value: Any) -> bool:
    if value is None:
        return True
    if isinstance(value, float):
        return value != value  # NaN without numpy
    return False


class VectorStatsCollector:
    """Collect coverage statistics for feature vectors."""

    def __init__(
        self,
        expected_feature_ids: Iterable[str] | None = None,
        *,
        match_partition: Literal["base", "full"] = "base",
        sample_limit: int = 5,
        threshold: float | None = 0.95,
        show_matrix: bool = False,
        matrix_rows: int = 20,
        matrix_cols: int = 10,
        matrix_output: str | None = None,
        matrix_format: str = "html",
    ) -> None:
        self.match_partition = match_partition
        self.threshold = threshold
        self.show_matrix = show_matrix
        self.matrix_rows = matrix_rows if matrix_rows and matrix_rows > 0 else None
        self.matrix_cols = matrix_cols if matrix_cols and matrix_cols > 0 else None
        self.matrix_output = Path(matrix_output) if matrix_output else None
        self.matrix_format = matrix_format

        self.expected_features = (
            {self._normalize(fid) for fid in expected_feature_ids}
            if expected_feature_ids
            else set()
        )

        self.discovered_features: set[str] = set()
        self.discovered_partitions: set[str] = set()

        self.total_vectors = 0
        self.empty_vectors = 0

        self.present_counts = Counter()
        self.present_counts_partitions = Counter()
        self.null_counts_partitions = Counter()

        self.missing_samples = defaultdict(list)
        self.missing_partition_samples = defaultdict(list)
        self.sample_limit = sample_limit

        self.group_feature_status = defaultdict(dict)
        self.group_partition_status = defaultdict(dict)
        # Optional per-cell sub-status for list-valued entries (finer resolution inside a bucket)
        self.group_feature_sub: dict[Hashable, dict[str, list[str]]] = defaultdict(dict)
        self.group_partition_sub: dict[Hashable, dict[str, list[str]]] = defaultdict(dict)

    @staticmethod
    def _group_sort_key(g: Hashable):
        """Stable, chronological sort key for group keys.

        Many pipelines use a 1-tuple containing a datetime as the group key.
        Sorting by ``str(g)`` can produce lexicographic mis-ordering (e.g.,
        hours "3" vs "21"). This helper prefers numeric datetime ordering and
        falls back to string representation only when needed.
        """
        def norm(p: Any):
            if isinstance(p, datetime):
                # Use POSIX timestamp for monotonic ordering
                return p.timestamp()
            return p

        if isinstance(g, (tuple, list)):
            return tuple(norm(p) for p in g)
        return norm(g)

    def _normalize(self, feature_id: str) -> str:
        if self.match_partition == "full":
            return feature_id
        return _base_feature_id(feature_id)

    def update(self, group_key: Hashable, feature_vector: dict[str, Any]) -> None:
        self.total_vectors += 1

        present_partitions = set(feature_vector.keys())
        if not present_partitions:
            self.empty_vectors += 1

        status_features = self.group_feature_status[group_key]
        status_partitions = self.group_partition_status[group_key]

        present_normalized: set[str] = set()
        seen_partitions: set[str] = set()
        for partition_id in present_partitions:
            normalized = self._normalize(partition_id)
            present_normalized.add(normalized)
            seen_partitions.add(partition_id)

            value = feature_vector[partition_id]

            status_features.setdefault(normalized, "present")
            status_partitions.setdefault(partition_id, "present")

            self.discovered_features.add(normalized)
            self.discovered_partitions.add(partition_id)

            # Capture sub-status for list-valued entries
            sub: list[str] | None = None
            if isinstance(value, list):
                sub = []
                for v in value:
                    if v is None or (isinstance(v, float) and v != v):
                        sub.append("null")
                    else:
                        sub.append("present")
                if sub:
                    self.group_partition_sub[group_key][partition_id] = sub
                    # Only store one sub per normalized id (first seen)
                    self.group_feature_sub[group_key].setdefault(normalized, sub)

            is_null = _is_missing_value(value)
            if is_null:
                status_features[normalized] = "null"
                status_partitions[partition_id] = "null"
                self.null_counts_partitions[partition_id] += 1
                if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
                    self.missing_partition_samples[partition_id].append(
                        (group_key, "null")
                    )
                if len(self.missing_samples[normalized]) < self.sample_limit:
                    self.missing_samples[normalized].append((group_key, "null"))

        for normalized in present_normalized:
            if status_features.get(normalized) == "present":
                self.present_counts[normalized] += 1

        for partition_id in seen_partitions:
            if status_partitions.get(partition_id) == "present":
                self.present_counts_partitions[partition_id] += 1

        tracked_features = (
            self.expected_features if self.expected_features else self.discovered_features
        )
        missing_features = tracked_features - present_normalized
        for feature_id in missing_features:
            if status_features.get(feature_id) != "null":
                status_features[feature_id] = "absent"
                if len(self.missing_samples[feature_id]) < self.sample_limit:
                    self.missing_samples[feature_id].append((group_key, "absent"))

        if self.match_partition == "full":
            tracked_partitions = (
                set(self.expected_features) if self.expected_features else self.discovered_partitions
            )
        else:
            tracked_partitions = self.discovered_partitions

        missing_partitions = tracked_partitions - present_partitions
        for partition_id in missing_partitions:
            if status_partitions.get(partition_id) != "null":
                status_partitions[partition_id] = "absent"
                if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
                    self.missing_partition_samples[partition_id].append(
                        (group_key, "absent")
                    )

    def _coverage(
        self, identifier: str, *, partitions: bool = False
    ) -> tuple[int, int, int]:
        present = (
            self.present_counts_partitions[identifier]
            if partitions
            else self.present_counts[identifier]
        )
        opportunities = self.total_vectors
        missing = max(opportunities - present, 0)
        return present, missing, opportunities

    def _feature_null_count(self, feature_id: str) -> int:
        total = 0
        for partition_id, count in self.null_counts_partitions.items():
            if self._normalize(partition_id) == feature_id:
                total += count
        return total

    @staticmethod
    def _format_group_key(group_key: Hashable) -> str:
        if isinstance(group_key, tuple):
            return ", ".join(str(part) for part in group_key)
        return str(group_key)

    @staticmethod
    def _symbol_for(status: str) -> str:
        return {
            "present": "#",
            "null": "!",
            "absent": ".",
        }.get(status, ".")

    @staticmethod
    def _format_samples(samples: list[tuple[Hashable, str]], limit: int = 3) -> str:
        if not samples:
            return ""
        trimmed = samples[:limit]
        rendered = ", ".join(f"{reason}@{sample}" for sample, reason in trimmed)
        if len(samples) > limit:
            rendered += ", ..."
        return rendered

    @staticmethod
    def _partition_suffix(partition_id: str) -> str:
        return partition_id.split("__", 1)[1] if "__" in partition_id else partition_id

    def _render_matrix(
        self,
        *,
        features: list[str],
        partitions: bool = False,
        column_width: int = 6,
    ) -> None:
        from .matrix import render_matrix

        render_matrix(
            self,
            features=features,
            partitions=partitions,
            column_width=column_width,
        )

    def print_report(self) -> dict[str, Any]:
        from .report import print_report as _print_report

        return _print_report(self)

    def _export_matrix_data(self) -> None:
        from .matrix import export_matrix_data

        export_matrix_data(self)

    def _collect_feature_ids(self) -> list[str]:
        feature_ids: set[str] = set()
        for statuses in self.group_feature_status.values():
            feature_ids.update(statuses.keys())
        return sorted(feature_ids)

    def _collect_partition_ids(self) -> list[str]:
        partition_ids: set[str] = set()
        for statuses in self.group_partition_status.values():
            partition_ids.update(statuses.keys())
        return sorted(partition_ids)

    def _collect_group_keys(self) -> list[Hashable]:
        keys = set(self.group_feature_status.keys()) | set(
            self.group_partition_status.keys()
        )
        return sorted(keys, key=self._group_sort_key)