jerry-thomas 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. datapipeline/analysis/vector/collector.py +275 -0
  2. datapipeline/analysis/vector/matrix.py +527 -0
  3. datapipeline/analysis/vector/report.py +317 -0
  4. datapipeline/analysis/vector_analyzer.py +3 -694
  5. datapipeline/build/__init__.py +6 -0
  6. datapipeline/build/state.py +52 -0
  7. datapipeline/build/tasks.py +186 -0
  8. datapipeline/cli/app.py +125 -56
  9. datapipeline/cli/commands/build.py +39 -0
  10. datapipeline/cli/commands/domain.py +1 -1
  11. datapipeline/cli/commands/filter.py +1 -2
  12. datapipeline/cli/commands/inspect.py +77 -26
  13. datapipeline/cli/commands/link.py +11 -12
  14. datapipeline/cli/commands/plugin.py +1 -1
  15. datapipeline/cli/commands/run.py +234 -110
  16. datapipeline/cli/commands/source.py +3 -3
  17. datapipeline/cli/commands/writers.py +138 -0
  18. datapipeline/cli/visuals/__init__.py +14 -0
  19. datapipeline/cli/{visuals.py → visuals/labels.py} +35 -24
  20. datapipeline/cli/visuals/sources.py +138 -0
  21. datapipeline/config/build.py +64 -0
  22. datapipeline/config/dataset/dataset.py +1 -2
  23. datapipeline/config/dataset/loader.py +1 -81
  24. datapipeline/config/postprocess.py +14 -0
  25. datapipeline/config/project.py +13 -1
  26. datapipeline/config/run.py +116 -0
  27. datapipeline/config/split.py +35 -0
  28. datapipeline/domain/vector.py +0 -9
  29. datapipeline/filters/filters.py +1 -1
  30. datapipeline/integrations/ml/__init__.py +16 -0
  31. datapipeline/integrations/ml/adapter.py +120 -0
  32. datapipeline/integrations/ml/pandas_support.py +46 -0
  33. datapipeline/integrations/ml/rows.py +82 -0
  34. datapipeline/integrations/ml/torch_support.py +94 -0
  35. datapipeline/pipeline/context.py +69 -0
  36. datapipeline/pipeline/pipelines.py +21 -23
  37. datapipeline/pipeline/split.py +171 -0
  38. datapipeline/pipeline/stages.py +54 -15
  39. datapipeline/pipeline/utils/keygen.py +2 -2
  40. datapipeline/pipeline/utils/transform_utils.py +64 -23
  41. datapipeline/plugins.py +1 -1
  42. datapipeline/runtime.py +73 -0
  43. datapipeline/services/artifacts.py +96 -0
  44. datapipeline/services/bootstrap/__init__.py +12 -0
  45. datapipeline/services/bootstrap/config.py +141 -0
  46. datapipeline/services/bootstrap/core.py +186 -0
  47. datapipeline/services/constants.py +5 -0
  48. datapipeline/services/entrypoints.py +1 -1
  49. datapipeline/services/factories.py +5 -2
  50. datapipeline/services/paths.py +1 -1
  51. datapipeline/services/project_paths.py +21 -0
  52. datapipeline/services/scaffold/domain.py +1 -2
  53. datapipeline/services/scaffold/filter.py +1 -2
  54. datapipeline/services/scaffold/mappers.py +1 -1
  55. datapipeline/services/scaffold/plugin.py +31 -5
  56. datapipeline/services/scaffold/source.py +2 -4
  57. datapipeline/sources/models/generator.py +6 -2
  58. datapipeline/sources/models/loader.py +0 -3
  59. datapipeline/sources/models/synthetic.py +1 -1
  60. datapipeline/sources/synthetic/time/loader.py +10 -2
  61. datapipeline/templates/plugin_skeleton/README.md +52 -7
  62. datapipeline/templates/plugin_skeleton/config/contracts/{time_hour_sin.yaml → time_hour_sin.synthetic.yaml} +3 -3
  63. datapipeline/templates/plugin_skeleton/config/contracts/{time_linear.yaml → time_linear.synthetic.yaml} +3 -3
  64. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +9 -0
  65. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +3 -18
  66. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +13 -0
  67. datapipeline/templates/plugin_skeleton/config/datasets/default/project.yaml +12 -0
  68. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +10 -0
  69. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +10 -0
  70. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +10 -0
  71. datapipeline/templates/plugin_skeleton/pyproject.toml +2 -2
  72. datapipeline/templates/stubs/dto.py.j2 +2 -0
  73. datapipeline/templates/stubs/mapper.py.j2 +5 -3
  74. datapipeline/templates/stubs/parser.py.j2 +1 -0
  75. datapipeline/transforms/feature/scaler.py +127 -62
  76. datapipeline/transforms/filter.py +5 -2
  77. datapipeline/transforms/stream/fill.py +3 -25
  78. datapipeline/transforms/utils.py +16 -0
  79. datapipeline/transforms/vector.py +62 -78
  80. datapipeline/transforms/vector_utils.py +19 -67
  81. datapipeline/utils/load.py +2 -2
  82. datapipeline/utils/pickle_model.py +30 -0
  83. datapipeline/utils/placeholders.py +35 -0
  84. jerry_thomas-0.3.0.dist-info/METADATA +502 -0
  85. jerry_thomas-0.3.0.dist-info/RECORD +139 -0
  86. datapipeline/cli/visual_source.py +0 -32
  87. datapipeline/common/__init__.py +0 -0
  88. datapipeline/common/geo.py +0 -13
  89. datapipeline/integrations/ml.py +0 -319
  90. datapipeline/registries/registries.py +0 -15
  91. datapipeline/services/bootstrap.py +0 -191
  92. jerry_thomas-0.2.0.dist-info/METADATA +0 -402
  93. jerry_thomas-0.2.0.dist-info/RECORD +0 -112
  94. {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/WHEEL +0 -0
  95. {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/entry_points.txt +0 -0
  96. {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/licenses/LICENSE +0 -0
  97. {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/top_level.txt +0 -0
datapipeline/analysis/vector/collector.py (new file)
@@ -0,0 +1,275 @@
+from __future__ import annotations
+
+from collections import Counter, defaultdict
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Hashable, Iterable, Literal
+
+from datapipeline.transforms.vector_utils import base_id as _base_id
+
+
+def _base_feature_id(feature_id: str) -> str:
+    """Return the base feature id without partition suffix."""
+    return _base_id(feature_id)
+
+
+def _is_missing_value(value: Any) -> bool:
+    if value is None:
+        return True
+    if isinstance(value, float):
+        return value != value  # NaN without numpy
+    return False
+
+
+class VectorStatsCollector:
+    """Collect coverage statistics for feature vectors."""
+
+    def __init__(
+        self,
+        expected_feature_ids: Iterable[str] | None = None,
+        *,
+        match_partition: Literal["base", "full"] = "base",
+        sample_limit: int = 5,
+        threshold: float | None = 0.95,
+        show_matrix: bool = False,
+        matrix_rows: int = 20,
+        matrix_cols: int = 10,
+        matrix_output: str | None = None,
+        matrix_format: str = "html",
+    ) -> None:
+        self.match_partition = match_partition
+        self.threshold = threshold
+        self.show_matrix = show_matrix
+        self.matrix_rows = matrix_rows if matrix_rows and matrix_rows > 0 else None
+        self.matrix_cols = matrix_cols if matrix_cols and matrix_cols > 0 else None
+        self.matrix_output = Path(matrix_output) if matrix_output else None
+        self.matrix_format = matrix_format
+
+        self.expected_features = (
+            {self._normalize(fid) for fid in expected_feature_ids}
+            if expected_feature_ids
+            else set()
+        )
+
+        self.discovered_features: set[str] = set()
+        self.discovered_partitions: set[str] = set()
+
+        self.total_vectors = 0
+        self.empty_vectors = 0
+
+        self.present_counts = Counter()
+        self.present_counts_partitions = Counter()
+        self.null_counts_partitions = Counter()
+
+        self.missing_samples = defaultdict(list)
+        self.missing_partition_samples = defaultdict(list)
+        self.sample_limit = sample_limit
+
+        self.group_feature_status = defaultdict(dict)
+        self.group_partition_status = defaultdict(dict)
+        # Optional per-cell sub-status for list-valued entries (finer resolution inside a bucket)
+        self.group_feature_sub: dict[Hashable, dict[str, list[str]]] = defaultdict(dict)
+        self.group_partition_sub: dict[Hashable, dict[str, list[str]]] = defaultdict(dict)
+
+    @staticmethod
+    def _group_sort_key(g: Hashable):
+        """Stable, chronological sort key for group keys.
+
+        Many pipelines use a 1-tuple containing a datetime as the group key.
+        Sorting by ``str(g)`` can produce lexicographic mis-ordering (e.g.,
+        hours "3" vs "21"). This helper prefers numeric datetime ordering and
+        falls back to string representation only when needed.
+        """
+        def norm(p: Any):
+            if isinstance(p, datetime):
+                # Use POSIX timestamp for monotonic ordering
+                return p.timestamp()
+            return p
+
+        if isinstance(g, (tuple, list)):
+            return tuple(norm(p) for p in g)
+        return norm(g)
+
+    def _normalize(self, feature_id: str) -> str:
+        if self.match_partition == "full":
+            return feature_id
+        return _base_feature_id(feature_id)
+
+    def update(self, group_key: Hashable, feature_vector: dict[str, Any]) -> None:
+        self.total_vectors += 1
+
+        present_partitions = set(feature_vector.keys())
+        if not present_partitions:
+            self.empty_vectors += 1
+
+        status_features = self.group_feature_status[group_key]
+        status_partitions = self.group_partition_status[group_key]
+
+        present_normalized: set[str] = set()
+        seen_partitions: set[str] = set()
+        for partition_id in present_partitions:
+            normalized = self._normalize(partition_id)
+            present_normalized.add(normalized)
+            seen_partitions.add(partition_id)
+
+            value = feature_vector[partition_id]
+
+            status_features.setdefault(normalized, "present")
+            status_partitions.setdefault(partition_id, "present")
+
+            self.discovered_features.add(normalized)
+            self.discovered_partitions.add(partition_id)
+
+            # Capture sub-status for list-valued entries
+            sub: list[str] | None = None
+            if isinstance(value, list):
+                sub = []
+                for v in value:
+                    if v is None or (isinstance(v, float) and v != v):
+                        sub.append("null")
+                    else:
+                        sub.append("present")
+            if sub:
+                self.group_partition_sub[group_key][partition_id] = sub
+                # Only store one sub per normalized id (first seen)
+                self.group_feature_sub[group_key].setdefault(normalized, sub)
+
+            is_null = _is_missing_value(value)
+            if is_null:
+                status_features[normalized] = "null"
+                status_partitions[partition_id] = "null"
+                self.null_counts_partitions[partition_id] += 1
+                if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
+                    self.missing_partition_samples[partition_id].append(
+                        (group_key, "null")
+                    )
+                if len(self.missing_samples[normalized]) < self.sample_limit:
+                    self.missing_samples[normalized].append((group_key, "null"))
+
+        for normalized in present_normalized:
+            if status_features.get(normalized) == "present":
+                self.present_counts[normalized] += 1
+
+        for partition_id in seen_partitions:
+            if status_partitions.get(partition_id) == "present":
+                self.present_counts_partitions[partition_id] += 1
+
+        tracked_features = (
+            self.expected_features if self.expected_features else self.discovered_features
+        )
+        missing_features = tracked_features - present_normalized
+        for feature_id in missing_features:
+            if status_features.get(feature_id) != "null":
+                status_features[feature_id] = "absent"
+            if len(self.missing_samples[feature_id]) < self.sample_limit:
+                self.missing_samples[feature_id].append((group_key, "absent"))
+
+        if self.match_partition == "full":
+            tracked_partitions = (
+                set(self.expected_features) if self.expected_features else self.discovered_partitions
+            )
+        else:
+            tracked_partitions = self.discovered_partitions
+
+        missing_partitions = tracked_partitions - present_partitions
+        for partition_id in missing_partitions:
+            if status_partitions.get(partition_id) != "null":
+                status_partitions[partition_id] = "absent"
+            if len(self.missing_partition_samples[partition_id]) < self.sample_limit:
+                self.missing_partition_samples[partition_id].append(
+                    (group_key, "absent")
+                )
+
+    def _coverage(
+        self, identifier: str, *, partitions: bool = False
+    ) -> tuple[int, int, int]:
+        present = (
+            self.present_counts_partitions[identifier]
+            if partitions
+            else self.present_counts[identifier]
+        )
+        opportunities = self.total_vectors
+        missing = max(opportunities - present, 0)
+        return present, missing, opportunities
+
+    def _feature_null_count(self, feature_id: str) -> int:
+        total = 0
+        for partition_id, count in self.null_counts_partitions.items():
+            if self._normalize(partition_id) == feature_id:
+                total += count
+        return total
+
+    @staticmethod
+    def _format_group_key(group_key: Hashable) -> str:
+        if isinstance(group_key, tuple):
+            return ", ".join(str(part) for part in group_key)
+        return str(group_key)
+
+    @staticmethod
+    def _symbol_for(status: str) -> str:
+        return {
+            "present": "#",
+            "null": "!",
+            "absent": ".",
+        }.get(status, ".")
+
+    @staticmethod
+    def _format_samples(samples: list[tuple[Hashable, str]], limit: int = 3) -> str:
+        if not samples:
+            return ""
+        trimmed = samples[:limit]
+        rendered = ", ".join(f"{reason}@{sample}" for sample, reason in trimmed)
+        if len(samples) > limit:
+            rendered += ", ..."
+        return rendered
+
+    @staticmethod
+    def _partition_suffix(partition_id: str) -> str:
+        return partition_id.split("__", 1)[1] if "__" in partition_id else partition_id
+
+    def _render_matrix(
+        self,
+        *,
+        features: list[str],
+        partitions: bool = False,
+        column_width: int = 6,
+    ) -> None:
+        from .matrix import render_matrix
+
+        render_matrix(
+            self,
+            features=features,
+            partitions=partitions,
+            column_width=column_width,
+        )
+
+    def print_report(self) -> dict[str, Any]:
+        from .report import print_report as _print_report
+
+        return _print_report(self)
+
+    def _export_matrix_data(self) -> None:
+        from .matrix import export_matrix_data
+
+        export_matrix_data(self)
+
+    def _collect_feature_ids(self) -> list[str]:
+        feature_ids: set[str] = set()
+        for statuses in self.group_feature_status.values():
+            feature_ids.update(statuses.keys())
+        return sorted(feature_ids)
+
+    def _collect_partition_ids(self) -> list[str]:
+        partition_ids: set[str] = set()
+        for statuses in self.group_partition_status.values():
+            partition_ids.update(statuses.keys())
+        return sorted(partition_ids)
+
+    def _collect_group_keys(self) -> list[Hashable]:
+        keys = set(self.group_feature_status.keys()) | set(self.group_partition_status.keys())
+        return sorted(keys, key=self._group_sort_key)