transformplan 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformplan/__init__.py +52 -0
- transformplan/chunking.py +611 -0
- transformplan/core.py +667 -0
- transformplan/filters.py +1049 -0
- transformplan/plan.py +47 -0
- transformplan/protocol.py +532 -0
- transformplan/py.typed +0 -0
- transformplan/validation.py +1579 -0
- transformplan-0.1.0.dist-info/METADATA +151 -0
- transformplan-0.1.0.dist-info/RECORD +13 -0
- transformplan-0.1.0.dist-info/WHEEL +5 -0
- transformplan-0.1.0.dist-info/licenses/LICENSE +21 -0
- transformplan-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""TransformPlan - Safe and reproducible data transformation.
|
|
2
|
+
|
|
3
|
+
TransformPlan is a Python library for building data transformation pipelines
|
|
4
|
+
with built-in schema validation, audit trails, and reproducibility guarantees.
|
|
5
|
+
|
|
6
|
+
Main Classes:
|
|
7
|
+
TransformPlan: Build and execute transformation pipelines with method chaining.
|
|
8
|
+
Protocol: Audit trail capturing transformation history with deterministic hashes.
|
|
9
|
+
Col: Column reference for building filter expressions.
|
|
10
|
+
Filter: Base class for serializable filter expressions.
|
|
11
|
+
|
|
12
|
+
Validation Classes:
|
|
13
|
+
ValidationResult: Result of schema validation.
|
|
14
|
+
SchemaValidationError: Exception raised on validation failure.
|
|
15
|
+
DryRunResult: Preview of pipeline execution without modifying data.
|
|
16
|
+
|
|
17
|
+
Utility Functions:
|
|
18
|
+
frame_hash: Compute deterministic hash of a DataFrame.
|
|
19
|
+
|
|
20
|
+
Example:
|
|
21
|
+
>>> import polars as pl
|
|
22
|
+
>>> from transformplan import TransformPlan, Col
|
|
23
|
+
>>>
|
|
24
|
+
>>> df = pl.DataFrame({"name": ["Alice", "Bob"], "age": [25, 30]})
|
|
25
|
+
>>> plan = TransformPlan().rows_filter(Col("age") >= 18)
|
|
26
|
+
>>> result, protocol = plan.process(df)
|
|
27
|
+
>>> protocol.print()
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from transformplan.chunking import ChunkedProtocol, ChunkingError, ChunkValidationResult
|
|
31
|
+
from transformplan.filters import Col, Filter
|
|
32
|
+
from transformplan.plan import TransformPlan
|
|
33
|
+
from transformplan.protocol import Protocol, frame_hash
|
|
34
|
+
from transformplan.validation import (
|
|
35
|
+
DryRunResult,
|
|
36
|
+
SchemaValidationError,
|
|
37
|
+
ValidationResult,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
"ChunkValidationResult",
|
|
42
|
+
"ChunkedProtocol",
|
|
43
|
+
"ChunkingError",
|
|
44
|
+
"Col",
|
|
45
|
+
"DryRunResult",
|
|
46
|
+
"Filter",
|
|
47
|
+
"Protocol",
|
|
48
|
+
"SchemaValidationError",
|
|
49
|
+
"TransformPlan",
|
|
50
|
+
"ValidationResult",
|
|
51
|
+
"frame_hash",
|
|
52
|
+
]
|
|
@@ -0,0 +1,611 @@
|
|
|
1
|
+
"""Chunked processing support for large files.
|
|
2
|
+
|
|
3
|
+
This module provides infrastructure for processing large Parquet files in chunks,
|
|
4
|
+
with support for partition keys to keep related rows together.
|
|
5
|
+
|
|
6
|
+
Classes:
|
|
7
|
+
ChunkMode: Enum classifying operation compatibility with chunking.
|
|
8
|
+
OperationMeta: Metadata about an operation's chunking behavior.
|
|
9
|
+
ChunkValidationResult: Result of validating a pipeline for chunked processing.
|
|
10
|
+
ChunkingError: Exception raised when pipeline is incompatible with chunking.
|
|
11
|
+
ChunkInfo: Information about a processed chunk.
|
|
12
|
+
ChunkedProtocol: Protocol for tracking chunked processing.
|
|
13
|
+
|
|
14
|
+
Example:
|
|
15
|
+
>>> plan = TransformPlan().col_rename("id", "patient_id")
|
|
16
|
+
>>> result, protocol = plan.process_chunked(
|
|
17
|
+
... source="patients.parquet",
|
|
18
|
+
... partition_key="patient_id",
|
|
19
|
+
... chunk_size=100_000,
|
|
20
|
+
... )
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import hashlib
|
|
26
|
+
import json
|
|
27
|
+
from dataclasses import dataclass, field
|
|
28
|
+
from datetime import datetime, timezone
|
|
29
|
+
from enum import Enum, auto
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Any
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ChunkMode(Enum):
    """Classification of operation compatibility with chunked processing.

    CHUNKABLE: Can process any chunk independently.
    GROUP_DEPENDENT: Needs all rows for a group together (e.g., rows_unique).
    GLOBAL: Requires full dataset, blocked in chunked mode (e.g., rows_sort).
    """

    # Row-independent: a chunk's result does not depend on any other chunk.
    CHUNKABLE = auto()
    # Correct only when all rows sharing the grouping columns land in the
    # same chunk; enforced against the partition_key during validation.
    GROUP_DEPENDENT = auto()
    # Needs the entire dataset at once; validate_chunked_pipeline reports
    # these as errors.
    GLOBAL = auto()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class OperationMeta:
    """Metadata about an operation's chunking behavior.

    Attributes:
        chunk_mode: How the operation behaves in chunked processing.
        group_param: Name of parameter containing group columns (for GROUP_DEPENDENT).
    """

    # Classification driving validate_chunked_pipeline's accept/reject decision.
    chunk_mode: ChunkMode
    # Only meaningful for GROUP_DEPENDENT operations: names the operation
    # parameter that carries the grouping columns (e.g. "group_by", "columns").
    group_param: str | None = None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Registry mapping operation names to their chunking metadata.
# All operations are categorized based on their data dependencies.
# Keys match the method names looked up by validate_chunked_pipeline after
# stripping leading underscores — presumably the TransformPlan method names;
# verify against plan.py when adding entries.
OPERATION_CHUNK_REGISTRY: dict[str, OperationMeta] = {
    # Column operations - all chunkable (row-independent)
    "col_drop": OperationMeta(ChunkMode.CHUNKABLE),
    "col_rename": OperationMeta(ChunkMode.CHUNKABLE),
    "col_cast": OperationMeta(ChunkMode.CHUNKABLE),
    "col_reorder": OperationMeta(ChunkMode.CHUNKABLE),
    "col_select": OperationMeta(ChunkMode.CHUNKABLE),
    "col_duplicate": OperationMeta(ChunkMode.CHUNKABLE),
    "col_fill_null": OperationMeta(ChunkMode.CHUNKABLE),
    "col_drop_null": OperationMeta(ChunkMode.CHUNKABLE),
    "col_drop_zero": OperationMeta(ChunkMode.CHUNKABLE),
    "col_add": OperationMeta(ChunkMode.CHUNKABLE),
    "col_add_uuid": OperationMeta(ChunkMode.CHUNKABLE),
    "col_hash": OperationMeta(ChunkMode.CHUNKABLE),
    "col_coalesce": OperationMeta(ChunkMode.CHUNKABLE),
    # Math scalar operations - all chunkable
    "math_add": OperationMeta(ChunkMode.CHUNKABLE),
    "math_subtract": OperationMeta(ChunkMode.CHUNKABLE),
    "math_multiply": OperationMeta(ChunkMode.CHUNKABLE),
    "math_divide": OperationMeta(ChunkMode.CHUNKABLE),
    "math_clamp": OperationMeta(ChunkMode.CHUNKABLE),
    "math_abs": OperationMeta(ChunkMode.CHUNKABLE),
    "math_round": OperationMeta(ChunkMode.CHUNKABLE),
    "math_set_min": OperationMeta(ChunkMode.CHUNKABLE),
    "math_set_max": OperationMeta(ChunkMode.CHUNKABLE),
    # Math column operations - all chunkable
    "math_add_columns": OperationMeta(ChunkMode.CHUNKABLE),
    "math_subtract_columns": OperationMeta(ChunkMode.CHUNKABLE),
    "math_multiply_columns": OperationMeta(ChunkMode.CHUNKABLE),
    "math_divide_columns": OperationMeta(ChunkMode.CHUNKABLE),
    "math_percent_of": OperationMeta(ChunkMode.CHUNKABLE),
    # Math aggregate operations - group-dependent (windows/ranks need all
    # rows of a group in the same chunk)
    "math_cumsum": OperationMeta(ChunkMode.GROUP_DEPENDENT, group_param="group_by"),
    "math_rank": OperationMeta(ChunkMode.GROUP_DEPENDENT, group_param="group_by"),
    # String operations - all chunkable
    "str_replace": OperationMeta(ChunkMode.CHUNKABLE),
    "str_slice": OperationMeta(ChunkMode.CHUNKABLE),
    "str_truncate": OperationMeta(ChunkMode.CHUNKABLE),
    "str_lower": OperationMeta(ChunkMode.CHUNKABLE),
    "str_upper": OperationMeta(ChunkMode.CHUNKABLE),
    "str_strip": OperationMeta(ChunkMode.CHUNKABLE),
    "str_pad": OperationMeta(ChunkMode.CHUNKABLE),
    "str_split": OperationMeta(ChunkMode.CHUNKABLE),
    "str_concat": OperationMeta(ChunkMode.CHUNKABLE),
    "str_extract": OperationMeta(ChunkMode.CHUNKABLE),
    # Datetime operations - all chunkable
    "dt_year": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_month": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_day": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_week": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_quarter": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_year_month": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_quarter_year": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_calendar_week": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_parse": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_format": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_diff_days": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_age_years": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_is_between": OperationMeta(ChunkMode.CHUNKABLE),
    "dt_truncate": OperationMeta(ChunkMode.CHUNKABLE),
    # Map operations - all chunkable
    "map_values": OperationMeta(ChunkMode.CHUNKABLE),
    "map_discretize": OperationMeta(ChunkMode.CHUNKABLE),
    "map_bool_to_int": OperationMeta(ChunkMode.CHUNKABLE),
    "map_null_to_value": OperationMeta(ChunkMode.CHUNKABLE),
    "map_value_to_null": OperationMeta(ChunkMode.CHUNKABLE),
    "map_case": OperationMeta(ChunkMode.CHUNKABLE),
    "map_from_column": OperationMeta(ChunkMode.CHUNKABLE),
    # Row operations - mixed
    "rows_filter": OperationMeta(ChunkMode.CHUNKABLE),
    "rows_drop": OperationMeta(ChunkMode.CHUNKABLE),
    "rows_flag": OperationMeta(ChunkMode.CHUNKABLE),
    "rows_explode": OperationMeta(ChunkMode.CHUNKABLE),
    "rows_drop_nulls": OperationMeta(ChunkMode.CHUNKABLE),
    "rows_melt": OperationMeta(ChunkMode.CHUNKABLE),
    # Row operations - group-dependent (dedup is only correct if duplicates
    # cannot be split across chunks)
    "rows_unique": OperationMeta(ChunkMode.GROUP_DEPENDENT, group_param="columns"),
    "rows_deduplicate": OperationMeta(ChunkMode.GROUP_DEPENDENT, group_param="columns"),
    # Row operations - global (blocked: ordering/sampling/limits are only
    # well-defined over the whole dataset)
    "rows_sort": OperationMeta(ChunkMode.GLOBAL),
    "rows_pivot": OperationMeta(ChunkMode.GLOBAL),
    "rows_sample": OperationMeta(ChunkMode.GLOBAL),
    "rows_head": OperationMeta(ChunkMode.GLOBAL),
    "rows_tail": OperationMeta(ChunkMode.GLOBAL),
}
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@dataclass
class ChunkValidationResult:
    """Outcome of checking a pipeline against the chunking rules.

    Attributes:
        is_valid: Whether the pipeline can be processed in chunks.
        errors: List of error messages explaining incompatibilities.
        warnings: List of warning messages (non-blocking).
        global_operations: Names of operations that require full dataset.
        group_dependent_ops: List of (operation, columns) for group-dependent ops.
    """

    is_valid: bool
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    global_operations: list[str] = field(default_factory=list)
    group_dependent_ops: list[tuple[str, list[str] | None]] = field(
        default_factory=list
    )

    def __str__(self) -> str:
        """Render the result as a human-readable multi-line report."""
        report = [
            "Pipeline is compatible with chunked processing."
            if self.is_valid
            else "Pipeline is NOT compatible with chunked processing."
        ]

        # Errors and warnings share the same "heading + bullet list" shape.
        for heading, items in (
            ("\nErrors:", self.errors),
            ("\nWarnings:", self.warnings),
        ):
            if items:
                report.append(heading)
                report.extend(f" - {item}" for item in items)

        if self.global_operations:
            report.append(f"\nGlobal operations (blocked): {self.global_operations}")

        if self.group_dependent_ops:
            report.append("\nGroup-dependent operations:")
            report.extend(
                f" - {op}: groups by {cols}" for op, cols in self.group_dependent_ops
            )

        return "\n".join(report)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class ChunkingError(Exception):
    """Raised when a pipeline is incompatible with chunked processing.

    Attributes:
        validation_result: The validation result containing error details,
            or None when no detailed result is available.
    """

    def __init__(
        self,
        message: str,
        validation_result: ChunkValidationResult | None = None,
    ) -> None:
        """Store the error message and keep the validation details, if any."""
        self.validation_result = validation_result
        super().__init__(message)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@dataclass
class ChunkInfo:
    """Information about a processed chunk.

    Attributes:
        chunk_index: Zero-based index of this chunk.
        input_rows: Number of rows in the input chunk.
        output_rows: Number of rows after processing.
        input_hash: Hash of the input chunk data.
        output_hash: Hash of the output chunk data.
        elapsed_seconds: Processing time for this chunk.
    """

    # Position of this chunk within the run (0-based).
    chunk_index: int
    # Row counts before and after the pipeline ran on this chunk.
    input_rows: int
    output_rows: int
    # Data hashes used by ChunkedProtocol.output_hash() and summary().
    input_hash: str
    output_hash: str
    # Wall-clock processing time for this chunk, in seconds.
    elapsed_seconds: float
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class ChunkedProtocol:
    """Protocol for tracking chunked processing with per-chunk information.

    Tracks the overall processing as well as individual chunk statistics.
    The protocol round-trips through dicts and JSON (``to_dict``/``from_dict``,
    ``to_json``/``from_json``) so a run can be archived and reloaded.

    Attributes:
        VERSION: Protocol version string embedded in serialized output.
    """

    VERSION = "1.0"

    def __init__(self) -> None:
        """Initialize an empty ChunkedProtocol."""
        # Per-chunk statistics, in processing order (order affects output_hash()).
        self._chunks: list[ChunkInfo] = []
        # Source file information; populated via set_source().
        self._source_path: str | None = None
        self._partition_key: list[str] | None = None
        self._chunk_size: int | None = None
        # Timezone-aware creation timestamp (ISO 8601, UTC).
        self._created_at: str = datetime.now(timezone.utc).isoformat()
        # Arbitrary user metadata; populated via set_metadata().
        self._metadata: dict[str, Any] = {}
        # Serialized description of the applied operations; set_operations().
        self._operations: list[dict[str, Any]] = []

    def set_source(
        self,
        path: str,
        partition_key: list[str] | None,
        chunk_size: int,
    ) -> None:
        """Set source file information.

        Args:
            path: Path to the source file.
            partition_key: Columns used to keep related rows together,
                or None if no partitioning was used.
            chunk_size: Target number of rows per chunk.
        """
        self._source_path = path
        self._partition_key = partition_key
        self._chunk_size = chunk_size

    def set_operations(self, operations: list[dict[str, Any]]) -> None:
        """Record the operations that were applied (replaces any prior list)."""
        self._operations = operations

    def set_metadata(self, **kwargs: Any) -> None:  # noqa: ANN401
        """Set arbitrary metadata on the protocol.

        Keys given here overwrite existing keys; others are preserved.
        """
        self._metadata.update(kwargs)

    def add_chunk(self, chunk_info: ChunkInfo) -> None:
        """Add information about a processed chunk."""
        self._chunks.append(chunk_info)

    @property
    def chunks(self) -> list[ChunkInfo]:
        """List of chunk information.

        Returns:
            List of ChunkInfo instances.

        Note:
            This is the internal list, not a copy; mutating it changes
            the protocol.
        """
        return self._chunks

    @property
    def total_input_rows(self) -> int:
        """Total rows across all input chunks.

        Returns:
            Sum of input rows.
        """
        return sum(c.input_rows for c in self._chunks)

    @property
    def total_output_rows(self) -> int:
        """Total rows across all output chunks.

        Returns:
            Sum of output rows.
        """
        return sum(c.output_rows for c in self._chunks)

    @property
    def total_elapsed_seconds(self) -> float:
        """Total processing time across all chunks.

        Returns:
            Sum of elapsed seconds.
        """
        return sum(c.elapsed_seconds for c in self._chunks)

    @property
    def num_chunks(self) -> int:
        """Number of chunks processed.

        Returns:
            Count of chunks.
        """
        return len(self._chunks)

    @property
    def metadata(self) -> dict[str, Any]:
        """Protocol metadata.

        Returns:
            Dictionary of metadata (the internal dict, not a copy).
        """
        return self._metadata

    def output_hash(self) -> str:
        """Compute a combined hash of all output chunk hashes.

        Returns:
            A 16-character hex hash of all chunk output hashes combined,
            or "" when no chunks have been recorded.
        """
        if not self._chunks:
            return ""
        # Join with "|" so the combined hash depends on chunk order and
        # boundaries, then truncate SHA-256 to 16 hex chars.
        combined = "|".join(c.output_hash for c in self._chunks)
        return hashlib.sha256(combined.encode()).hexdigest()[:16]

    def to_dict(self) -> dict[str, Any]:
        """Serialize protocol to a dictionary.

        Returns:
            Dictionary representation of the protocol. The "summary"
            section is derived from the chunk list at call time.
        """
        return {
            "version": self.VERSION,
            "created_at": self._created_at,
            "metadata": self._metadata,
            "source": {
                "path": self._source_path,
                "partition_key": self._partition_key,
                "chunk_size": self._chunk_size,
            },
            "operations": self._operations,
            "summary": {
                "num_chunks": self.num_chunks,
                "total_input_rows": self.total_input_rows,
                "total_output_rows": self.total_output_rows,
                "total_elapsed_seconds": round(self.total_elapsed_seconds, 4),
                "output_hash": self.output_hash(),
            },
            "chunks": [
                {
                    "chunk_index": c.chunk_index,
                    "input_rows": c.input_rows,
                    "output_rows": c.output_rows,
                    "input_hash": c.input_hash,
                    "output_hash": c.output_hash,
                    "elapsed_seconds": round(c.elapsed_seconds, 4),
                }
                for c in self._chunks
            ],
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ChunkedProtocol:
        """Deserialize protocol from a dictionary.

        The "summary" section of the input is ignored: all totals are
        recomputed from the restored chunk list.

        Args:
            data: Dictionary as produced by to_dict().

        Returns:
            ChunkedProtocol instance.
        """
        protocol = cls()
        protocol._created_at = data.get("created_at", protocol._created_at)
        protocol._metadata = data.get("metadata", {})

        source = data.get("source", {})
        protocol._source_path = source.get("path")
        protocol._partition_key = source.get("partition_key")
        protocol._chunk_size = source.get("chunk_size")

        protocol._operations = data.get("operations", [])

        # Chunk entries use required keys: a malformed dict raises KeyError.
        for chunk_data in data.get("chunks", []):
            protocol._chunks.append(
                ChunkInfo(
                    chunk_index=chunk_data["chunk_index"],
                    input_rows=chunk_data["input_rows"],
                    output_rows=chunk_data["output_rows"],
                    input_hash=chunk_data["input_hash"],
                    output_hash=chunk_data["output_hash"],
                    elapsed_seconds=chunk_data["elapsed_seconds"],
                )
            )

        return protocol

    def to_json(self, path: str | Path | None = None, indent: int = 2) -> str:
        """Serialize protocol to JSON.

        Args:
            path: Optional file path to write to.
            indent: JSON indentation level.

        Returns:
            JSON string (returned even when also written to a file).
        """
        json_str = json.dumps(self.to_dict(), indent=indent)

        if path is not None:
            # NOTE(review): write_text uses the platform default encoding
            # here — consider encoding="utf-8" for portability.
            Path(path).write_text(json_str)

        return json_str

    @classmethod
    def from_json(cls, source: str | Path) -> ChunkedProtocol:
        """Deserialize protocol from JSON.

        Args:
            source: Either a JSON string or a path to a JSON file.

        Returns:
            ChunkedProtocol instance.
        """
        # Heuristic: Path objects and strings that do not look like a JSON
        # object literal are treated as file paths.
        if isinstance(source, Path) or not source.strip().startswith("{"):
            content = Path(source).read_text()
        else:
            content = source

        return cls.from_dict(json.loads(content))

    def __repr__(self) -> str:
        """Return string representation of the protocol.

        Returns:
            Human-readable representation.
        """
        return (
            f"ChunkedProtocol({self.num_chunks} chunks, {self.total_input_rows} rows)"
        )

    def __len__(self) -> int:
        """Return number of chunks processed.

        Returns:
            Count of chunks.
        """
        return self.num_chunks

    def summary(self) -> str:
        """Generate a human-readable summary of the chunked processing.

        The report has a header, optional metadata/source sections, overall
        totals, and a per-chunk table.

        Returns:
            Formatted string summary of the protocol.
        """
        lines = [
            "=" * 70,
            "CHUNKED PROCESSING PROTOCOL",
            "=" * 70,
        ]

        # User metadata section (only when present).
        if self._metadata:
            for key, value in self._metadata.items():
                lines.append(f"{key}: {value}")
            lines.append("-" * 70)

        # Source info
        if self._source_path:
            lines.append(f"Source: {self._source_path}")
        if self._partition_key:
            lines.append(f"Partition key: {self._partition_key}")
        if self._chunk_size:
            lines.append(f"Target chunk size: {self._chunk_size:,}")
        lines.extend(
            [
                "-" * 70,
                f"Chunks processed: {self.num_chunks}",
                f"Total input rows: {self.total_input_rows:,}",
                f"Total output rows: {self.total_output_rows:,}",
            ]
        )
        # Only show the row delta when the pipeline changed the row count.
        rows_diff = self.total_output_rows - self.total_input_rows
        if rows_diff != 0:
            lines.append(f"Row change: {rows_diff:+,}")
        lines.append(f"Total time: {self.total_elapsed_seconds:.4f}s")
        if self.num_chunks > 0:
            avg_time = self.total_elapsed_seconds / self.num_chunks
            lines.append(f"Avg time per chunk: {avg_time:.4f}s")
        lines.extend((f"Output hash: {self.output_hash()}", "-" * 70))

        # Per-chunk details
        if self._chunks:
            lines.extend(
                (
                    "",
                    f"{'#':<6} {'Input':<12} {'Output':<12} {'Change':<10} {'Time':<10} {'Hash':<16}",
                    "-" * 70,
                )
            )

            for chunk in self._chunks:
                idx = str(chunk.chunk_index)
                input_rows = f"{chunk.input_rows:,}"
                output_rows = f"{chunk.output_rows:,}"
                change = chunk.output_rows - chunk.input_rows
                # "-" marks unchanged chunks in the table.
                change_str = f"{change:+,}" if change != 0 else "-"
                time_str = f"{chunk.elapsed_seconds:.4f}s"
                hash_str = chunk.output_hash

                lines.append(
                    f"{idx:<6} {input_rows:<12} {output_rows:<12} {change_str:<10} {time_str:<10} {hash_str:<16}"
                )

        lines.append("=" * 70)
        return "\n".join(lines)

    def print(self) -> None:
        """Print the protocol summary to stdout."""
        print(self.summary())  # noqa: T201
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
def validate_chunked_pipeline(  # noqa: C901
    operations: list[tuple[Any, dict[str, Any]]],
    partition_key: str | list[str] | None = None,
) -> ChunkValidationResult:
    """Validate that a pipeline is compatible with chunked processing.

    Each operation is classified via OPERATION_CHUNK_REGISTRY. GLOBAL
    operations are rejected outright; GROUP_DEPENDENT operations must name
    their grouping columns and those columns must be covered by the
    partition key.

    Args:
        operations: List of (method, params) tuples from the pipeline.
        partition_key: Column(s) used for partitioning.

    Returns:
        ChunkValidationResult with validation details.
    """
    errors: list[str] = []
    warnings: list[str] = []
    global_ops: list[str] = []
    group_ops: list[tuple[str, list[str] | None]] = []

    # Normalize the partition key into a set of column names.
    if partition_key is None:
        partition_cols: set[str] = set()
    elif isinstance(partition_key, str):
        partition_cols = {partition_key}
    else:
        partition_cols = set(partition_key)

    for step, kwargs in operations:
        # Registry keys omit the private-method underscore prefix.
        op_name = step.__name__.lstrip("_")
        meta = OPERATION_CHUNK_REGISTRY.get(op_name)

        if meta is None:
            # Unknown operation: warn, but assume it is safe to chunk.
            warnings.append(f"Unknown operation '{op_name}' - assuming chunkable")
        elif meta.chunk_mode is ChunkMode.GLOBAL:
            global_ops.append(op_name)
            errors.append(
                f"Operation '{op_name}' requires the full dataset and cannot be used "
                "with chunked processing"
            )
        elif meta.chunk_mode is ChunkMode.GROUP_DEPENDENT:
            group_param = meta.group_param
            raw = kwargs.get(group_param) if group_param else None
            # Normalize a single column name to a one-element list.
            group_cols = [raw] if isinstance(raw, str) else raw

            group_ops.append((op_name, group_cols))

            if group_cols is None:
                # Without grouping columns the operation is effectively global.
                errors.append(
                    f"Operation '{op_name}' without '{group_param}' parameter requires "
                    "the full dataset. Either specify group columns or remove this operation."
                )
            elif not partition_cols:
                # Group-dependent work needs a partition key to be safe.
                errors.append(
                    f"Operation '{op_name}' groups by {group_cols} but no partition_key "
                    "is specified. Set partition_key to include these columns."
                )
            else:
                # Every grouping column must be covered by the partition key.
                missing = set(group_cols) - partition_cols
                if missing:
                    errors.append(
                        f"Operation '{op_name}' groups by {group_cols} but partition_key "
                        f"is {list(partition_cols)}. Missing columns: {list(missing)}"
                    )

    return ChunkValidationResult(
        is_valid=not errors,
        errors=errors,
        warnings=warnings,
        global_operations=global_ops,
        group_dependent_ops=group_ops,
    )
|