transformplan 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ """TransformPlan - Safe and reproducible data transformation.
2
+
3
+ TransformPlan is a Python library for building data transformation pipelines
4
+ with built-in schema validation, audit trails, and reproducibility guarantees.
5
+
6
+ Main Classes:
7
+ TransformPlan: Build and execute transformation pipelines with method chaining.
8
+ Protocol: Audit trail capturing transformation history with deterministic hashes.
9
+ Col: Column reference for building filter expressions.
10
+ Filter: Base class for serializable filter expressions.
11
+
12
+ Validation Classes:
13
+ ValidationResult: Result of schema validation.
14
+ SchemaValidationError: Exception raised on validation failure.
15
+ DryRunResult: Preview of pipeline execution without modifying data.
16
+
17
+ Utility Functions:
18
+ frame_hash: Compute deterministic hash of a DataFrame.
19
+
20
+ Example:
21
+ >>> import polars as pl
22
+ >>> from transformplan import TransformPlan, Col
23
+ >>>
24
+ >>> df = pl.DataFrame({"name": ["Alice", "Bob"], "age": [25, 30]})
25
+ >>> plan = TransformPlan().rows_filter(Col("age") >= 18)
26
+ >>> result, protocol = plan.process(df)
27
+ >>> protocol.print()
28
+ """
29
+
30
+ from transformplan.chunking import ChunkedProtocol, ChunkingError, ChunkValidationResult
31
+ from transformplan.filters import Col, Filter
32
+ from transformplan.plan import TransformPlan
33
+ from transformplan.protocol import Protocol, frame_hash
34
+ from transformplan.validation import (
35
+ DryRunResult,
36
+ SchemaValidationError,
37
+ ValidationResult,
38
+ )
39
+
40
# Public API of the transformplan package.
# Kept sorted (case-sensitive) so star-imports and generated docs stay
# deterministic; every name here is re-exported from a submodule above.
__all__ = [
    "ChunkValidationResult",
    "ChunkedProtocol",
    "ChunkingError",
    "Col",
    "DryRunResult",
    "Filter",
    "Protocol",
    "SchemaValidationError",
    "TransformPlan",
    "ValidationResult",
    "frame_hash",
]
@@ -0,0 +1,611 @@
1
+ """Chunked processing support for large files.
2
+
3
+ This module provides infrastructure for processing large Parquet files in chunks,
4
+ with support for partition keys to keep related rows together.
5
+
6
+ Classes:
7
+ ChunkMode: Enum classifying operation compatibility with chunking.
8
+ OperationMeta: Metadata about an operation's chunking behavior.
9
+ ChunkValidationResult: Result of validating a pipeline for chunked processing.
10
+ ChunkingError: Exception raised when pipeline is incompatible with chunking.
11
+ ChunkInfo: Information about a processed chunk.
12
+ ChunkedProtocol: Protocol for tracking chunked processing.
13
+
14
+ Example:
15
+ >>> plan = TransformPlan().col_rename("id", "patient_id")
16
+ >>> result, protocol = plan.process_chunked(
17
+ ... source="patients.parquet",
18
+ ... partition_key="patient_id",
19
+ ... chunk_size=100_000,
20
+ ... )
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import hashlib
26
+ import json
27
+ from dataclasses import dataclass, field
28
+ from datetime import datetime, timezone
29
+ from enum import Enum, auto
30
+ from pathlib import Path
31
+ from typing import Any
32
+
33
+
34
class ChunkMode(Enum):
    """How an operation interacts with chunked processing.

    Members:
        CHUNKABLE: Row-local work; any chunk can be processed independently.
        GROUP_DEPENDENT: Correct only when all rows of a group land in the
            same chunk (e.g. rows_unique) -- requires a matching partition key.
        GLOBAL: Needs the entire dataset at once (e.g. rows_sort) and is
            therefore blocked in chunked mode.
    """

    CHUNKABLE = auto()
    GROUP_DEPENDENT = auto()
    GLOBAL = auto()
45
+
46
+
47
@dataclass
class OperationMeta:
    """Chunking metadata for a single pipeline operation.

    Attributes:
        chunk_mode: How the operation behaves under chunked processing.
        group_param: For GROUP_DEPENDENT operations, the name of the keyword
            parameter that carries the grouping columns; None otherwise.
    """

    # Classification used by validate_chunked_pipeline to allow/block the op.
    chunk_mode: ChunkMode
    # Only meaningful when chunk_mode is GROUP_DEPENDENT.
    group_param: str | None = None
58
+
59
+
60
# Registry mapping operation names to their chunking metadata.
# Operations are listed once per chunk mode and the registry is assembled
# below; lookups go through OPERATION_CHUNK_REGISTRY.get(op_name).

# Row-local operations: any chunk can be processed independently.
_CHUNKABLE_OPS: tuple[str, ...] = (
    # Column operations
    "col_drop", "col_rename", "col_cast", "col_reorder", "col_select",
    "col_duplicate", "col_fill_null", "col_drop_null", "col_drop_zero",
    "col_add", "col_add_uuid", "col_hash", "col_coalesce",
    # Math scalar operations
    "math_add", "math_subtract", "math_multiply", "math_divide",
    "math_clamp", "math_abs", "math_round", "math_set_min", "math_set_max",
    # Math column operations
    "math_add_columns", "math_subtract_columns", "math_multiply_columns",
    "math_divide_columns", "math_percent_of",
    # String operations
    "str_replace", "str_slice", "str_truncate", "str_lower", "str_upper",
    "str_strip", "str_pad", "str_split", "str_concat", "str_extract",
    # Datetime operations
    "dt_year", "dt_month", "dt_day", "dt_week", "dt_quarter",
    "dt_year_month", "dt_quarter_year", "dt_calendar_week", "dt_parse",
    "dt_format", "dt_diff_days", "dt_age_years", "dt_is_between",
    "dt_truncate",
    # Map operations
    "map_values", "map_discretize", "map_bool_to_int", "map_null_to_value",
    "map_value_to_null", "map_case", "map_from_column",
    # Row operations (row-local subset)
    "rows_filter", "rows_drop", "rows_flag", "rows_explode",
    "rows_drop_nulls", "rows_melt",
)

# Group-dependent operations, mapped to the parameter that names the
# grouping columns (needed so validation can check the partition key).
_GROUP_DEPENDENT_OPS: dict[str, str] = {
    "math_cumsum": "group_by",
    "math_rank": "group_by",
    "rows_unique": "columns",
    "rows_deduplicate": "columns",
}

# Operations that need the full dataset and are blocked in chunked mode.
_GLOBAL_OPS: tuple[str, ...] = (
    "rows_sort", "rows_pivot", "rows_sample", "rows_head", "rows_tail",
)

OPERATION_CHUNK_REGISTRY: dict[str, OperationMeta] = {
    **{name: OperationMeta(ChunkMode.CHUNKABLE) for name in _CHUNKABLE_OPS},
    **{
        name: OperationMeta(ChunkMode.GROUP_DEPENDENT, group_param=param)
        for name, param in _GROUP_DEPENDENT_OPS.items()
    },
    **{name: OperationMeta(ChunkMode.GLOBAL) for name in _GLOBAL_OPS},
}
147
+
148
+
149
@dataclass
class ChunkValidationResult:
    """Outcome of validating a pipeline for chunked processing.

    Attributes:
        is_valid: Whether the pipeline can be processed in chunks.
        errors: Messages explaining blocking incompatibilities.
        warnings: Non-blocking messages (e.g. unknown operations).
        global_operations: Names of operations requiring the full dataset.
        group_dependent_ops: (operation, columns) pairs for group-dependent ops.
    """

    is_valid: bool
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    global_operations: list[str] = field(default_factory=list)
    group_dependent_ops: list[tuple[str, list[str] | None]] = field(
        default_factory=list
    )

    def __str__(self) -> str:
        """Render the result as a human-readable multi-line report."""
        verdict = (
            "Pipeline is compatible with chunked processing."
            if self.is_valid
            else "Pipeline is NOT compatible with chunked processing."
        )
        parts: list[str] = [verdict]

        if self.errors:
            parts.append("\nErrors:")
            parts += [f" - {error}" for error in self.errors]

        if self.warnings:
            parts.append("\nWarnings:")
            parts += [f" - {warning}" for warning in self.warnings]

        if self.global_operations:
            parts.append(f"\nGlobal operations (blocked): {self.global_operations}")

        if self.group_dependent_ops:
            parts.append("\nGroup-dependent operations:")
            parts += [
                f" - {op}: groups by {cols}"
                for op, cols in self.group_dependent_ops
            ]

        return "\n".join(parts)
194
+
195
+
196
class ChunkingError(Exception):
    """Raised when a pipeline cannot be processed in chunks.

    Attributes:
        validation_result: Optional ChunkValidationResult with the detailed
            list of incompatibilities, when available.
    """

    def __init__(
        self, message: str, validation_result: ChunkValidationResult | None = None
    ) -> None:
        """Store the message on the base Exception and keep the details."""
        super().__init__(message)
        self.validation_result = validation_result
209
+
210
+
211
@dataclass
class ChunkInfo:
    """Statistics recorded for one processed chunk.

    Attributes:
        chunk_index: Zero-based index of this chunk.
        input_rows: Number of rows in the input chunk.
        output_rows: Number of rows after processing.
        input_hash: Hash of the input chunk data.
        output_hash: Hash of the output chunk data.
        elapsed_seconds: Processing time for this chunk.
    """

    # Position of the chunk within the overall run.
    chunk_index: int
    # Row counts before and after the pipeline ran on this chunk.
    input_rows: int
    output_rows: int
    # Content hashes used for reproducibility checks.
    input_hash: str
    output_hash: str
    # Wall-clock processing time for this chunk.
    elapsed_seconds: float
230
+
231
+
232
class ChunkedProtocol:
    """Protocol for tracking chunked processing with per-chunk information.

    Tracks the overall run (source file, partition key, operations, metadata)
    as well as individual chunk statistics (row counts, hashes, timing).

    Attributes:
        VERSION: Protocol version string embedded in serialized output.
    """

    VERSION = "1.0"

    def __init__(self) -> None:
        """Initialize an empty ChunkedProtocol."""
        self._chunks: list[ChunkInfo] = []
        self._source_path: str | None = None
        self._partition_key: list[str] | None = None
        self._chunk_size: int | None = None
        # Timezone-aware UTC timestamp so serialized protocols are unambiguous.
        self._created_at: str = datetime.now(timezone.utc).isoformat()
        self._metadata: dict[str, Any] = {}
        self._operations: list[dict[str, Any]] = []

    def set_source(
        self,
        path: str,
        partition_key: list[str] | None,
        chunk_size: int,
    ) -> None:
        """Set source file information.

        Args:
            path: Path of the source file being processed.
            partition_key: Columns used to keep related rows in one chunk.
            chunk_size: Target number of rows per chunk.
        """
        self._source_path = path
        self._partition_key = partition_key
        self._chunk_size = chunk_size

    def set_operations(self, operations: list[dict[str, Any]]) -> None:
        """Record the operations that were applied."""
        self._operations = operations

    def set_metadata(self, **kwargs: Any) -> None:  # noqa: ANN401
        """Set arbitrary metadata on the protocol (merged into existing keys)."""
        self._metadata.update(kwargs)

    def add_chunk(self, chunk_info: ChunkInfo) -> None:
        """Add information about a processed chunk."""
        self._chunks.append(chunk_info)

    @property
    def chunks(self) -> list[ChunkInfo]:
        """List of chunk information.

        Returns:
            List of ChunkInfo instances, in processing order.
        """
        return self._chunks

    @property
    def total_input_rows(self) -> int:
        """Total rows across all input chunks.

        Returns:
            Sum of input rows.
        """
        return sum(c.input_rows for c in self._chunks)

    @property
    def total_output_rows(self) -> int:
        """Total rows across all output chunks.

        Returns:
            Sum of output rows.
        """
        return sum(c.output_rows for c in self._chunks)

    @property
    def total_elapsed_seconds(self) -> float:
        """Total processing time across all chunks.

        Returns:
            Sum of elapsed seconds.
        """
        return sum(c.elapsed_seconds for c in self._chunks)

    @property
    def num_chunks(self) -> int:
        """Number of chunks processed.

        Returns:
            Count of chunks.
        """
        return len(self._chunks)

    @property
    def metadata(self) -> dict[str, Any]:
        """Protocol metadata.

        Returns:
            Dictionary of metadata.
        """
        return self._metadata

    def output_hash(self) -> str:
        """Compute a combined hash of all output chunk hashes.

        Returns:
            A 16-character hex hash of all chunk output hashes combined,
            or an empty string if no chunks were processed.
        """
        if not self._chunks:
            return ""
        # "|" separator keeps per-chunk hashes from blending into each other.
        combined = "|".join(c.output_hash for c in self._chunks)
        return hashlib.sha256(combined.encode()).hexdigest()[:16]

    def to_dict(self) -> dict[str, Any]:
        """Serialize protocol to a dictionary.

        Returns:
            Dictionary representation of the protocol (the inverse of
            from_dict).
        """
        return {
            "version": self.VERSION,
            "created_at": self._created_at,
            "metadata": self._metadata,
            "source": {
                "path": self._source_path,
                "partition_key": self._partition_key,
                "chunk_size": self._chunk_size,
            },
            "operations": self._operations,
            "summary": {
                "num_chunks": self.num_chunks,
                "total_input_rows": self.total_input_rows,
                "total_output_rows": self.total_output_rows,
                "total_elapsed_seconds": round(self.total_elapsed_seconds, 4),
                "output_hash": self.output_hash(),
            },
            "chunks": [
                {
                    "chunk_index": c.chunk_index,
                    "input_rows": c.input_rows,
                    "output_rows": c.output_rows,
                    "input_hash": c.input_hash,
                    "output_hash": c.output_hash,
                    "elapsed_seconds": round(c.elapsed_seconds, 4),
                }
                for c in self._chunks
            ],
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ChunkedProtocol:
        """Deserialize protocol from a dictionary.

        Args:
            data: Dictionary produced by to_dict(). Missing sections fall
                back to empty defaults.

        Returns:
            ChunkedProtocol instance.
        """
        protocol = cls()
        protocol._created_at = data.get("created_at", protocol._created_at)
        protocol._metadata = data.get("metadata", {})

        source = data.get("source", {})
        protocol._source_path = source.get("path")
        protocol._partition_key = source.get("partition_key")
        protocol._chunk_size = source.get("chunk_size")

        protocol._operations = data.get("operations", [])

        for chunk_data in data.get("chunks", []):
            protocol._chunks.append(
                ChunkInfo(
                    chunk_index=chunk_data["chunk_index"],
                    input_rows=chunk_data["input_rows"],
                    output_rows=chunk_data["output_rows"],
                    input_hash=chunk_data["input_hash"],
                    output_hash=chunk_data["output_hash"],
                    elapsed_seconds=chunk_data["elapsed_seconds"],
                )
            )

        return protocol

    def to_json(self, path: str | Path | None = None, indent: int = 2) -> str:
        """Serialize protocol to JSON.

        Args:
            path: Optional file path to write to.
            indent: JSON indentation level.

        Returns:
            JSON string.
        """
        json_str = json.dumps(self.to_dict(), indent=indent)

        if path is not None:
            # Explicit encoding: the platform default (locale-dependent) can
            # corrupt non-ASCII metadata when files move between systems.
            Path(path).write_text(json_str, encoding="utf-8")

        return json_str

    @classmethod
    def from_json(cls, source: str | Path) -> ChunkedProtocol:
        """Deserialize protocol from JSON.

        Args:
            source: Either a JSON string or a path to a JSON file.

        Returns:
            ChunkedProtocol instance.
        """
        # A Path, or a string that does not look like a JSON object,
        # is treated as a file path; anything else is parsed directly.
        if isinstance(source, Path) or not source.strip().startswith("{"):
            content = Path(source).read_text(encoding="utf-8")
        else:
            content = source

        return cls.from_dict(json.loads(content))

    def __repr__(self) -> str:
        """Return string representation of the protocol.

        Returns:
            Human-readable representation.
        """
        return (
            f"ChunkedProtocol({self.num_chunks} chunks, {self.total_input_rows} rows)"
        )

    def __len__(self) -> int:
        """Return number of chunks processed.

        Returns:
            Count of chunks.
        """
        return self.num_chunks

    def summary(self) -> str:
        """Generate a human-readable summary of the chunked processing.

        Returns:
            Formatted string summary of the protocol.
        """
        lines = [
            "=" * 70,
            "CHUNKED PROCESSING PROTOCOL",
            "=" * 70,
        ]

        if self._metadata:
            for key, value in self._metadata.items():
                lines.append(f"{key}: {value}")
            lines.append("-" * 70)

        # Source info
        if self._source_path:
            lines.append(f"Source: {self._source_path}")
        if self._partition_key:
            lines.append(f"Partition key: {self._partition_key}")
        if self._chunk_size:
            lines.append(f"Target chunk size: {self._chunk_size:,}")
        lines.extend(
            [
                "-" * 70,
                f"Chunks processed: {self.num_chunks}",
                f"Total input rows: {self.total_input_rows:,}",
                f"Total output rows: {self.total_output_rows:,}",
            ]
        )
        rows_diff = self.total_output_rows - self.total_input_rows
        if rows_diff != 0:
            lines.append(f"Row change: {rows_diff:+,}")
        lines.append(f"Total time: {self.total_elapsed_seconds:.4f}s")
        if self.num_chunks > 0:
            avg_time = self.total_elapsed_seconds / self.num_chunks
            lines.append(f"Avg time per chunk: {avg_time:.4f}s")
        lines.extend((f"Output hash: {self.output_hash()}", "-" * 70))

        # Per-chunk details
        if self._chunks:
            lines.extend(
                (
                    "",
                    f"{'#':<6} {'Input':<12} {'Output':<12} {'Change':<10} {'Time':<10} {'Hash':<16}",
                    "-" * 70,
                )
            )

            for chunk in self._chunks:
                idx = str(chunk.chunk_index)
                input_rows = f"{chunk.input_rows:,}"
                output_rows = f"{chunk.output_rows:,}"
                change = chunk.output_rows - chunk.input_rows
                change_str = f"{change:+,}" if change != 0 else "-"
                time_str = f"{chunk.elapsed_seconds:.4f}s"
                hash_str = chunk.output_hash

                lines.append(
                    f"{idx:<6} {input_rows:<12} {output_rows:<12} {change_str:<10} {time_str:<10} {hash_str:<16}"
                )

        lines.append("=" * 70)
        return "\n".join(lines)

    def print(self) -> None:
        """Print the protocol summary to stdout."""
        print(self.summary())  # noqa: T201
531
+
532
+
533
def validate_chunked_pipeline(  # noqa: C901
    operations: list[tuple[Any, dict[str, Any]]],
    partition_key: str | list[str] | None = None,
) -> ChunkValidationResult:
    """Validate that a pipeline is compatible with chunked processing.

    Args:
        operations: List of (method, params) tuples from the pipeline.
        partition_key: Column(s) used for partitioning.

    Returns:
        ChunkValidationResult with validation details.
    """
    collected_errors: list[str] = []
    collected_warnings: list[str] = []
    blocked_ops: list[str] = []
    grouped_ops: list[tuple[str, list[str] | None]] = []

    # Normalize the partition key into a set of column names.
    if partition_key is None:
        key_columns: set[str] = set()
    elif isinstance(partition_key, str):
        key_columns = {partition_key}
    else:
        key_columns = set(partition_key)

    for method, params in operations:
        name = method.__name__.lstrip("_")
        meta = OPERATION_CHUNK_REGISTRY.get(name)

        if meta is None:
            # Not in the registry: warn, but do not block the pipeline.
            collected_warnings.append(
                f"Unknown operation '{name}' - assuming chunkable"
            )
            continue

        if meta.chunk_mode is ChunkMode.GLOBAL:
            blocked_ops.append(name)
            collected_errors.append(
                f"Operation '{name}' requires the full dataset and cannot be used "
                "with chunked processing"
            )
            continue

        if meta.chunk_mode is not ChunkMode.GROUP_DEPENDENT:
            # CHUNKABLE operations need no further checks.
            continue

        param_name = meta.group_param
        columns = params.get(param_name) if param_name else None
        # Accept a single column name as shorthand for a one-element list.
        if isinstance(columns, str):
            columns = [columns]

        grouped_ops.append((name, columns))

        if columns is None:
            # No grouping specified means the op effectively needs everything.
            collected_errors.append(
                f"Operation '{name}' without '{param_name}' parameter requires "
                "the full dataset. Either specify group columns or remove this operation."
            )
        elif not key_columns:
            # Group-dependent op but no partition key configured at all.
            collected_errors.append(
                f"Operation '{name}' groups by {columns} but no partition_key "
                "is specified. Set partition_key to include these columns."
            )
        elif not set(columns).issubset(key_columns):
            # Partition key does not cover every grouping column.
            uncovered = set(columns) - key_columns
            collected_errors.append(
                f"Operation '{name}' groups by {columns} but partition_key "
                f"is {list(key_columns)}. Missing columns: {list(uncovered)}"
            )

    return ChunkValidationResult(
        is_valid=not collected_errors,
        errors=collected_errors,
        warnings=collected_warnings,
        global_operations=blocked_ops,
        group_dependent_ops=grouped_ops,
    )