xlr8 0.1.7b2__cp313-cp313-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,792 @@
1
+ """
2
+ Partition-based callback streaming for data lake population and other use cases.
3
+
4
+ ================================================================================
5
+ ARCHITECTURE - STREAM TO CALLBACK WITH PARTITIONING
6
+ ================================================================================
7
+
8
+ This module implements a two-phase approach:
9
+
10
+ PHASE 1: Download to Cache (reuses existing Rust backend)
11
+ ────────────────────────────────────────────────────────────────────────────────
12
+ MongoDB --> Rust Workers --> Parquet Cache (on disk)
13
+
14
+ Uses execute_parallel_stream_to_cache() - battle-tested, memory-safe.
15
+
16
+ PHASE 2: Partition + Parallel Callbacks
17
+ ────────────────────────────────────────────────────────────────────────────────
18
+ 1. Build partition plan using DuckDB:
19
+ - Discover unique (time_bucket, partition_key) combinations
20
+ - Create work items for each partition
21
+
22
+ 2. Execute callbacks in parallel (ThreadPoolExecutor):
23
+ - Each worker: DuckDB query -> PyArrow Table -> decode -> callback()
24
+ - DuckDB releases GIL -> true parallelism
25
+ - User callbacks can use non-picklable objects (boto3, etc.)
26
+
27
+ EDGE CASES HANDLED:
28
+ ────────────────────────────────────────────────────────────────────────────────
29
+ - NULL values in partition_by fields -> grouped as one partition
30
+ - Empty partitions (no data in time bucket) -> skipped
31
+ - Parent fields (e.g., "metadata") -> expanded to child fields
32
+ - Types.Any() fields -> decoded based on any_type_strategy
33
+ - ObjectIds -> converted to strings (same as to_polars)
34
+ - Large partitions -> DuckDB streams internally, memory-safe
35
+ - Timezone handling -> all datetimes normalized to UTC
36
+
37
+ ================================================================================
38
+ """
39
+
40
+ import logging
41
+ from concurrent.futures import ThreadPoolExecutor, as_completed
42
+ from dataclasses import dataclass
43
+ from datetime import datetime, timedelta, timezone
44
+ from pathlib import Path
45
+ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, cast
46
+
47
+ import polars as pl
48
+ import pyarrow as pa
49
+
50
+ from xlr8.schema.types import Any as AnyType
51
+ from xlr8.schema.types import ObjectId as ObjectIdType
52
+
53
+ logger = logging.getLogger(__name__)
54
+
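+
+ # Example (illustrative sketch): the callback contract.  Each invocation receives one
+ # partition as a PyArrow Table plus the metadata dict assembled in
+ # _execute_partition_callback below; the logging-only body is just for demonstration.
+ def example_partition_callback(table: pa.Table, metadata: Dict[str, Any]) -> None:
+     logger.info(
+         "partition %d/%d [%s, %s) keys=%s rows=%d",
+         metadata["partition_index"] + 1,
+         metadata["total_partitions"],
+         metadata["time_start"].isoformat(),
+         metadata["time_end"].isoformat(),
+         metadata["partition_values"],
+         table.num_rows,
+     )
+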
55
+
56
+ @dataclass
57
+ class PartitionWorkItem:
58
+ """A single partition to process."""
59
+
60
+ index: int
61
+ total: int
62
+ time_start: datetime
63
+ time_end: datetime
64
+ partition_values: Optional[Dict[str, Any]] # None if no partition_by
65
+ partition_fields: Optional[List[str]] # Fields used for partitioning
66
+
67
+
68
+ def _expand_parent_fields(
69
+ fields: List[str],
70
+ schema: Any,
71
+ ) -> List[str]:
72
+ """
73
+ Expand parent fields to their children in schema definition order.
74
+
75
+ When user specifies a parent field like "metadata" but the schema has
76
+ flattened fields like "metadata.device_id", expand to all children.
77
+
78
+ Args:
79
+ fields: Original field list
80
+ schema: XLR8 schema with field definitions
81
+
82
+ Returns:
83
+ Expanded field list with parent fields replaced by children
84
+
85
+ Raises:
86
+ ValueError: If field not found and no children exist
87
+ """
88
+ if schema is None:
89
+ return fields
90
+
91
+ all_schema_fields = list(schema.fields.keys())
92
+ expanded = []
93
+
94
+ for field_name in fields:
95
+ if schema.has_field(field_name):
96
+ # Field exists directly in schema
97
+ expanded.append(field_name)
98
+ else:
99
+ # Look for child fields with this prefix (in schema order)
100
+ prefix = f"{field_name}."
101
+ children = [f for f in all_schema_fields if f.startswith(prefix)]
102
+
103
+ if children:
104
+ logger.info(
105
+ "Partition field '%s' expanded to children: %s",
106
+ field_name,
107
+ children,
108
+ )
109
+ expanded.extend(children)
110
+ else:
111
+ raise ValueError(
112
+ (
113
+ f"Partition field '{field_name}' not found in schema. "
114
+                         "It has no child fields. "
115
+ f"Available fields: {sorted(all_schema_fields)[:10]}"
116
+ + ("..." if len(all_schema_fields) > 10 else "")
117
+ )
118
+ )
119
+
120
+ return expanded
121
+
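+
+ # Example (illustrative sketch): how parent-field expansion behaves.  _DemoSchema is a
+ # stand-in that mimics only the two attributes this helper touches (.fields and
+ # .has_field()); the real XLR8 schema object is richer.
+ def _demo_expand_parent_fields() -> None:
+     class _DemoSchema:
+         fields = {
+             "timestamp": None,
+             "metadata.device_id": None,
+             "metadata.sensor_id": None,
+         }
+
+         def has_field(self, name: str) -> bool:
+             return name in self.fields
+
+     assert _expand_parent_fields(["timestamp"], _DemoSchema()) == ["timestamp"]
+     assert _expand_parent_fields(["metadata"], _DemoSchema()) == [
+         "metadata.device_id",
+         "metadata.sensor_id",
+     ]
+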
122
+
123
+ def _timedelta_to_duckdb_interval(td: timedelta) -> str:
124
+ """
125
+ Convert Python timedelta to DuckDB interval string.
126
+
127
+ Examples:
128
+ timedelta(days=7) -> "7 days"
129
+ timedelta(hours=16) -> "16 hours"
130
+ timedelta(minutes=30) -> "30 minutes"
131
+ """
132
+ total_seconds = int(td.total_seconds())
133
+
134
+ if total_seconds >= 86400 and total_seconds % 86400 == 0:
135
+ days = total_seconds // 86400
136
+ return f"{days} day" if days == 1 else f"{days} days"
137
+ elif total_seconds >= 3600 and total_seconds % 3600 == 0:
138
+ hours = total_seconds // 3600
139
+ return f"{hours} hour" if hours == 1 else f"{hours} hours"
140
+ elif total_seconds >= 60 and total_seconds % 60 == 0:
141
+ minutes = total_seconds // 60
142
+ return f"{minutes} minute" if minutes == 1 else f"{minutes} minutes"
143
+ else:
144
+ return f"{total_seconds} seconds"
145
+
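+
+ # Example (illustrative sketch): the helper picks the largest unit that divides the
+ # duration evenly, and falls back to raw seconds otherwise.
+ def _demo_interval_strings() -> None:
+     assert _timedelta_to_duckdb_interval(timedelta(days=7)) == "7 days"
+     assert _timedelta_to_duckdb_interval(timedelta(days=1)) == "1 day"
+     assert _timedelta_to_duckdb_interval(timedelta(days=1, hours=12)) == "36 hours"
+     assert _timedelta_to_duckdb_interval(timedelta(minutes=90)) == "90 minutes"
+     assert _timedelta_to_duckdb_interval(timedelta(seconds=90)) == "90 seconds"
+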
146
+
147
+ def _build_partition_plan(
148
+ cache_dir: str,
149
+ time_field: str,
150
+ partition_time_delta: timedelta,
151
+ partition_by: Optional[List[str]],
152
+ memory_limit_mb: int,
153
+ threads: int,
154
+ ) -> List[PartitionWorkItem]:
155
+ """
156
+ Build partition plan by discovering unique partitions in the cached data.
157
+
158
+ Uses DuckDB to efficiently scan all parquet files and find unique
159
+ (time_bucket, partition_key) combinations.
160
+
161
+ Natural Time Boundaries:
162
+ - First partition's start time is floored to the start of the day (00:00:00)
163
+     - Each partition's end time is its bucket start + partition_time_delta
164
+ - This creates clean, predictable partition boundaries for data lakes
165
+
166
+ Args:
167
+ cache_dir: Path to cache directory with parquet files
168
+ time_field: Name of the timestamp field
169
+ partition_time_delta: Time bucket size (e.g., 7 days)
170
+ partition_by: List of fields to partition by (e.g., ["metadata.instrument"])
171
+ memory_limit_mb: DuckDB memory limit
172
+ threads: DuckDB thread count
173
+
174
+ Returns:
175
+ List of PartitionWorkItem to process
176
+ """
177
+ import duckdb
178
+
179
+ cache_path = Path(cache_dir)
180
+ parquet_files = list(cache_path.glob("*.parquet"))
181
+
182
+ if not parquet_files:
183
+ logger.warning("No parquet files found in %s", cache_dir)
184
+ return []
185
+
186
+ file_paths = [str(f) for f in parquet_files]
187
+ files_list = ", ".join([f"'{f}'" for f in file_paths])
188
+
189
+ # STEP 1: Get global min timestamp to establish natural day boundary
190
+ global_min_query = f"""
191
+ SELECT MIN("{time_field}") AS global_min_time
192
+ FROM read_parquet([{files_list}])
193
+ """
194
+
195
+ try:
196
+ conn = duckdb.connect(":memory:")
197
+
198
+ # Configure DuckDB
199
+ if memory_limit_mb:
200
+ conn.execute(f"SET memory_limit = '{memory_limit_mb}MB'")
201
+ if threads:
202
+ conn.execute(f"SET threads = {threads}")
203
+
204
+ # Get global min and floor to start of day
205
+ global_result = cast(
206
+ Optional[Tuple[Any, ...]],
207
+ conn.execute(global_min_query).fetchone(),
208
+ )
209
+ if global_result is None or global_result[0] is None:
210
+ logger.warning("No data found in parquet files")
211
+ conn.close()
212
+ return []
213
+
214
+ global_min_time = global_result[0]
215
+
216
+ # Ensure timezone aware
217
+ if hasattr(global_min_time, "tzinfo") and global_min_time.tzinfo is None:
218
+ global_min_time = global_min_time.replace(tzinfo=timezone.utc)
219
+
220
+ # Floor to start of day (zero hours, mins, seconds, microseconds)
221
+ floored_start = global_min_time.replace(
222
+ hour=0,
223
+ minute=0,
224
+ second=0,
225
+ microsecond=0,
226
+ )
227
+
228
+ logger.info(
229
+ "Natural time boundary: floored %s -> %s",
230
+ global_min_time.isoformat(),
231
+ floored_start.isoformat(),
232
+ )
233
+
234
+ # Convert timedelta to DuckDB interval
235
+ interval = _timedelta_to_duckdb_interval(partition_time_delta)
236
+
237
+ # STEP 2: Build partition query using floored start as origin
238
+ # DuckDB's time_bucket can take an origin parameter
239
+ origin_str = floored_start.strftime("%Y-%m-%d %H:%M:%S")
240
+
241
+ # Build SELECT clause for partition keys
242
+ select_parts = [
243
+ f"time_bucket(INTERVAL '{interval}', \"{time_field}\", "
244
+ f"TIMESTAMP '{origin_str}') AS time_bucket"
245
+ ]
246
+ group_parts = ["time_bucket"]
247
+
248
+ if partition_by:
249
+ for field in partition_by:
250
+ # Quote field name properly for DuckDB (handles dots in names)
251
+ select_parts.append(f'"{field}" AS "{field}"')
252
+ group_parts.append(f'"{field}"')
253
+
254
+ select_clause = ", ".join(select_parts)
255
+ group_clause = ", ".join(group_parts)
256
+
257
+ # Build query to discover partitions
258
+ query = f"""
259
+ SELECT
260
+ {select_clause},
261
+ MIN("{time_field}") AS actual_min_time,
262
+ MAX("{time_field}") AS actual_max_time,
263
+ COUNT(*) AS row_count
264
+ FROM read_parquet([{files_list}])
265
+ GROUP BY {group_clause}
266
+ ORDER BY time_bucket
267
+ """
268
+
269
+         # Execute once and reuse the cursor for both column names and rows
+         cursor = conn.execute(query)
+         columns = [desc[0] for desc in cursor.description]  # type: ignore[union-attr]
+         result = cursor.fetchall()
271
+ conn.close()
272
+
273
+ # Build work items from results
274
+ work_items = []
275
+ total = len(result)
276
+
277
+ for idx, row in enumerate(result):
278
+ row_dict = dict(zip(columns, row))
279
+
280
+ # Extract time bucket
281
+ time_bucket = row_dict["time_bucket"]
282
+
283
+ # Calculate time_start and time_end
284
+ # time_bucket is the start of the bucket (aligned to floored origin)
285
+ if isinstance(time_bucket, datetime):
286
+ time_start = time_bucket
287
+ if time_start.tzinfo is None:
288
+ time_start = time_start.replace(tzinfo=timezone.utc)
289
+ time_end = time_start + partition_time_delta
290
+ else:
291
+ # Fallback: use actual min/max from data
292
+ time_start = row_dict["actual_min_time"]
293
+ time_end = row_dict["actual_max_time"]
294
+ if time_start.tzinfo is None:
295
+ time_start = time_start.replace(tzinfo=timezone.utc)
296
+ if time_end.tzinfo is None:
297
+ time_end = time_end.replace(tzinfo=timezone.utc)
298
+
299
+ # Extract partition values
300
+ partition_values = None
301
+ if partition_by:
302
+ partition_values = {}
303
+ for field in partition_by:
304
+ partition_values[field] = row_dict.get(field)
305
+
306
+ work_items.append(
307
+ PartitionWorkItem(
308
+ index=idx,
309
+ total=total,
310
+ time_start=time_start,
311
+ time_end=time_end,
312
+ partition_values=partition_values,
313
+ partition_fields=partition_by,
314
+ )
315
+ )
316
+
317
+ return work_items
318
+
319
+ except (duckdb.Error, KeyError, AttributeError, TypeError, ValueError) as e:
320
+ logger.error("Failed to build partition plan: %s", e)
321
+ raise
322
+
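+
+ # Example (illustrative sketch): how DuckDB's time_bucket() aligns buckets to the
+ # floored origin computed above.  Uses only an in-memory DuckDB connection; the
+ # literal timestamps are arbitrary.
+ def _demo_time_bucket_alignment() -> None:
+     import duckdb
+
+     conn = duckdb.connect(":memory:")
+     row = conn.execute(
+         "SELECT time_bucket(INTERVAL '7 days', "
+         "TIMESTAMP '2024-01-09 15:30:00', "
+         "TIMESTAMP '2024-01-03 00:00:00')"
+     ).fetchone()
+     conn.close()
+     # The sample timestamp falls inside the bucket that starts at the origin,
+     # so the bucket start is 2024-01-03 00:00:00.
+     print(row)
+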
323
+
324
+ def _build_partition_query(
325
+ cache_dir: str,
326
+ time_field: str,
327
+ work_item: PartitionWorkItem,
328
+ sort_ascending: bool = True,
329
+ ) -> str:
330
+ """
331
+ Build DuckDB query to fetch data for a single partition.
332
+
333
+ Args:
334
+ cache_dir: Path to cache directory
335
+ time_field: Timestamp field name
336
+ work_item: Partition work item with time bounds and partition values
337
+ sort_ascending: Sort direction for time field
338
+
339
+ Returns:
340
+ DuckDB SQL query string
341
+ """
342
+ cache_path = Path(cache_dir)
343
+ parquet_files = list(cache_path.glob("*.parquet"))
344
+ files_list = ", ".join([f"'{str(f)}'" for f in parquet_files])
345
+
346
+ # Build WHERE clauses
347
+ where_clauses = []
348
+
349
+ # Time bounds - use proper timestamp formatting
350
+ time_start_iso = work_item.time_start.isoformat()
351
+ time_end_iso = work_item.time_end.isoformat()
352
+ where_clauses.append(f"\"{time_field}\" >= '{time_start_iso}'::TIMESTAMPTZ")
353
+ where_clauses.append(f"\"{time_field}\" < '{time_end_iso}'::TIMESTAMPTZ")
354
+
355
+ # Partition values
356
+ if work_item.partition_values:
357
+ for field, value in work_item.partition_values.items():
358
+ if value is None:
359
+ where_clauses.append(f'"{field}" IS NULL')
360
+ elif isinstance(value, str):
361
+ # Escape single quotes in string values
362
+ escaped = value.replace("'", "''")
363
+ where_clauses.append(f"\"{field}\" = '{escaped}'")
364
+ elif isinstance(value, bool):
365
+ where_clauses.append(f'"{field}" = {str(value).upper()}')
366
+ elif isinstance(value, (int, float)):
367
+ where_clauses.append(f'"{field}" = {value}')
368
+ else:
369
+ # Convert to string for complex types
370
+ escaped = str(value).replace("'", "''")
371
+ where_clauses.append(f"\"{field}\" = '{escaped}'")
372
+
373
+ where_clause = " AND ".join(where_clauses)
374
+ order_dir = "ASC" if sort_ascending else "DESC"
375
+
376
+ return f"""
377
+ SELECT *
378
+ FROM read_parquet([{files_list}])
379
+ WHERE {where_clause}
380
+ ORDER BY "{time_field}" {order_dir}
381
+ """
382
+
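+
+ # Example (illustrative sketch): the WHERE clause generated for one work item,
+ # including a NULL partition value (NULLs are grouped as their own partition).
+ # The temporary cache directory is empty, so read_parquet([]) is only a placeholder,
+ # and the "timestamp" field name is illustrative.
+ def _demo_partition_query() -> None:
+     import tempfile
+
+     item = PartitionWorkItem(
+         index=0,
+         total=1,
+         time_start=datetime(2024, 1, 3, tzinfo=timezone.utc),
+         time_end=datetime(2024, 1, 10, tzinfo=timezone.utc),
+         partition_values={"metadata.device_id": "alpha", "metadata.sensor_id": None},
+         partition_fields=["metadata.device_id", "metadata.sensor_id"],
+     )
+     with tempfile.TemporaryDirectory() as tmp:
+         # Expect: time bounds as TIMESTAMPTZ literals, "metadata.device_id" = 'alpha',
+         # "metadata.sensor_id" IS NULL, ordered by "timestamp" ASC.
+         print(_build_partition_query(tmp, "timestamp", item))
+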
383
+
384
+ def _decode_struct_values_polars(
385
+ df: pl.DataFrame,
386
+ schema: Any,
387
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
388
+ ) -> pl.DataFrame:
389
+ """
390
+ Decode struct-encoded Any-typed columns back to actual values (Polars).
391
+
392
+ This is copied from reader.py to avoid circular imports and reuse
393
+ battle-tested Polars decode logic. DuckDB -> Polars -> decode -> Arrow.
394
+
395
+ Args:
396
+ df: Polars DataFrame from DuckDB
397
+ schema: Schema with field type info
398
+ any_type_strategy: How to decode:
399
+ - "float": Coalesce to Float64, prioritize numeric (default)
400
+ - "string": Convert everything to string (lossless)
401
+ - "keep_struct": Keep raw struct, don't decode
402
+ """
403
+ if not hasattr(schema, "fields"):
404
+ return df
405
+
406
+ # Find Any-typed fields in schema
407
+ for field_name, field_type in schema.fields.items():
408
+ # Check if it's an Any type
409
+ is_any = isinstance(field_type, AnyType) or (
410
+ isinstance(field_type, type) and issubclass(field_type, AnyType)
411
+ )
412
+
413
+ if is_any and field_name in df.columns:
414
+ # Check if column is a struct
415
+ col_dtype = df.schema[field_name]
416
+ if str(col_dtype).startswith("Struct"):
417
+ # Strategy: keep_struct - don't decode at all
418
+ if any_type_strategy == "keep_struct":
419
+ continue
420
+
421
+ try:
422
+ # Get field names from the struct
423
+ struct_fields = cast(Any, getattr(col_dtype, "fields", []))
424
+ field_names = (
425
+ [f.name for f in struct_fields] if struct_fields else []
426
+ )
427
+
428
+ if any_type_strategy == "string":
429
+ # Convert ALL value types to string
430
+ coalesce_exprs = []
431
+
432
+ # String first (already string)
433
+ if "string_value" in field_names:
434
+ coalesce_exprs.append(
435
+ pl.col(field_name).struct.field("string_value")
436
+ )
437
+
438
+ # Float to string
439
+ if "float_value" in field_names:
440
+ coalesce_exprs.append(
441
+ pl.col(field_name)
442
+ .struct.field("float_value")
443
+ .cast(pl.Utf8)
444
+ )
445
+
446
+ # Int to string
447
+ for int_name in ["int64_value", "int32_value"]:
448
+ if int_name in field_names:
449
+ coalesce_exprs.append(
450
+ pl.col(field_name)
451
+ .struct.field(int_name)
452
+ .cast(pl.Utf8)
453
+ )
454
+
455
+ # Bool to string
456
+ if "bool_value" in field_names:
457
+ coalesce_exprs.append(
458
+ pl.col(field_name)
459
+ .struct.field("bool_value")
460
+ .cast(pl.Utf8)
461
+ )
462
+
463
+ # ObjectId, decimal, etc. (already strings)
464
+ for str_field in [
465
+ "objectid_value",
466
+ "decimal128_value",
467
+ "regex_value",
468
+ "binary_value",
469
+ "document_value",
470
+ "array_value",
471
+ ]:
472
+ if str_field in field_names:
473
+ coalesce_exprs.append(
474
+ pl.col(field_name).struct.field(str_field)
475
+ )
476
+
477
+ if coalesce_exprs:
478
+ df = df.with_columns(
479
+ pl.coalesce(coalesce_exprs).alias(field_name)
480
+ )
481
+
482
+ else: # "float" strategy (default)
483
+ # Coalesce to Float64, prioritize numeric
484
+ coalesce_exprs = []
485
+
486
+ # Try float first (highest precision)
487
+ if "float_value" in field_names:
488
+ coalesce_exprs.append(
489
+ pl.col(field_name).struct.field("float_value")
490
+ )
491
+
492
+ # Try various int types, cast to float
493
+ for int_name in ["int64_value", "int32_value"]:
494
+ if int_name in field_names:
495
+ coalesce_exprs.append(
496
+ pl.col(field_name)
497
+ .struct.field(int_name)
498
+ .cast(pl.Float64)
499
+ )
500
+
501
+ # Try bool (as 0.0/1.0)
502
+ if "bool_value" in field_names:
503
+ coalesce_exprs.append(
504
+ pl.col(field_name)
505
+ .struct.field("bool_value")
506
+ .cast(pl.Float64)
507
+ )
508
+
509
+ if coalesce_exprs:
510
+ if len(coalesce_exprs) == 1:
511
+ df = df.with_columns(
512
+ coalesce_exprs[0].alias(field_name)
513
+ )
514
+ else:
515
+ df = df.with_columns(
516
+ pl.coalesce(coalesce_exprs).alias(field_name)
517
+ )
518
+ else:
519
+ logger.warning(
520
+ "Could not decode struct column '%s': "
521
+ "no numeric fields in %s",
522
+ field_name,
523
+ field_names,
524
+ )
525
+ except (KeyError, AttributeError, TypeError, ValueError) as e:
526
+ logger.warning("Error decoding struct '%s': %s", field_name, e)
527
+
528
+ return df
529
+
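+
+ # Example (illustrative sketch): decoding a struct-encoded Any() column with the
+ # default "float" strategy.  The stand-in schema maps the column name to the AnyType
+ # class itself, which is enough to satisfy the isinstance/issubclass check above.
+ def _demo_decode_any_column() -> None:
+     class _DemoSchema:
+         fields = {"value": AnyType}
+
+     df = pl.DataFrame(
+         {
+             "value": [
+                 {"float_value": 1.5, "int64_value": None},
+                 {"float_value": None, "int64_value": 2},
+             ]
+         }
+     )
+     decoded = _decode_struct_values_polars(df, _DemoSchema(), "float")
+     # The struct column is coalesced to Float64: [1.5, 2.0]
+     print(decoded["value"].to_list())
+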
530
+
531
+ def _convert_objectids_to_strings_polars(
532
+ df: pl.DataFrame,
533
+ schema: Any,
534
+ ) -> pl.DataFrame:
535
+ """
536
+ Convert ObjectId columns to strings in Polars (same as reader.py behavior).
537
+
538
+ ObjectIds are stored as 24-char hex strings in Parquet. This ensures they
539
+ stay as strings in the final output.
540
+ """
541
+ if not hasattr(schema, "fields"):
542
+ return df
543
+
544
+ # Find ObjectId columns
545
+ objectid_columns = []
546
+ for field_name, field_type in schema.fields.items():
547
+ is_oid = isinstance(field_type, ObjectIdType) or (
548
+ isinstance(field_type, type) and issubclass(field_type, ObjectIdType)
549
+ )
550
+ if is_oid and field_name in df.columns:
551
+ objectid_columns.append(field_name)
552
+
553
+ if not objectid_columns:
554
+ return df
555
+
556
+ # Convert to string in Polars
557
+ for col_name in objectid_columns:
558
+ df = df.with_columns(pl.col(col_name).cast(pl.Utf8))
559
+
560
+ return df
561
+
562
+
563
+ def _execute_partition_callback(
564
+ work_item: PartitionWorkItem,
565
+ cache_dir: str,
566
+ callback: Callable[[pa.Table, Dict[str, Any]], None],
567
+ schema: Any,
568
+ time_field: str,
569
+ any_type_strategy: Literal["float", "string", "keep_struct"],
570
+ sort_ascending: bool,
571
+ memory_limit_mb: int,
572
+ threads: int = 1,
573
+ ) -> Dict[str, Any]:
574
+ """
575
+ Execute callback for a single partition (runs in thread).
576
+
577
+ This function:
578
+ 1. Builds DuckDB query for the partition
579
+ 2. Fetches data as PyArrow Table
580
+ 3. Decodes Any() struct columns
581
+ 4. Converts ObjectIds to strings
582
+ 5. Calls user callback
583
+
584
+ Args:
585
+ work_item: Partition to process
586
+ cache_dir: Path to cache directory
587
+ callback: User callback function
588
+ schema: XLR8 schema
589
+ time_field: Timestamp field name
590
+ any_type_strategy: How to decode Any() columns
591
+ sort_ascending: Sort direction
592
+ memory_limit_mb: DuckDB memory limit
593
+ threads: DuckDB thread count (per worker, usually 1)
594
+
595
+ Returns:
596
+ Dict with rows processed and partition info
597
+ """
598
+ import duckdb
599
+
600
+ try:
601
+ # Build query
602
+ query = _build_partition_query(
603
+ cache_dir=cache_dir,
604
+ time_field=time_field,
605
+ work_item=work_item,
606
+ sort_ascending=sort_ascending,
607
+ )
608
+
609
+ # Execute query
610
+ conn = duckdb.connect(":memory:")
611
+
612
+ # Configure DuckDB for this worker
613
+         # memory_limit_mb is already the per-worker share computed by the caller
614
+ if memory_limit_mb:
615
+ conn.execute(f"SET memory_limit = '{memory_limit_mb}MB'")
616
+
617
+ # ThreadPoolExecutor provides parallelism; set DuckDB threads per worker here.
618
+ conn.execute(f"SET threads = {threads}")
619
+
620
+ # Fetch as Arrow Table (DuckDB native support) and convert to Polars
621
+ arrow_tmp = conn.execute(query).fetch_arrow_table()
622
+ polars_df = cast(pl.DataFrame, pl.from_arrow(arrow_tmp))
623
+ conn.close()
624
+
625
+ if len(polars_df) == 0:
626
+ # Empty partition - skip callback
627
+ return {
628
+ "rows": 0,
629
+ "partition_index": work_item.index,
630
+ "skipped": True,
631
+ }
632
+
633
+ # Decode Any() struct columns using Polars (reuses reader.py logic)
634
+ polars_df = _decode_struct_values_polars(polars_df, schema, any_type_strategy)
635
+
636
+ # Convert ObjectIds to strings (Polars)
637
+ polars_df = _convert_objectids_to_strings_polars(polars_df, schema)
638
+
639
+ # Convert to Arrow for callback (zero-copy via Arrow C Data Interface)
640
+ arrow_table = polars_df.to_arrow()
641
+
642
+ # Build metadata for callback
643
+ metadata = {
644
+ "time_start": work_item.time_start,
645
+ "time_end": work_item.time_end,
646
+ "partition_values": work_item.partition_values or {},
647
+ "row_count": arrow_table.num_rows,
648
+ "partition_index": work_item.index,
649
+ "total_partitions": work_item.total,
650
+ }
651
+
652
+ # Call user callback
653
+ callback(arrow_table, metadata)
654
+
655
+ return {
656
+ "rows": arrow_table.num_rows,
657
+ "partition_index": work_item.index,
658
+ "skipped": False,
659
+ }
660
+
661
+ except Exception as e: # noqa: BLE001
662
+ logger.error("Partition %d failed: %s", work_item.index, e)
663
+ raise
664
+
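+
+ # Example (illustrative sketch): because callbacks run in threads rather than
+ # subprocesses, they can close over non-picklable objects such as a boto3 client.
+ # The bucket name and key layout are assumptions for illustration only.
+ def _demo_s3_callback(bucket: str) -> Callable[[pa.Table, Dict[str, Any]], None]:
+     import io
+
+     import boto3
+     import pyarrow.parquet as pq
+
+     s3 = boto3.client("s3")  # created once, shared across callback threads
+
+     def upload(table: pa.Table, metadata: Dict[str, Any]) -> None:
+         buf = io.BytesIO()
+         pq.write_table(table, buf)
+         key = (
+             f"lake/{metadata['time_start']:%Y/%m/%d}/"
+             f"part-{metadata['partition_index']:05d}.parquet"
+         )
+         s3.put_object(Bucket=bucket, Key=key, Body=buf.getvalue())
+
+     return upload
+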
665
+
666
+ def execute_partitioned_callback(
667
+ cache_dir: str,
668
+ schema: Any,
669
+ callback: Callable[[pa.Table, Dict[str, Any]], None],
670
+ partition_time_delta: timedelta,
671
+ partition_by: Optional[List[str]],
672
+ any_type_strategy: Literal["float", "string", "keep_struct"],
673
+ max_workers: int,
674
+ sort_ascending: bool,
675
+ memory_limit_mb: int,
676
+ ) -> Dict[str, Any]:
677
+ """
678
+ Orchestrate parallel callback execution for partitioned data.
679
+
680
+ This is the main entry point for Phase 2 (after cache is populated).
681
+
682
+ Args:
683
+ cache_dir: Path to cache directory with parquet files
684
+ schema: XLR8 schema
685
+ callback: User callback function(table, metadata)
686
+ partition_time_delta: Time bucket size
687
+ partition_by: Fields to partition by (optional)
688
+ any_type_strategy: How to decode Any() columns
689
+ max_workers: Number of parallel callback threads
690
+ sort_ascending: Sort direction for time field
691
+ memory_limit_mb: Total memory limit for DuckDB operations
692
+
693
+ Returns:
694
+ Dict with total_partitions, total_rows, skipped_partitions, duration_s
695
+ """
696
+ import time
697
+
698
+ start_time = time.time()
699
+
700
+ time_field = schema.time_field
701
+
702
+ # Expand parent fields to children
703
+ # (e.g., "metadata" -> ["metadata.device_id", "metadata.sensor_id"])
704
+ if partition_by:
705
+ partition_by = _expand_parent_fields(partition_by, schema)
706
+
707
+ # Calculate per-worker memory limit
708
+ # Each worker gets an equal share
709
+ worker_memory_mb = max(64, memory_limit_mb // max_workers)
710
+
711
+     logger.debug("\n[Partition] Building partition plan...")
712
+     logger.debug(f"  - Time bucket: {partition_time_delta}")
713
+     logger.debug(f"  - Partition by: {partition_by or 'None (time only)'}")
714
+
715
+ # Build partition plan
716
+ work_items = _build_partition_plan(
717
+ cache_dir=cache_dir,
718
+ time_field=time_field,
719
+ partition_time_delta=partition_time_delta,
720
+ partition_by=partition_by,
721
+ memory_limit_mb=worker_memory_mb,
722
+ threads=1, # Single thread for planning
723
+ )
724
+
725
+ if not work_items:
726
+         logger.debug("[Partition] No partitions found!")
727
+ return {
728
+ "total_partitions": 0,
729
+ "total_rows": 0,
730
+ "skipped_partitions": 0,
731
+ "duration_s": time.time() - start_time,
732
+ }
733
+
734
+     logger.debug(f"[Partition] Found {len(work_items)} partitions")
735
+     logger.debug(
736
+ f"[Partition] Executing callbacks with {max_workers} workers "
737
+ f"(memory per worker: {worker_memory_mb}MB)"
738
+ )
739
+
740
+ # Execute callbacks in parallel
741
+ results = []
742
+ skipped = 0
743
+
744
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
745
+ futures = {
746
+ executor.submit(
747
+ _execute_partition_callback,
748
+ work_item=item,
749
+ cache_dir=cache_dir,
750
+ callback=callback,
751
+ schema=schema,
752
+ time_field=time_field,
753
+ any_type_strategy=any_type_strategy,
754
+ sort_ascending=sort_ascending,
755
+ memory_limit_mb=worker_memory_mb,
756
+ threads=1, # Each worker uses 1 DuckDB thread
757
+ ): item
758
+ for item in work_items
759
+ }
760
+
761
+ for future in as_completed(futures):
762
+ work_item = futures[future]
763
+ try:
764
+ result = future.result()
765
+ results.append(result)
766
+
767
+ if result.get("skipped"):
768
+ skipped += 1
769
+
770
+ except Exception as e: # noqa: BLE001
771
+ logger.error("Partition %d failed: %s", work_item.index, e)
772
+ raise RuntimeError(
773
+ f"Callback failed for partition {work_item.index} "
774
+ f"(time: {work_item.time_start} to {work_item.time_end}, "
775
+ f"values: {work_item.partition_values}): {e}"
776
+ ) from e
777
+
778
+ duration = time.time() - start_time
779
+ total_rows = sum(r.get("rows", 0) for r in results)
780
+
781
+     logger.debug("\n[Partition] Complete:")
782
+     logger.debug(f"  - Total partitions: {len(work_items)}")
783
+     logger.debug(f"  - Skipped (empty): {skipped}")
784
+     logger.debug(f"  - Total rows: {total_rows:,}")
785
+     logger.debug(f"  - Duration: {duration:.2f}s")
786
+
787
+ return {
788
+ "total_partitions": len(work_items),
789
+ "total_rows": total_rows,
790
+ "skipped_partitions": skipped,
791
+ "duration_s": duration,
792
+ }
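+
+
+ # Example (illustrative sketch): invoking Phase 2 once the parquet cache has been
+ # populated by Phase 1 (execute_parallel_stream_to_cache).  The cache path, schema
+ # object, partition field, and local lake layout are assumptions for illustration.
+ def _demo_populate_lake(schema: Any, cache_dir: str = "./xlr8_cache") -> None:
+     import pyarrow.parquet as pq
+
+     def write_partition(table: pa.Table, metadata: Dict[str, Any]) -> None:
+         out_dir = Path("./lake") / metadata["time_start"].strftime("%Y-%m-%d")
+         out_dir.mkdir(parents=True, exist_ok=True)
+         pq.write_table(
+             table, out_dir / f"part-{metadata['partition_index']:05d}.parquet"
+         )
+
+     stats = execute_partitioned_callback(
+         cache_dir=cache_dir,
+         schema=schema,
+         callback=write_partition,
+         partition_time_delta=timedelta(days=7),
+         partition_by=["metadata.device_id"],
+         any_type_strategy="float",
+         max_workers=4,
+         sort_ascending=True,
+         memory_limit_mb=2048,
+     )
+     print(stats)  # total_partitions, total_rows, skipped_partitions, duration_s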