xlr8 0.1.7b3__cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

xlr8/storage/reader.py ADDED
@@ -0,0 +1,1369 @@
1
+ """
2
+ Parquet file reader for cache-aware loading.
3
+
4
+ This module reads Parquet files written by the Rust backend and converts them
5
+ back into DataFrames with proper value decoding and type reconstruction.
6
+
7
+ DATA FLOW
8
+ =========
9
+
10
+ STEP 1: DISCOVER RUST-WRITTEN FILES
11
+ ------------------------------------
12
+ The Rust backend (rust_backend.fetch_chunks_bson) writes Parquet files with
13
+ timestamp-based naming derived from actual document data:
14
+
15
+ cache_dir/.cache/abc123def/
16
+ ts_1704067200_1704070800_part_0000.parquet
17
+ ts_1704070801_1704074400_part_0000.parquet
18
+ ts_1704074401_1704078000_part_0000.parquet
19
+ ts_1704078001_1704081600_part_0000.parquet
20
+ ...
21
+
22
+ Filename format: ts_{min_sec}_{max_sec}_part_{counter:04}.parquet
23
+ - min_sec: Unix timestamp (seconds) of earliest document in file
24
+ - max_sec: Unix timestamp (seconds) of latest document in file
25
+ - counter: Per-worker sequential counter (0000, 0001, 0002, ...)
26
+ Increments only when the same worker writes multiple files with identical timestamps
27
+
28
+ How timestamps ensure uniqueness:
29
+ - Each chunk/bracket targets different time ranges
30
+ - Multiple workers process non-overlapping time ranges
31
+ - Natural file separation by actual data timestamps
32
+ - Counter only needed if worker flushes multiple batches with identical ranges
33
+
34
+ Fallback format (no timestamps): part_{counter:04}.parquet
35
+ Used when time_field is None or documents lack timestamps
36
+
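+ For illustration, a minimal sketch (not part of this module; the helper name
+ is hypothetical) of parsing the filename format described above:
+
+     import re
+     from pathlib import Path
+
+     def parse_cache_filename(path: Path):
+         m = re.match(r"ts_(\d+)_(\d+)_part_(\d{4})\.parquet$", path.name)
+         if m is None:
+             return None  # fallback format: part_{counter:04}.parquet
+         min_sec, max_sec, counter = map(int, m.groups())
+         return min_sec, max_sec, counter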
37
+
38
+ STEP 2: READ & CONCATENATE
39
+ ---------------------------
40
+ Pandas: Read all files sequentially, concatenate into single DataFrame
41
+ Polars: Read all files in parallel (native multi-file support)
42
+
43
+ Both engines use PyArrow under the hood for efficient Parquet parsing.
44
+
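+ In outline (mirroring to_dataframe() below; parquet_files stands for the
+ discovered file list):
+
+     # pandas path: read every file, concatenate Arrow tables, convert once
+     tables = [pq.read_table(f) for f in parquet_files]
+     df = pa.concat_tables(tables).to_pandas()
+
+     # polars path: lazy scan across all files, collected in one pass
+     df_pl = pl.scan_parquet(parquet_files).collect()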
45
+
46
+ STEP 3: DECODE TYPES.ANY STRUCT VALUES
47
+ ---------------------------------------
48
+ Types.Any fields are encoded as Arrow structs by Rust backend:
49
+
50
+ Parquet stores:
51
+ {
52
+ "value": {
53
+ "float_value": 42.5,
54
+ "int64_value": null,
55
+ "string_value": null,
56
+ "bool_value": null,
57
+ ...
58
+ }
59
+ }
60
+
61
+ After decoding (coalesce first non-null field):
62
+ {"value": 42.5}
63
+
64
+ This decoding happens in Rust via decode_any_struct_arrow() for maximum
65
+ performance.
66
+
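+ A pure-Python equivalent of that coalesce step (the real decoding runs in
+ Rust; this sketch just walks the struct fields shown above):
+
+     def coalesce_any_struct(struct_row):
+         # Skip the null_value marker; return the first populated member.
+         for key, value in struct_row.items():
+             if key != "null_value" and value is not None:
+                 return value
+         return None  # only null_value was set, i.e. BSON null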
67
+
68
+ STEP 4: FLATTEN NESTED STRUCTS
69
+ -------------------------------
70
+ Convert nested struct columns to dotted field names:
71
+
72
+ Before: {"metadata": {"device_id": "123...", "sensor_id": "456..."}}
73
+ After: {"metadata.device_id": "123...", "metadata.sensor_id": "456..."}
74
+
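+ Conceptually (the actual implementation in _flatten_struct_columns works
+ column-wise for speed, but the per-document effect is):
+
+     def flatten_one_level(doc):
+         flat = {}
+         for key, value in doc.items():
+             if isinstance(value, dict):
+                 for subkey, subvalue in value.items():
+                     flat[f"{key}.{subkey}"] = subvalue
+             else:
+                 flat[key] = value
+         return flat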
75
+
76
+ STEP 5: RECONSTRUCT OBJECTIDS
77
+ ------------------------------
78
+ Convert string-encoded ObjectIds back to bson.ObjectId instances:
79
+
80
+ "507f1f77bcf86cd799439011" -> ObjectId("507f1f77bcf86cd799439011")
81
+
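+ In pandas terms this is a per-column apply (the "_id" column name is
+ illustrative):
+
+     df["_id"] = df["_id"].apply(lambda x: ObjectId(x) if x and pd.notna(x) else x)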
82
+
83
+ OUTPUT: DataFrame (pandas or Polars; can also be streamed batch-by-batch via pyarrow)
84
+ -----------------
85
+ timestamp metadata.device_id value
86
+ 0 2024-01-15 12:00 64a1b2c3... 42.5
87
+ 1 2024-01-15 12:01 64a1b2c3... 43.1
88
+ 2 2024-01-15 12:02 64a1b2c3... "active"
89
+
90
+ """
91
+
92
+ import logging
93
+ from datetime import datetime
94
+ from pathlib import Path
95
+ from typing import Any, Dict, Generator, Iterator, List, Literal, Optional, Tuple, Union
96
+
97
+ import duckdb
98
+ import pandas as pd
99
+ import polars as pl
100
+ import pyarrow as pa
101
+ import pyarrow.parquet as pq
102
+ from bson import ObjectId
103
+
104
+ from xlr8.constants import DEFAULT_BATCH_SIZE
105
+
106
+ logger = logging.getLogger(__name__)
107
+
108
+
109
+ def _convert_datetime_for_filter(dt: datetime, target_type: pa.DataType) -> datetime:
110
+ """Convert datetime to match the target Arrow timestamp type.
111
+
112
+ Handles timezone-aware vs timezone-naive conversions:
113
+ - If target has timezone and input doesn't: assume UTC
114
+ - If target has no timezone and input does: strip timezone
115
+ - Matching types: return as-is
116
+
117
+ Args:
118
+ dt: Input datetime
119
+ target_type: PyArrow timestamp type from parquet schema
120
+
121
+ Returns:
122
+ datetime compatible with the target type
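+
+ Example (naive input promoted to UTC when the target is tz-aware):
+ >>> _convert_datetime_for_filter(
+ ...     datetime(2024, 1, 1), pa.timestamp("ms", tz="UTC")
+ ... )
+ datetime.datetime(2024, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)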
123
+ """
124
+ if not isinstance(target_type, pa.TimestampType):
125
+ return dt
126
+
127
+ target_has_tz = target_type.tz is not None
128
+ input_has_tz = dt.tzinfo is not None
129
+
130
+ if target_has_tz and not input_has_tz:
131
+ # Target has tz, input doesn't - assume input is UTC
132
+ from datetime import timezone
133
+
134
+ return dt.replace(tzinfo=timezone.utc)
135
+ elif not target_has_tz and input_has_tz:
136
+ # Target has no tz, input does - strip timezone
137
+ return dt.replace(tzinfo=None)
138
+ else:
139
+ # Both match (both have tz or both don't)
140
+ return dt
141
+
142
+
143
+ class ParquetReader:
144
+ """
145
+ Reads Parquet files from cache directory.
146
+
147
+ Provides streaming and batch reading of documents from Parquet files.
148
+ Supports reading all files in a cache directory or specific partitions.
149
+
150
+ Example:
151
+ >>> reader = ParquetReader(cache_dir=".cache/abc123def")
152
+ >>>
153
+ >>> # Stream all documents
154
+ >>> for doc in reader.iter_documents():
155
+ ... print(doc)
156
+ >>>
157
+ >>> # Or load to DataFrame
158
+ >>> df = reader.to_dataframe()
159
+ """
160
+
161
+ def __init__(self, cache_dir: Union[str, Path]):
162
+ """
163
+ Initialize reader for cache directory.
164
+
165
+ Args:
166
+ cache_dir: Directory containing parquet files
167
+ """
168
+ self.cache_dir = Path(cache_dir)
169
+
170
+ if not self.cache_dir.exists():
171
+ raise FileNotFoundError(f"Cache directory not found: {cache_dir}")
172
+
173
+ # Find all parquet files (may be empty if query returned no results)
174
+ self.parquet_files = sorted(self.cache_dir.glob("*.parquet"))
175
+
176
+ def iter_documents(
177
+ self,
178
+ batch_size: int = DEFAULT_BATCH_SIZE,
179
+ ) -> Iterator[Dict[str, Any]]:
180
+ """
181
+ Stream documents from all parquet files.
182
+
183
+ Reads in batches to avoid loading entire dataset into memory.
184
+
185
+ Args:
186
+ batch_size: Number of rows to read per batch
187
+
188
+ Yields:
189
+ Document dictionaries
190
+
191
+ Example:
192
+ >>> for doc in reader.iter_documents(batch_size=5000):
193
+ ... process(doc)
194
+ """
195
+ for parquet_file in self.parquet_files:
196
+ # Read in batches
197
+ parquet_file_obj = pq.ParquetFile(parquet_file)
198
+
199
+ for batch in parquet_file_obj.iter_batches(batch_size=batch_size):
200
+ # Convert Arrow batch to pandas then to dicts
201
+ df_batch = batch.to_pandas()
202
+
203
+ for _, row in df_batch.iterrows():
204
+ yield row.to_dict()
205
+
206
+ def _is_any_type(self, field_type: Any) -> bool:
207
+ """Check if field_type is an Any type (supports both class and instance)."""
208
+ from xlr8.schema.types import Any as AnyType
209
+
210
+ # Support both Types.Any (class) and Types.Any() (instance)
211
+ if isinstance(field_type, AnyType):
212
+ return True
213
+ if isinstance(field_type, type) and issubclass(field_type, AnyType):
214
+ return True
215
+ return False
216
+
217
+ def _decode_struct_values(self, df: pd.DataFrame, schema: Any) -> pd.DataFrame:
218
+ """
219
+ Decode struct-encoded Any-typed columns back to actual values.
220
+
221
+ For columns marked as Any type in schema, extracts the actual value
222
+ from the struct bitmap representation (float_value, int_value, etc.).
223
+
224
+ Uses Rust Arrow-native decoding for maximum performance (~40x faster).
225
+
226
+ Note: This is a fallback path. The fast path decodes directly from Arrow
227
+ before to_pandas() conversion, avoiding dict overhead entirely.
228
+ """
229
+ if not hasattr(schema, "fields"):
230
+ return df
231
+
232
+ # Import Rust Arrow-native decoder (required)
233
+ from xlr8.rust_backend import decode_any_struct_arrow
234
+
235
+ # Find Any-typed fields in schema
236
+ for field_name, field_type in schema.fields.items():
237
+ if self._is_any_type(field_type) and field_name in df.columns:
238
+ # Column contains struct-encoded values (dicts)
239
+ col = df[field_name]
240
+
241
+ if len(col) == 0:
242
+ continue
243
+
244
+ # Check if it's a struct (dict) column - skip if already decoded
245
+ first_val = col.iloc[0]
246
+ if not isinstance(first_val, dict):
247
+ # Already decoded in fast path - skip
248
+ continue
249
+
250
+ # Build struct type dynamically based on the dict keys
251
+ sample_dict = first_val
252
+ struct_fields = []
253
+ field_type_map = {
254
+ "float_value": pa.float64(),
255
+ "int32_value": pa.int32(),
256
+ "int64_value": pa.int64(),
257
+ "string_value": pa.string(),
258
+ "objectid_value": pa.string(),
259
+ "decimal128_value": pa.string(),
260
+ "regex_value": pa.string(),
261
+ "binary_value": pa.string(),
262
+ "document_value": pa.string(),
263
+ "array_value": pa.string(),
264
+ "bool_value": pa.bool_(),
265
+ "datetime_value": pa.timestamp("ms"), # Use ms for new schema
266
+ "null_value": pa.bool_(),
267
+ }
268
+
269
+ for key in sample_dict.keys():
270
+ if key in field_type_map:
271
+ struct_fields.append((key, field_type_map[key]))
272
+
273
+ any_struct_type = pa.struct(struct_fields)
274
+
275
+ # Convert to PyArrow array - this is a single pass over the data
276
+ arrow_array = pa.array(col.tolist(), type=any_struct_type)
277
+
278
+ # Decode in Rust - direct memory access to Arrow memory
279
+ decoded_values = decode_any_struct_arrow(arrow_array)
280
+ df[field_name] = decoded_values
281
+
282
+ return df
283
+
284
+ def _flatten_struct_columns(self, df: pd.DataFrame) -> pd.DataFrame:
285
+ """
286
+ Flatten nested struct columns into separate columns.
287
+
288
+ Example:
289
+ metadata: {'sensor_id': '...', 'device_id': '...'}
290
+ -> metadata.sensor_id: '...', metadata.device_id: '...'
291
+
292
+ """
293
+ if df.empty:
294
+ return df
295
+
296
+ struct_cols = []
297
+ for col in df.columns:
298
+ # Check if column contains dicts (structs)
299
+ if len(df) > 0 and isinstance(df[col].iloc[0], dict):
300
+ struct_cols.append(col)
301
+
302
+ for col in struct_cols:
303
+ # FAST PATH: Extract struct fields directly using list comprehension
304
+ # This is ~5x faster than pd.json_normalize() for large datasets
305
+ col_values = df[col].tolist()
306
+
307
+ # Detect subcolumns from the first row (none if it is not a dict)
308
+ first_val = col_values[0] if col_values else {}
309
+ subcolumns = list(first_val.keys()) if isinstance(first_val, dict) else []
310
+
311
+ # Build new columns efficiently
312
+ new_cols = {}
313
+ for subcol in subcolumns:
314
+ new_col_name = f"{col}.{subcol}"
315
+ new_cols[new_col_name] = [
316
+ row.get(subcol) if isinstance(row, dict) else None
317
+ for row in col_values
318
+ ]
319
+
320
+ # Drop original struct column
321
+ df = df.drop(columns=[col])
322
+
323
+ # Add flattened columns
324
+ for new_col_name, values in new_cols.items():
325
+ df[new_col_name] = values
326
+
327
+ return df
328
+
329
+ def _reconstruct_objectids(self, df: pd.DataFrame, schema: Any) -> pd.DataFrame:
330
+ """
331
+ Reconstruct ObjectId columns from string representation.
332
+
333
+ Converts string ObjectIds back to bson.ObjectId instances.
334
+ """
335
+ from xlr8.schema.types import ObjectId as ObjectIdType
336
+
337
+ # Find all ObjectId fields in schema (including nested ones)
338
+ objectid_fields = []
339
+
340
+ if hasattr(schema, "fields"):
341
+ for field_name, field_type in schema.fields.items():
342
+ if isinstance(field_type, ObjectIdType):
343
+ objectid_fields.append(field_name)
344
+ elif hasattr(field_type, "fields"):
345
+ # Nested struct with ObjectId fields
346
+ for nested_name, nested_type in field_type.fields.items():
347
+ if isinstance(nested_type, ObjectIdType):
348
+ objectid_fields.append(f"{field_name}.{nested_name}")
349
+
350
+ # Convert string columns back to ObjectId
351
+ for field in objectid_fields:
352
+ if field in df.columns:
353
+ df[field] = df[field].apply(
354
+ lambda x: ObjectId(x) if x and pd.notna(x) else x
355
+ )
356
+
357
+ return df
358
+
359
+ def _decode_struct_values_polars(
360
+ self,
361
+ df: "pl.DataFrame",
362
+ schema: Any,
363
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
364
+ ) -> "pl.DataFrame":
365
+ """
366
+ Decode struct-encoded Any-typed columns back to actual values (Polars).
367
+
368
+ Args:
369
+ df: Polars DataFrame
370
+ schema: Schema with field type info
371
+ any_type_strategy: How to decode:
372
+ - "float": Coalesce to Float64, prioritize numeric (default)
373
+ - "string": Convert everything to string (lossless)
374
+ - "keep_struct": Keep raw struct, don't decode
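+
+ Example (sketch of what the "float" strategy builds for a struct column
+ named "value"; the column name is illustrative):
+ >>> df = df.with_columns(
+ ...     pl.coalesce(
+ ...         pl.col("value").struct.field("float_value"),
+ ...         pl.col("value").struct.field("int64_value").cast(pl.Float64),
+ ...         pl.col("value").struct.field("bool_value").cast(pl.Float64),
+ ...     ).alias("value")
+ ... )  # doctest: +SKIP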
375
+ """
376
+ if not hasattr(schema, "fields"):
377
+ return df
378
+
379
+ # Find Any-typed fields in schema
380
+ for field_name, field_type in schema.fields.items():
381
+ if self._is_any_type(field_type) and field_name in df.columns:
382
+ # Check if column is a struct
383
+ col_dtype = df.schema[field_name]
384
+ if str(col_dtype).startswith("Struct"):
385
+ # Strategy: keep_struct - don't decode at all
386
+ if any_type_strategy == "keep_struct":
387
+ continue
388
+
389
+ try:
390
+ # Get field names from the struct
391
+ struct_fields = (
392
+ col_dtype.fields if hasattr(col_dtype, "fields") else []
393
+ ) # type: ignore[attr-defined]
394
+ field_names = (
395
+ [f.name for f in struct_fields] if struct_fields else []
396
+ )
397
+
398
+ if any_type_strategy == "string":
399
+ # Convert ALL value types to string
400
+ coalesce_exprs = []
401
+
402
+ # String first (already string)
403
+ if "string_value" in field_names:
404
+ coalesce_exprs.append(
405
+ pl.col(field_name).struct.field("string_value")
406
+ )
407
+
408
+ # Float to string
409
+ if "float_value" in field_names:
410
+ coalesce_exprs.append(
411
+ pl.col(field_name)
412
+ .struct.field("float_value")
413
+ .cast(pl.Utf8)
414
+ )
415
+
416
+ # Int to string
417
+ for int_name in ["int64_value", "int32_value"]:
418
+ if int_name in field_names:
419
+ coalesce_exprs.append(
420
+ pl.col(field_name)
421
+ .struct.field(int_name)
422
+ .cast(pl.Utf8)
423
+ )
424
+
425
+ # Bool to string
426
+ if "bool_value" in field_names:
427
+ coalesce_exprs.append(
428
+ pl.col(field_name)
429
+ .struct.field("bool_value")
430
+ .cast(pl.Utf8)
431
+ )
432
+
433
+ # ObjectId, decimal, etc. (already strings)
434
+ for str_field in [
435
+ "objectid_value",
436
+ "decimal128_value",
437
+ "regex_value",
438
+ "binary_value",
439
+ "document_value",
440
+ "array_value",
441
+ ]:
442
+ if str_field in field_names:
443
+ coalesce_exprs.append(
444
+ pl.col(field_name).struct.field(str_field)
445
+ )
446
+
447
+ if coalesce_exprs:
448
+ df = df.with_columns(
449
+ pl.coalesce(coalesce_exprs).alias(field_name)
450
+ )
451
+
452
+ else: # "float" strategy (default)
453
+ # Coalesce to Float64, prioritize numeric
454
+ coalesce_exprs = []
455
+
456
+ # Try float first (highest precision)
457
+ if "float_value" in field_names:
458
+ coalesce_exprs.append(
459
+ pl.col(field_name).struct.field("float_value")
460
+ )
461
+
462
+ # Try various int types, cast to float
463
+ for int_name in ["int64_value", "int32_value"]:
464
+ if int_name in field_names:
465
+ coalesce_exprs.append(
466
+ pl.col(field_name)
467
+ .struct.field(int_name)
468
+ .cast(pl.Float64)
469
+ )
470
+
471
+ # Try bool (as 0.0/1.0)
472
+ if "bool_value" in field_names:
473
+ coalesce_exprs.append(
474
+ pl.col(field_name)
475
+ .struct.field("bool_value")
476
+ .cast(pl.Float64)
477
+ )
478
+
479
+ if coalesce_exprs:
480
+ if len(coalesce_exprs) == 1:
481
+ df = df.with_columns(
482
+ coalesce_exprs[0].alias(field_name)
483
+ )
484
+ else:
485
+ df = df.with_columns(
486
+ pl.coalesce(coalesce_exprs).alias(field_name)
487
+ )
488
+ else:
489
+ logger.warning(
490
+ "Could not decode struct column '%s': "
491
+ "no numeric fields in %s",
492
+ field_name,
493
+ field_names,
494
+ )
495
+ except (AttributeError, KeyError, ValueError) as e:
496
+ logger.warning("Error decoding struct '%s': %s", field_name, e)
497
+
498
+ return df
499
+
500
+ def _process_dataframe(
501
+ self,
502
+ df: Union[pd.DataFrame, "pl.DataFrame"],
503
+ engine: Literal["pandas", "polars"],
504
+ schema: Optional[Any] = None,
505
+ coerce: Literal["raise", "error"] = "raise",
506
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
507
+ ) -> Union[pd.DataFrame, "pl.DataFrame"]:
508
+ """
509
+ Process DataFrame: decode struct values, flatten structs and
510
+ reconstruct ObjectIds.
511
+
512
+ Args:
513
+ df: DataFrame to process
514
+ engine: "pandas" or "polars"
515
+ schema: Schema for ObjectId reconstruction
516
+ coerce: Error handling mode ("raise" or "error")
517
+ any_type_strategy: How to decode Any() structs in Polars
518
+ (float/string/keep_struct)
519
+
520
+ Returns:
521
+ Processed DataFrame
522
+ """
523
+ if engine == "pandas":
524
+ # First, decode Any-typed struct columns back to actual values
525
+ if schema is not None:
526
+ try:
527
+ df = self._decode_struct_values(df, schema) # type: ignore[arg-type]
528
+ except (AttributeError, KeyError, ValueError, TypeError) as e:
529
+ if coerce == "error":
530
+ logger.error("Error decoding struct values: %s", e)
531
+ else:
532
+ raise
533
+
534
+ # Flatten struct columns (e.g., metadata -> metadata.sensor_id)
535
+ df = self._flatten_struct_columns(df) # type: ignore[arg-type]
536
+
537
+ # Reconstruct ObjectIds from strings
538
+ if schema is not None:
539
+ try:
540
+ df = self._reconstruct_objectids(df, schema)
541
+ except (AttributeError, KeyError, ValueError, TypeError) as e:
542
+ if coerce == "error":
543
+ logger.error("Error reconstructing ObjectIds: %s", e)
544
+ else:
545
+ raise
546
+
547
+ return df
548
+ elif engine == "polars":
549
+ # Polars: decode Any-typed struct columns and keep dotted column names
550
+ if schema is not None:
551
+ try:
552
+ df = self._decode_struct_values_polars(
553
+ df, schema, any_type_strategy
554
+ ) # type: ignore[arg-type]
555
+ except (AttributeError, KeyError, ValueError, TypeError) as e:
556
+ if coerce == "error":
557
+ logger.error("Error decoding struct values (polars): %s", e)
558
+ else:
559
+ raise
560
+ return df
561
+
562
+ def to_dataframe(
563
+ self,
564
+ engine: str = "pandas",
565
+ schema: Optional[Any] = None,
566
+ time_field: Optional[str] = None,
567
+ start_date: Optional[datetime] = None,
568
+ end_date: Optional[datetime] = None,
569
+ coerce: Literal["raise", "error"] = "raise",
570
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
571
+ ) -> Union[pd.DataFrame, "pl.DataFrame"]:
572
+ """
573
+ Load all parquet files into a DataFrame.
574
+
575
+ Args:
576
+ engine: "pandas" or "polars"
577
+ schema: Schema for ObjectId reconstruction and struct flattening (required)
578
+ time_field: Name of time field for date filtering (from schema.time_field)
579
+ start_date: Filter data from this date (inclusive, tz-aware datetime)
580
+ end_date: Filter data until this date (exclusive, tz-aware datetime)
581
+ coerce: Error handling mode:
582
+ - "raise": Raise exceptions on schema validation errors (default)
583
+ - "error": Log errors and store None for invalid values
584
+ any_type_strategy: How to decode Types.Any() struct columns in Polars:
585
+ - "float": Coalesce to Float64, prioritize numeric (default)
586
+ - "string": Convert everything to string (lossless)
587
+ - "keep_struct": Keep raw struct, don't decode
588
+
589
+ Returns:
590
+ DataFrame with all documents (structs flattened, ObjectIds reconstructed)
591
+
592
+ Example:
593
+ >>> df = reader.to_dataframe(
594
+ ... schema=schema,
595
+ ... time_field="timestamp",
596
+ ... start_date=datetime(2024, 6, 1, tzinfo=timezone.utc),
597
+ ... end_date=datetime(2024, 6, 15, tzinfo=timezone.utc),
598
+ ... )
599
+ """
600
+ # Build PyArrow filter for date range (predicate pushdown)
601
+ # We'll determine the correct timestamp type from the first parquet file
602
+ filters = None
603
+ if time_field and (start_date or end_date) and self.parquet_files:
604
+ # Get the timestamp type from the parquet schema
605
+ first_file_schema = pq.read_schema(self.parquet_files[0])
606
+ field_idx = first_file_schema.get_field_index(time_field)
607
+ if field_idx >= 0:
608
+ ts_type = first_file_schema.field(field_idx).type
609
+ else:
610
+ # Fallback to ms if field not found
611
+ ts_type = pa.timestamp("ms")
612
+
613
+ filter_conditions = []
614
+ if start_date:
615
+ # Convert datetime to match parquet column type
616
+ start_ts = pa.scalar(start_date, type=ts_type)
617
+ filter_conditions.append((time_field, ">=", start_ts))
618
+ if end_date:
619
+ end_ts = pa.scalar(end_date, type=ts_type)
620
+ filter_conditions.append((time_field, "<", end_ts))
621
+ if filter_conditions:
622
+ filters = filter_conditions
623
+
624
+ if engine == "polars":
625
+ # Return empty DataFrame if no parquet files (query returned no results)
626
+ if not self.parquet_files:
627
+ return pl.DataFrame()
628
+
629
+ # Use scan_parquet for lazy evaluation with predicate pushdown
630
+ # This only reads the row groups that match the filter conditions
631
+ lf = pl.scan_parquet(self.parquet_files)
632
+
633
+ # Apply date filter with predicate pushdown (reads only matching data)
634
+ # Convert datetime to match Parquet column dtype (tz-aware or naive)
635
+ if time_field and (start_date or end_date):
636
+ # Get timestamp type from parquet to handle tz correctly
637
+ first_file_schema = pq.read_schema(self.parquet_files[0])
638
+ field_idx = first_file_schema.get_field_index(time_field)
639
+ ts_type = (
640
+ first_file_schema.field(field_idx).type
641
+ if field_idx >= 0
642
+ else pa.timestamp("ms")
643
+ )
644
+
645
+ if start_date:
646
+ start_converted = _convert_datetime_for_filter(start_date, ts_type)
647
+ lf = lf.filter(pl.col(time_field) >= start_converted)
648
+ if end_date:
649
+ end_converted = _convert_datetime_for_filter(end_date, ts_type)
650
+ lf = lf.filter(pl.col(time_field) < end_converted)
651
+
652
+ # Collect executes the query with predicate pushdown
653
+ df = lf.collect()
654
+
655
+ return self._process_dataframe(
656
+ df, engine, schema, coerce, any_type_strategy
657
+ )
658
+
659
+ elif engine == "pandas":
660
+ # Return empty DataFrame if no parquet files (query returned no results)
661
+ if not self.parquet_files:
662
+ return pd.DataFrame()
663
+
664
+ # Read all files with optional filter (predicate pushdown)
665
+ # Use PyArrow to read, then convert to pandas - this allows
666
+ # struct columns to stay in Arrow format for fast Rust decoding
667
+ tables = []
668
+ for parquet_file in self.parquet_files:
669
+ try:
670
+ # Use PyArrow filters for efficient predicate pushdown
671
+ table = pq.read_table(parquet_file, filters=filters)
672
+ tables.append(table)
673
+ except Exception as e:
674
+ if coerce == "error":
675
+ logger.error(f"Error reading {parquet_file}: {e}")
676
+ continue
677
+ raise
678
+
679
+ if not tables:
680
+ return pd.DataFrame()
681
+
682
+ # Concatenate Arrow tables
683
+ combined_table = pa.concat_tables(tables)
684
+
685
+ # FAST PATH: Decode Any-typed struct columns directly in Arrow
686
+ # This gives us 44x speedup because Rust reads Arrow memory directly
687
+ # without Python iteration over dicts
688
+ any_columns_decoded = {}
689
+ columns_to_drop = []
690
+ if schema and hasattr(schema, "fields"):
691
+ from xlr8.rust_backend import decode_any_struct_arrow
692
+
693
+ for field_name, field_type in schema.fields.items():
694
+ if (
695
+ self._is_any_type(field_type)
696
+ and field_name in combined_table.column_names
697
+ ):
698
+ col = combined_table.column(field_name)
699
+ if pa.types.is_struct(col.type):
700
+ # Decode in Rust - returns Python list of mixed types
701
+ combined = col.combine_chunks()
702
+ decoded_values = decode_any_struct_arrow(combined)
703
+ any_columns_decoded[field_name] = decoded_values
704
+ # Mark for removal to avoid slow dict conversion
705
+ # in to_pandas()
706
+ columns_to_drop.append(field_name)
707
+
708
+ # Drop decoded struct columns before pandas conversion
709
+ # to avoid dict overhead
710
+ if columns_to_drop:
711
+ combined_table = combined_table.drop(columns_to_drop)
712
+
713
+ # Convert to pandas (non-Any columns go through normal path)
714
+ df = combined_table.to_pandas()
715
+
716
+ # Add back Any columns with decoded values
717
+ # (bypassing struct->dict->decode path)
718
+ for field_name, decoded_values in any_columns_decoded.items():
719
+ df[field_name] = decoded_values
720
+
721
+ return self._process_dataframe(df, engine, schema, coerce)
722
+
723
+ else:
724
+ raise ValueError(f"Unknown engine: {engine}. Use 'pandas' or 'polars'")
725
+
726
+ def iter_dataframe_batches(
727
+ self,
728
+ batch_size: int = 10000,
729
+ schema: Optional[Any] = None,
730
+ time_field: Optional[str] = None,
731
+ start_date: Optional[datetime] = None,
732
+ end_date: Optional[datetime] = None,
733
+ coerce: Literal["raise", "error"] = "raise",
734
+ ) -> Generator[pd.DataFrame, None, None]:
735
+ """
736
+ Yield DataFrames in batches without loading all data into memory.
737
+
738
+ This is memory-efficient: only batch_size rows are in memory at a time.
739
+ Uses PyArrow's batch iteration for efficient streaming.
740
+
741
+ Use this when NO sorting is needed. For sorted batches, use
742
+ iter_globally_sorted_batches().
743
+
744
+ Args:
745
+ batch_size: Number of rows per batch (default: 10,000)
746
+ schema: Schema for struct decoding and ObjectId reconstruction
747
+ time_field: Name of time field for date filtering
748
+ start_date: Filter data from this date (inclusive, tz-aware)
749
+ end_date: Filter data until this date (exclusive, tz-aware)
750
+ coerce: Error handling mode ("raise" or "error")
751
+
752
+ Yields:
753
+ pd.DataFrame: Batches of processed rows
754
+
755
+ Example:
756
+ >>> for batch_df in reader.iter_dataframe_batches(batch_size=5000):
757
+ ... process(batch_df)
758
+ """
759
+ import pyarrow.parquet as pq
760
+
761
+ batch_count = 0
762
+ total_rows = 0
763
+
764
+ # Pre-compute converted datetimes for filtering (tz-aware or naive)
765
+ start_converted = None
766
+ end_converted = None
767
+ if time_field and (start_date or end_date) and self.parquet_files:
768
+ first_file_schema = pq.read_schema(self.parquet_files[0])
769
+ field_idx = first_file_schema.get_field_index(time_field)
770
+ ts_type = (
771
+ first_file_schema.field(field_idx).type
772
+ if field_idx >= 0
773
+ else pa.timestamp("ms")
774
+ )
775
+ if start_date:
776
+ start_converted = _convert_datetime_for_filter(start_date, ts_type)
777
+ if end_date:
778
+ end_converted = _convert_datetime_for_filter(end_date, ts_type)
779
+
780
+ for parquet_file in self.parquet_files:
781
+ try:
782
+ # Open parquet file for batch iteration
783
+ parquet_file_obj = pq.ParquetFile(parquet_file)
784
+
785
+ for batch in parquet_file_obj.iter_batches(batch_size=batch_size):
786
+ # Convert Arrow batch to pandas
787
+ batch_df = batch.to_pandas()
788
+
789
+ # Apply date filter if specified
790
+ if time_field and (start_converted or end_converted):
791
+ if time_field in batch_df.columns:
792
+ if start_converted:
793
+ batch_df = batch_df[
794
+ batch_df[time_field] >= start_converted
795
+ ]
796
+ if end_converted:
797
+ batch_df = batch_df[
798
+ batch_df[time_field] < end_converted
799
+ ]
800
+
801
+ if len(batch_df) == 0:
802
+ continue
803
+
804
+ # Process the batch (decode structs, flatten, reconstruct ObjectIds)
805
+ processed_df = self._process_dataframe(
806
+ batch_df, "pandas", schema, coerce
807
+ )
808
+
809
+ batch_count += 1
810
+ total_rows += len(processed_df)
811
+
812
+ yield processed_df
813
+
814
+ except Exception as e:
815
+ if coerce == "error":
816
+ logger.error(f"Error reading batch from {parquet_file}: {e}")
817
+ continue
818
+ raise
819
+
820
+ logger.debug(f"Yielded {batch_count} batches, {total_rows} total rows")
821
+
822
+ def get_globally_sorted_dataframe(
823
+ self,
824
+ sort_spec: List[Tuple[str, int]],
825
+ schema: Optional[Any] = None,
826
+ time_field: Optional[str] = None,
827
+ start_date: Optional[datetime] = None,
828
+ end_date: Optional[datetime] = None,
829
+ coerce: Literal["raise", "error"] = "raise",
830
+ memory_limit_mb: Optional[int] = None,
831
+ threads: Optional[int] = None,
832
+ ) -> pd.DataFrame:
833
+ """
834
+ Return entire globally sorted DataFrame using DuckDB K-way merge.
835
+
836
+ More efficient than iter_globally_sorted_batches() when you want
837
+ the full result, as it avoids batch iteration overhead and just
838
+ fetches all rows at once.
839
+ Use iter_globally_sorted_batches() where streaming is required.
840
+
841
+ Args:
842
+ sort_spec: Sort specification as [(field, direction), ...]
843
+ schema: Schema for ObjectId reconstruction and advanced sorting
844
+ time_field: Field for date filtering
845
+ start_date: Filter data from this date (inclusive, tz-aware)
846
+ end_date: Filter data until this date (exclusive, tz-aware)
847
+ coerce: Error handling mode
848
+ memory_limit_mb: DuckDB memory limit
849
+ threads: DuckDB thread count
850
+
851
+ Returns:
852
+ pd.DataFrame: Complete sorted DataFrame
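+
+ Example (sort fields and schema are illustrative):
+ >>> df = reader.get_globally_sorted_dataframe(
+ ...     sort_spec=[("timestamp", 1), ("metadata.device_id", 1)],
+ ...     schema=schema,
+ ... )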
853
+ """
854
+ if not self.parquet_files:
855
+ return pd.DataFrame()
856
+
857
+ # Expand parent fields to children in schema definition order
858
+ sort_spec = self._expand_parent_sort_fields(sort_spec, schema)
859
+
860
+ # Get list of parquet files
861
+ file_paths = [str(f) for f in self.parquet_files]
862
+
863
+ logger.debug(
864
+ f"DuckDB K-way merge (full): {len(file_paths)} files, sort_spec={sort_spec}"
865
+ )
866
+
867
+ try:
868
+ # Create DuckDB connection
869
+ conn = duckdb.connect(":memory:")
870
+
871
+ # Configure DuckDB to use allocated resources
872
+ if memory_limit_mb:
873
+ conn.execute(f"SET memory_limit = '{memory_limit_mb}MB'")
874
+ logger.info(f"DuckDB memory_limit set to {memory_limit_mb} MB")
875
+
876
+ if threads:
877
+ conn.execute(f"SET threads = {threads}")
878
+ logger.info(f"DuckDB threads set to {threads}")
879
+
880
+ # Build ORDER BY with MongoDB type ordering
881
+ # (same logic as iter_globally_sorted_batches)
882
+ order_clauses = []
883
+ for field_name, direction in sort_spec:
884
+ dir_sql = "ASC" if direction == 1 else "DESC"
885
+ if schema and schema.has_field(field_name):
886
+ field_type = schema.get_field_type(field_name)
887
+ else:
888
+ field_type = None
889
+ is_any = self._is_any_type(field_type) if field_type else True
890
+
891
+ if is_any:
892
+ # Complete MongoDB type ordering for Any() fields
893
+ type_clause = f"""CASE
894
+ WHEN "{field_name}" IS NULL OR "{field_name}".null_value IS TRUE
895
+ THEN 0
896
+ WHEN "{field_name}".float_value IS NOT NULL
897
+ OR "{field_name}".int32_value IS NOT NULL
898
+ OR "{field_name}".int64_value IS NOT NULL
899
+ OR "{field_name}".decimal128_value IS NOT NULL
900
+ THEN 1
901
+ WHEN "{field_name}".string_value IS NOT NULL THEN 2
902
+ WHEN "{field_name}".document_value IS NOT NULL THEN 3
903
+ WHEN "{field_name}".array_value IS NOT NULL THEN 4
904
+ WHEN "{field_name}".binary_value IS NOT NULL THEN 5
905
+ WHEN "{field_name}".objectid_value IS NOT NULL THEN 6
906
+ WHEN "{field_name}".bool_value IS NOT NULL THEN 7
907
+ WHEN "{field_name}".datetime_value IS NOT NULL THEN 8
908
+ WHEN "{field_name}".regex_value IS NOT NULL THEN 9
909
+ ELSE 10
910
+ END {dir_sql}"""
911
+
912
+ # Value comparisons for each type
913
+ num_clause = (
914
+ f'COALESCE("{field_name}".float_value, '
915
+ f'CAST("{field_name}".int32_value AS DOUBLE), '
916
+ f'CAST("{field_name}".int64_value AS DOUBLE)) {dir_sql}'
917
+ )
918
+ str_clause = f'"{field_name}".string_value {dir_sql}'
919
+ doc_clause = f'"{field_name}".document_value {dir_sql}'
920
+ arr_clause = f'"{field_name}".array_value {dir_sql}'
921
+ bin_clause = f'"{field_name}".binary_value {dir_sql}'
922
+ oid_clause = f'"{field_name}".objectid_value {dir_sql}'
923
+ bool_clause = f'"{field_name}".bool_value {dir_sql}'
924
+ date_clause = f'"{field_name}".datetime_value {dir_sql}'
925
+ regex_clause = f'"{field_name}".regex_value {dir_sql}'
926
+
927
+ order_clauses.extend(
928
+ [
929
+ type_clause,
930
+ num_clause,
931
+ str_clause,
932
+ doc_clause,
933
+ arr_clause,
934
+ bin_clause,
935
+ oid_clause,
936
+ bool_clause,
937
+ date_clause,
938
+ regex_clause,
939
+ ]
940
+ )
941
+ else:
942
+ # Simple field - use direct comparison
943
+ order_clauses.append(f'"{field_name}" {dir_sql}')
944
+
945
+ order_by = ", ".join(order_clauses)
946
+ files = ", ".join([f"'{f}'" for f in file_paths])
947
+ query = f"SELECT * FROM read_parquet([{files}]) ORDER BY {order_by}"
948
+
949
+ logger.debug(f"[DuckDB] K-way merge (full): {len(file_paths)} files")
950
+
951
+ # Fetch entire result at once using df()
952
+ df = conn.execute(query).df()
953
+
954
+ # Ensure time field is UTC
955
+ if time_field and time_field in df.columns:
956
+ if pd.api.types.is_datetime64_any_dtype(df[time_field]):
957
+ if df[time_field].dt.tz is not None:
958
+ df[time_field] = df[time_field].dt.tz_convert("UTC")
959
+ else:
960
+ df[time_field] = df[time_field].dt.tz_localize("UTC")
961
+
962
+ # Apply date filtering if needed
963
+ # Convert datetimes to match the column's timezone state
964
+ if time_field and (start_date or end_date) and time_field in df.columns:
965
+ # After the above, time_field is always tz-aware (UTC)
966
+ # So we need tz-aware comparisons
967
+ from datetime import timezone
968
+
969
+ if start_date:
970
+ start_cmp = (
971
+ start_date
972
+ if start_date.tzinfo
973
+ else start_date.replace(tzinfo=timezone.utc)
974
+ )
975
+ df = df[df[time_field] >= start_cmp]
976
+ if end_date:
977
+ end_cmp = (
978
+ end_date
979
+ if end_date.tzinfo
980
+ else end_date.replace(tzinfo=timezone.utc)
981
+ )
982
+ df = df[df[time_field] < end_cmp]
983
+
984
+ # Process the DataFrame (decode structs, reconstruct ObjectIds)
985
+ df = self._process_dataframe(df, "pandas", schema, coerce)
986
+
987
+ conn.close()
988
+ logger.debug(f"DuckDB K-way merge complete: {len(df):,} rows")
990
+
991
+ return df
992
+
993
+ except Exception as e:
994
+ logger.error(f"DuckDB K-way merge failed: {e}")
995
+ raise
996
+
997
+ def _expand_parent_sort_fields(
998
+ self, sort_spec: List[Tuple[str, int]], schema: Optional[Any]
999
+ ) -> List[Tuple[str, int]]:
1000
+ """
1001
+ Expand parent field sorts to their child fields in schema definition order.
1002
+
1003
+ When user sorts by a parent field like "metadata" but the schema has
1004
+ flattened fields like "metadata.device_id", expand to all children.
1005
+
1006
+ Args:
1007
+ sort_spec: Original [(field, direction), ...]
1008
+ schema: XLR8 schema with field definitions
1009
+
1010
+ Returns:
1011
+ Expanded sort spec with parent fields replaced by children
1012
+
1013
+ Raises:
1014
+ ValueError: If field not found and no children exist
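+
+ Example (assuming the schema defines "metadata.device_id" and
+ "metadata.sensor_id"):
+ >>> reader._expand_parent_sort_fields([("metadata", -1)], schema)
+ [('metadata.device_id', -1), ('metadata.sensor_id', -1)]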
1015
+ """
1016
+ if schema is None:
1017
+ return sort_spec
1018
+
1019
+ expanded = []
1020
+ # Schema.fields preserves insertion order (Python 3.7+)
1021
+ all_fields = list(schema.fields.keys())
1022
+
1023
+ for field_name, direction in sort_spec:
1024
+ if schema.has_field(field_name):
1025
+ # Field exists directly in schema
1026
+ expanded.append((field_name, direction))
1027
+ else:
1028
+ # Look for child fields with this prefix (in schema order)
1029
+ prefix = f"{field_name}."
1030
+ children = [f for f in all_fields if f.startswith(prefix)]
1031
+
1032
+ if children:
1033
+ logger.info(
1034
+ f"Sort field '{field_name}' expanded to children "
1035
+ f"(schema order): {children}"
1036
+ )
1037
+ for child in children:
1038
+ expanded.append((child, direction))
1039
+ else:
1040
+ raise ValueError(
1041
+ f"Sort field '{field_name}' not found in schema "
1042
+ f"and has no child fields. "
1043
+ f"Available fields: {sorted(all_fields)[:10]}"
1044
+ + ("..." if len(all_fields) > 10 else "")
1045
+ )
1046
+
1047
+ return expanded
1048
+
1049
+ def iter_globally_sorted_batches(
1050
+ self,
1051
+ sort_field: Optional[str] = None,
1052
+ ascending: bool = True,
1053
+ batch_size: int = DEFAULT_BATCH_SIZE,
1054
+ schema: Optional[Any] = None,
1055
+ time_field: Optional[str] = None,
1056
+ start_date: Optional[datetime] = None,
1057
+ end_date: Optional[datetime] = None,
1058
+ coerce: Literal["raise", "error"] = "raise",
1059
+ sort_spec: Optional[List[Tuple[str, int]]] = None,
1060
+ # DuckDB configuration
1061
+ memory_limit_mb: Optional[int] = None,
1062
+ threads: Optional[int] = None,
1063
+ ) -> Generator[pd.DataFrame, None, None]:
1064
+ """
1065
+ Yield globally sorted batches using DuckDB K-way merge.
1066
+
1067
+ This method reads all Parquet files in the cache directory and
1068
+ yields batches in globally sorted order. Uses a DuckDB K-way merge
1069
+ with an ORDER BY that emulates MongoDB BSON type ordering for compatibility.
1070
+
1071
+ Supports advanced sorting:
1072
+ - Parent fields (e.g., "metadata" expands to all child fields)
1073
+ - Types.Any() with full MongoDB BSON type ordering (Objects, Arrays, Binary)
1074
+
1075
+ RAM Usage:
1076
+ O(K × batch_size) where K = number of files.
1077
+ This is already bounded by the flush_ram_limit_mb setting.
1078
+
1079
+ Args:
1080
+ sort_field: Field to sort by (use sort_spec for multi-field sorting).
1081
+ ascending: Sort direction (use sort_spec for mixed directions).
1082
+ batch_size: Number of rows per yielded DataFrame (default: 10,000)
1083
+ schema: Schema for ObjectId reconstruction and advanced sorting
1084
+ time_field: Field for date filtering (usually same as sort_field)
1085
+ start_date: Filter data from this date (inclusive, tz-aware)
1086
+ end_date: Filter data until this date (exclusive, tz-aware)
1087
+ coerce: Error handling mode ("raise" or "error")
1088
+ sort_spec: Sort specification as [(field, direction), ...] where
1089
+ direction is 1 (ASC) or -1 (DESC). Preferred over sort_field.
1090
+
1091
+ Yields:
1092
+ pd.DataFrame: Batches in globally sorted order
1093
+
1094
+ Example:
1095
+ >>> reader = ParquetReader(".cache/abc123def")
1096
+ >>> # Simple sort
1097
+ >>> for batch in reader.iter_globally_sorted_batches(
1098
+ ... sort_spec=[("timestamp", 1)],
1099
+ ... schema=schema,
1100
+ ... batch_size=10_000
1101
+ ... ):
1102
+ ... process(batch)
1103
+ >>>
1104
+ >>> # Advanced: parent field + Any type
1105
+ >>> for batch in reader.iter_globally_sorted_batches(
1106
+ ... sort_spec=[("metadata", -1), ("value", 1)],
1107
+ ... schema=schema,
1108
+ ... ):
1109
+ ... process(batch)
1110
+ """
1111
+
1112
+ if not self.parquet_files:
1113
+ return
1114
+
1115
+ # Handle backwards compatibility
1116
+ if sort_spec is None and sort_field is not None:
1117
+ direction = 1 if ascending else -1
1118
+ sort_spec = [(sort_field, direction)]
1119
+
1120
+ if sort_spec is None:
1121
+ raise ValueError("sort_spec or sort_field is required")
1122
+
1123
+ # Expand parent fields to children in schema definition order
1124
+ sort_spec = self._expand_parent_sort_fields(sort_spec, schema)
1125
+
1126
+ # Get list of parquet files
1127
+ file_paths = [str(f) for f in self.parquet_files]
1128
+
1129
+ logger.debug(
1130
+ f"DuckDB K-way merge: {len(file_paths)} files, sort_spec={sort_spec}"
1131
+ )
1132
+
1133
+ try:
1134
+ # Create DuckDB connection
1135
+ conn = duckdb.connect(":memory:")
1136
+
1137
+ # Configure DuckDB to use allocated resources
1138
+ if memory_limit_mb:
1139
+ conn.execute(f"SET memory_limit = '{memory_limit_mb}MB'")
1140
+ logger.info(f"DuckDB memory_limit set to {memory_limit_mb} MB")
1141
+
1142
+ if threads:
1143
+ conn.execute(f"SET threads = {threads}")
1144
+ logger.info(f"DuckDB threads set to {threads}")
1145
+
1146
+ # Query DuckDB settings to verify
1147
+ memory_result = conn.execute(
1148
+ "SELECT current_setting('memory_limit')"
1149
+ ).fetchone()
1150
+ actual_memory = memory_result[0] if memory_result else "unknown"
1151
+ threads_result = conn.execute(
1152
+ "SELECT current_setting('threads')"
1153
+ ).fetchone()
1154
+ actual_threads = threads_result[0] if threads_result else "unknown"
1155
+ logger.debug(
1156
+ f"DuckDB configured: memory={actual_memory}, threads={actual_threads}"
1157
+ )
1158
+
1159
+ # Build ORDER BY with MongoDB type ordering
1160
+ order_clauses = []
1161
+ for field_name, direction in sort_spec:
1162
+ dir_sql = "ASC" if direction == 1 else "DESC"
1163
+ # Check if field exists in schema before getting type
1164
+ if schema and schema.has_field(field_name):
1165
+ field_type = schema.get_field_type(field_name)
1166
+ else:
1167
+ field_type = None
1168
+ is_any = self._is_any_type(field_type) if field_type else True
1169
+
1170
+ if is_any:
1171
+ # Complete MongoDB type ordering for Any() fields:
1172
+ # Reference: https://www.mongodb.com/docs/manual/reference/bson-type-comparison-order/
1173
+ # 1. MinKey (internal)
1174
+ # 2. Null
1175
+ # 3. Numbers (int, long, double, decimal)
1176
+ # 4. Symbol, String
1177
+ # 5. Object
1178
+ # 6. Array
1179
+ # 7. BinData
1180
+ # 8. ObjectId
1181
+ # 9. Boolean
1182
+ # 10. Date
1183
+ # 11. Timestamp
1184
+ # 12. Regular Expression
1185
+ # 13. MaxKey (internal)
1186
+
1187
+ # Type priority clause
1188
+ type_clause = f"""CASE
1189
+ WHEN "{field_name}" IS NULL OR "{field_name}".null_value IS TRUE
1190
+ THEN 0
1191
+ WHEN "{field_name}".float_value IS NOT NULL
1192
+ OR "{field_name}".int32_value IS NOT NULL
1193
+ OR "{field_name}".int64_value IS NOT NULL
1194
+ OR "{field_name}".decimal128_value IS NOT NULL
1195
+ THEN 1
1196
+ WHEN "{field_name}".string_value IS NOT NULL THEN 2
1197
+ WHEN "{field_name}".document_value IS NOT NULL THEN 3
1198
+ WHEN "{field_name}".array_value IS NOT NULL THEN 4
1199
+ WHEN "{field_name}".binary_value IS NOT NULL THEN 5
1200
+ WHEN "{field_name}".objectid_value IS NOT NULL THEN 6
1201
+ WHEN "{field_name}".bool_value IS NOT NULL THEN 7
1202
+ WHEN "{field_name}".datetime_value IS NOT NULL THEN 8
1203
+ WHEN "{field_name}".regex_value IS NOT NULL THEN 9
1204
+ ELSE 10
1205
+ END {dir_sql}"""
1206
+
1207
+ # Value comparisons for each type
1208
+ num_clause = (
1209
+ f'COALESCE("{field_name}".float_value, '
1210
+ f'CAST("{field_name}".int32_value AS DOUBLE), '
1211
+ f'CAST("{field_name}".int64_value AS DOUBLE)) {dir_sql}'
1212
+ )
1213
+ str_clause = f'"{field_name}".string_value {dir_sql}'
1214
+ # JSON strings compare lexicographically
1215
+ doc_clause = f'"{field_name}".document_value {dir_sql}'
1216
+ # JSON arrays compare lexicographically
1217
+ arr_clause = f'"{field_name}".array_value {dir_sql}'
1218
+ bin_clause = f'"{field_name}".binary_value {dir_sql}'
1219
+ oid_clause = f'"{field_name}".objectid_value {dir_sql}'
1220
+ bool_clause = f'"{field_name}".bool_value {dir_sql}'
1221
+ date_clause = f'"{field_name}".datetime_value {dir_sql}'
1222
+ regex_clause = f'"{field_name}".regex_value {dir_sql}'
1223
+
1224
+ order_clauses.extend(
1225
+ [
1226
+ type_clause,
1227
+ num_clause,
1228
+ str_clause,
1229
+ doc_clause,
1230
+ arr_clause,
1231
+ bin_clause,
1232
+ oid_clause,
1233
+ bool_clause,
1234
+ date_clause,
1235
+ regex_clause,
1236
+ ]
1237
+ )
1238
+ else:
1239
+ # Simple field - use direct comparison
1240
+ order_clauses.append(f'"{field_name}" {dir_sql}')
1241
+
1242
+ order_by = ", ".join(order_clauses)
1243
+ files = ", ".join([f"'{f}'" for f in file_paths])
1244
+ query = f"SELECT * FROM read_parquet([{files}]) ORDER BY {order_by}"
1245
+
1246
+ result = conn.execute(query)
1247
+
1248
+ # Use fetchmany() cursor API - this ACTUALLY streams incrementally
1249
+ # without loading all data into memory (unlike fetch_df_chunk)
1250
+ # NOTE: DuckDB's k-way merge uses internal buffering
1251
+ # separate from batch_size.
1252
+ # batch_size only controls how much we pull at once,
1253
+ # not DuckDB's merge buffer.
1254
+ batch_count = 0
1255
+ total_rows = 0
1256
+ column_names = [desc[0] for desc in result.description]
1257
+
1258
+ logger.debug(
1259
+ f"[DuckDB] K-way merge started: {len(file_paths)} files, "
1260
+ f"batch_size={batch_size:,}"
1261
+ )
1262
+
1263
+ while True:
1264
+ # Fetch batch as list of tuples
1265
+ rows = result.fetchmany(batch_size)
1266
+ if not rows:
1267
+ break
1268
+
1269
+ batch_count += 1
1270
+ total_rows += len(rows)
1271
+
1272
+ # Convert to DataFrame
1273
+ batch_df = pd.DataFrame(rows, columns=column_names)
1274
+ logger.debug(
1275
+ f"Streamed batch {batch_count}: {len(batch_df)} rows "
1276
+ f"from DuckDB K-way merge"
1277
+ )
1278
+
1279
+ # Ensure time field is UTC (DuckDB might return naive)
1280
+ if time_field and time_field in batch_df.columns:
1281
+ if pd.api.types.is_datetime64_any_dtype(batch_df[time_field]):
1282
+ if batch_df[time_field].dt.tz is not None:
1283
+ batch_df[time_field] = batch_df[time_field].dt.tz_convert(
1284
+ "UTC"
1285
+ )
1286
+ else:
1287
+ batch_df[time_field] = batch_df[time_field].dt.tz_localize(
1288
+ "UTC"
1289
+ )
1290
+
1291
+ # Apply date filtering if needed
1292
+ # After UTC conversion above, time_field is tz-aware
1293
+ if time_field and (start_date or end_date):
1294
+ from datetime import timezone
1295
+
1296
+ if start_date:
1297
+ start_cmp = (
1298
+ start_date
1299
+ if start_date.tzinfo
1300
+ else start_date.replace(tzinfo=timezone.utc)
1301
+ )
1302
+ batch_df = batch_df[batch_df[time_field] >= start_cmp]
1303
+ if end_date:
1304
+ end_cmp = (
1305
+ end_date
1306
+ if end_date.tzinfo
1307
+ else end_date.replace(tzinfo=timezone.utc)
1308
+ )
1309
+ batch_df = batch_df[batch_df[time_field] < end_cmp]
1310
+ if len(batch_df) == 0:
1311
+ continue
1312
+
1313
+ # Process the batch (decode structs, reconstruct ObjectIds)
1314
+ processed_df = self._process_dataframe(
1315
+ batch_df, "pandas", schema, coerce
1316
+ )
1317
+ yield processed_df
1318
+
1319
+ conn.close()
1320
+ logger.debug("DuckDB K-way merge complete")
1321
+
1322
+ except Exception as e:
1323
+ if coerce == "error":
1324
+ logger.error(f"Error in globally sorted streaming: {e}")
1325
+ return
1326
+ raise
1327
+
1328
+ def get_statistics(self) -> Dict[str, Any]:
1329
+ """
1330
+ Get statistics about cached data.
1331
+
1332
+ Returns:
1333
+ Dict with file count, total rows, size, schema info
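+
+ Example (values are illustrative):
+ >>> reader.get_statistics()  # doctest: +SKIP
+ {'file_count': 4, 'total_rows': 1000000, 'total_size_mb': 52.3,
+ 'schema_fields': ['timestamp', 'metadata', 'value'],
+ 'cache_dir': '.cache/abc123def'}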
1334
+ """
1335
+ total_rows = 0
1336
+ total_size = 0
1337
+ schema = None
1338
+
1339
+ for parquet_file in self.parquet_files:
1340
+ # File size
1341
+ total_size += parquet_file.stat().st_size
1342
+
1343
+ # Read metadata
1344
+ parquet_meta = pq.read_metadata(parquet_file)
1345
+ total_rows += parquet_meta.num_rows
1346
+
1347
+ # Get schema from first file
1348
+ if schema is None:
1349
+ schema = parquet_meta.schema.to_arrow_schema()
1350
+
1351
+ return {
1352
+ "file_count": len(self.parquet_files),
1353
+ "total_rows": total_rows,
1354
+ "total_size_mb": round(total_size / (1024 * 1024), 2),
1355
+ "schema_fields": [field.name for field in schema] if schema else [],
1356
+ "cache_dir": str(self.cache_dir),
1357
+ }
1358
+
1359
+ def __repr__(self) -> str:
1360
+ stats = self.get_statistics()
1361
+ return (
1362
+ f"ParquetReader(files={stats['file_count']}, "
1363
+ f"rows={stats['total_rows']:,}, "
1364
+ f"size={stats['total_size_mb']:.1f}MB)"
1365
+ )
1366
+
1367
+ def __len__(self) -> int:
1368
+ """Return total number of rows across all files."""
1369
+ return self.get_statistics()["total_rows"]