xlr8 0.1.2__py3-none-any.whl

xlr8/storage/reader.py ADDED
@@ -0,0 +1,1276 @@
1
+ """
2
+ Parquet file reader for cache-aware loading.
3
+
4
+ This module reads Parquet files written by the Rust backend and converts them
5
+ back into DataFrames with proper value decoding and type reconstruction.
6
+
7
+ DATA FLOW
8
+ =========
9
+
10
+ STEP 1: DISCOVER RUST-WRITTEN FILES
11
+ ------------------------------------
12
+ The Rust backend (rust_backend.fetch_chunks_bson) writes Parquet files with
13
+ timestamp-based naming derived from actual document data:
14
+
15
+ cache_dir/.cache/abc123def/
16
+ ts_1704067200_1704070800_part_0000.parquet
17
+ ts_1704070801_1704074400_part_0000.parquet
18
+ ts_1704074401_1704078000_part_0000.parquet
19
+ ts_1704078001_1704081600_part_0000.parquet
20
+ ...
21
+
22
+ Filename format: ts_{min_sec}_{max_sec}_part_{counter:04}.parquet
23
+ - min_sec: Unix timestamp (seconds) of earliest document in file
24
+ - max_sec: Unix timestamp (seconds) of latest document in file
25
+ - counter: Per-worker sequential counter (0000, 0001, 0002, ...)
26
+ Only increments if same worker writes multiple files with identical timestamps
27
+
28
+ How timestamps ensure uniqueness:
29
+ - Each chunk/bracket targets different time ranges
30
+ - Multiple workers process non-overlapping time ranges
31
+ - Natural file separation by actual data timestamps
32
+ - Counter only needed if worker flushes multiple batches with identical ranges
33
+
34
+ Fallback format (no timestamps): part_{counter:04}.parquet
35
+ Used when time_field is None or documents lack timestamps
36
+
37
+
38
+ STEP 2: READ & CONCATENATE
39
+ ---------------------------
40
+ Pandas: Read all files sequentially, concatenate into single DataFrame
41
+ Polars: Read all files in parallel (native multi-file support)
42
+
43
+ Both engines use PyArrow under the hood for efficient Parquet parsing.
44
+
45
+
46
+ STEP 3: DECODE TYPES.ANY STRUCT VALUES
47
+ ---------------------------------------
48
+ Types.Any fields are encoded as Arrow structs by Rust backend:
49
+
50
+ Parquet stores:
51
+ {
52
+ "value": {
53
+ "float_value": 42.5,
54
+ "int_value": null,
55
+ "string_value": null,
56
+ "bool_value": null,
57
+ ...
58
+ }
59
+ }
60
+
61
+ After decoding (coalesce first non-null field):
62
+ {"value": 42.5}
63
+
64
+ This decoding happens in Rust via decode_any_struct_arrow() for maximum
65
+ performance.
66
+
67
+
68
+ STEP 4: FLATTEN NESTED STRUCTS
69
+ -------------------------------
70
+ Convert nested struct columns to dotted field names:
71
+
72
+ Before: {"metadata": {"device_id": "123...", "sensor_id": "456..."}}
73
+ After: {"metadata.device_id": "123...", "metadata.sensor_id": "456..."}
74
+
75
+
76
+ STEP 5: RECONSTRUCT OBJECTIDS
77
+ ------------------------------
78
+ Convert string-encoded ObjectIds back to bson.ObjectId instances:
79
+
80
+ "507f1f77bcf86cd799439011" -> ObjectId("507f1f77bcf86cd799439011")
81
+
82
+
83
+ OUTPUT: DataFrame (pandas or Polars; batch methods stream pandas DataFrames)
84
+ -----------------
85
+ timestamp metadata.device_id value
86
+ 0 2024-01-15 12:00 64a1b2c3... 42.5
87
+ 1 2024-01-15 12:01 64a1b2c3... 43.1
88
+ 2 2024-01-15 12:02 64a1b2c3... "active"
89
+
90
+ """
91
+
92
+ import logging
93
+ from datetime import datetime
94
+ from pathlib import Path
95
+ from typing import Any, Dict, Generator, Iterator, List, Literal, Optional, Tuple, Union
96
+
97
+ import duckdb
98
+ import pandas as pd
99
+ import polars as pl
100
+ import pyarrow as pa
101
+ import pyarrow.parquet as pq
102
+ from bson import ObjectId
103
+
104
+ from xlr8.constants import DEFAULT_BATCH_SIZE
105
+
106
+ logger = logging.getLogger(__name__)
107
+
108
+
109
+ class ParquetReader:
110
+ """
111
+ Reads Parquet files from cache directory.
112
+
113
+ Provides streaming and batch reading of documents from Parquet files.
114
+ Supports reading all files in a cache directory or specific partitions.
115
+
116
+ Example:
117
+ >>> reader = ParquetReader(cache_dir=".cache/abc123def")
118
+ >>>
119
+ >>> # Stream all documents
120
+ >>> for doc in reader.iter_documents():
121
+ ... print(doc)
122
+ >>>
123
+ >>> # Or load to DataFrame
124
+ >>> df = reader.to_dataframe()
125
+ """
126
+
127
+ def __init__(self, cache_dir: Union[str, Path]):
128
+ """
129
+ Initialize reader for cache directory.
130
+
131
+ Args:
132
+ cache_dir: Directory containing parquet files
133
+ """
134
+ self.cache_dir = Path(cache_dir)
135
+
136
+ if not self.cache_dir.exists():
137
+ raise FileNotFoundError(f"Cache directory not found: {cache_dir}")
138
+
139
+ # Find all parquet files (may be empty if query returned no results)
140
+ self.parquet_files = sorted(self.cache_dir.glob("*.parquet"))
141
+
142
+ def iter_documents(
143
+ self,
144
+ batch_size: int = DEFAULT_BATCH_SIZE,
145
+ ) -> Iterator[Dict[str, Any]]:
146
+ """
147
+ Stream documents from all parquet files.
148
+
149
+ Reads in batches to avoid loading entire dataset into memory.
150
+
151
+ Args:
152
+ batch_size: Number of rows to read per batch
153
+
154
+ Yields:
155
+ Document dictionaries
156
+
157
+ Example:
158
+ >>> for doc in reader.iter_documents(batch_size=5000):
159
+ ... process(doc)
160
+ """
161
+ for parquet_file in self.parquet_files:
162
+ # Read in batches
163
+ parquet_file_obj = pq.ParquetFile(parquet_file)
164
+
165
+ for batch in parquet_file_obj.iter_batches(batch_size=batch_size):
166
+ # Convert Arrow batch to pandas then to dicts
167
+ df_batch = batch.to_pandas()
168
+
169
+ for _, row in df_batch.iterrows():
170
+ yield row.to_dict()
171
+
172
+ def _is_any_type(self, field_type: Any) -> bool:
173
+ """Check if field_type is an Any type (supports both class and instance)."""
174
+ from xlr8.schema.types import Any as AnyType
175
+
176
+ # Support both Types.Any (class) and Types.Any() (instance)
177
+ if isinstance(field_type, AnyType):
178
+ return True
179
+ if isinstance(field_type, type) and issubclass(field_type, AnyType):
180
+ return True
181
+ return False
182
+
183
+ def _decode_struct_values(self, df: pd.DataFrame, schema: Any) -> pd.DataFrame:
184
+ """
185
+ Decode struct-encoded Any-typed columns back to actual values.
186
+
187
+ For columns marked as Any type in schema, extracts the actual value
188
+ from the struct bitmap representation (float_value, int_value, etc.).
189
+
190
+ Uses Rust Arrow-native decoding for maximum performance (~40x faster).
191
+
192
+ Note: This is a fallback path. The fast path decodes directly from Arrow
193
+ before to_pandas() conversion, avoiding dict overhead entirely.
194
+ """
195
+ if not hasattr(schema, "fields"):
196
+ return df
197
+
198
+ # Import Rust Arrow-native decoder (required)
199
+ from xlr8.rust_backend import decode_any_struct_arrow
200
+
201
+ # Find Any-typed fields in schema
202
+ for field_name, field_type in schema.fields.items():
203
+ if self._is_any_type(field_type) and field_name in df.columns:
204
+ # Column contains struct-encoded values (dicts)
205
+ col = df[field_name]
206
+
207
+ if len(col) == 0:
208
+ continue
209
+
210
+ # Check if it's a struct (dict) column - skip if already decoded
211
+ first_val = col.iloc[0]
212
+ if not isinstance(first_val, dict):
213
+ # Already decoded in fast path - skip
214
+ continue
215
+
216
+ # Build struct type dynamically based on the dict keys
217
+ sample_dict = first_val
218
+ struct_fields = []
219
+ field_type_map = {
220
+ "float_value": pa.float64(),
221
+ "int32_value": pa.int32(),
222
+ "int64_value": pa.int64(),
223
+ "string_value": pa.string(),
224
+ "objectid_value": pa.string(),
225
+ "decimal128_value": pa.string(),
226
+ "regex_value": pa.string(),
227
+ "binary_value": pa.string(),
228
+ "document_value": pa.string(),
229
+ "array_value": pa.string(),
230
+ "bool_value": pa.bool_(),
231
+ "datetime_value": pa.timestamp("ms"), # Use ms for new schema
232
+ "null_value": pa.bool_(),
233
+ }
234
+
235
+ for key in sample_dict.keys():
236
+ if key in field_type_map:
237
+ struct_fields.append((key, field_type_map[key]))
238
+
239
+ any_struct_type = pa.struct(struct_fields)
240
+
241
+ # Convert to PyArrow array - this is a single pass over the data
242
+ arrow_array = pa.array(col.tolist(), type=any_struct_type)
243
+
244
+ # Decode in Rust - direct memory access to Arrow memory
245
+ decoded_values = decode_any_struct_arrow(arrow_array)
246
+ df[field_name] = decoded_values
247
+
248
+ return df
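For intuition, the result of the decode_any_struct_arrow() call used above can be pictured with a plain-pandas stand-in (illustrative only, assuming the struct column arrived as Python dicts; the real path stays in Rust/Arrow):

    import pandas as pd

    df = pd.DataFrame({"value": [
        {"float_value": 42.5, "string_value": None},
        {"float_value": None, "string_value": "active"},
    ]})
    # Stand-in for the Rust decoder: keep the first non-null field per row.
    df["value"] = [
        next((v for v in row.values() if v is not None), None)
        for row in df["value"]
    ]
    print(df["value"].tolist())  # [42.5, 'active']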
249
+
250
+ def _flatten_struct_columns(self, df: pd.DataFrame) -> pd.DataFrame:
251
+ """
252
+ Flatten nested struct columns into separate columns.
253
+
254
+ Example:
255
+ metadata: {'sensor_id': '...', 'device_id': '...'}
256
+ -> metadata.sensor_id: '...', metadata.device_id: '...'
257
+
258
+ """
259
+ if df.empty:
260
+ return df
261
+
262
+ struct_cols = []
263
+ for col in df.columns:
264
+ # Check if column contains dicts (structs)
265
+ if len(df) > 0 and isinstance(df[col].iloc[0], dict):
266
+ struct_cols.append(col)
267
+
268
+ for col in struct_cols:
269
+ # FAST PATH: Extract struct fields directly using list comprehension
270
+ # This is ~5x faster than pd.json_normalize() for large datasets
271
+ col_values = df[col].tolist()
272
+
273
+ # Detect subcolumns from first non-null row
274
+ first_val = col_values[0] if col_values else {}
275
+ subcolumns = list(first_val.keys()) if isinstance(first_val, dict) else []
276
+
277
+ # Build new columns efficiently
278
+ new_cols = {}
279
+ for subcol in subcolumns:
280
+ new_col_name = f"{col}.{subcol}"
281
+ new_cols[new_col_name] = [
282
+ row.get(subcol) if isinstance(row, dict) else None
283
+ for row in col_values
284
+ ]
285
+
286
+ # Drop original struct column
287
+ df = df.drop(columns=[col])
288
+
289
+ # Add flattened columns
290
+ for new_col_name, values in new_cols.items():
291
+ df[new_col_name] = values
292
+
293
+ return df
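A quick standalone check of the flattening rule (made-up values):

    import pandas as pd

    df = pd.DataFrame({"metadata": [
        {"device_id": "123", "sensor_id": "456"},
        {"device_id": "789", "sensor_id": "012"},
    ]})
    # Same fast-path idea: one new dotted column per struct key.
    flat = pd.DataFrame({
        f"metadata.{key}": [row.get(key) for row in df["metadata"]]
        for key in df["metadata"].iloc[0]
    })
    print(list(flat.columns))  # ['metadata.device_id', 'metadata.sensor_id']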
294
+
295
+ def _reconstruct_objectids(self, df: pd.DataFrame, schema: Any) -> pd.DataFrame:
296
+ """
297
+ Reconstruct ObjectId columns from string representation.
298
+
299
+ Converts string ObjectIds back to bson.ObjectId instances.
300
+ """
301
+ from xlr8.schema.types import ObjectId as ObjectIdType
302
+
303
+ # Find all ObjectId fields in schema (including nested ones)
304
+ objectid_fields = []
305
+
306
+ if hasattr(schema, "fields"):
307
+ for field_name, field_type in schema.fields.items():
308
+ if isinstance(field_type, ObjectIdType):
309
+ objectid_fields.append(field_name)
310
+ elif hasattr(field_type, "fields"):
311
+ # Nested struct with ObjectId fields
312
+ for nested_name, nested_type in field_type.fields.items():
313
+ if isinstance(nested_type, ObjectIdType):
314
+ objectid_fields.append(f"{field_name}.{nested_name}")
315
+
316
+ # Convert string columns back to ObjectId
317
+ for field in objectid_fields:
318
+ if field in df.columns:
319
+ df[field] = df[field].apply(
320
+ lambda x: ObjectId(x) if x and pd.notna(x) else x
321
+ )
322
+
323
+ return df
324
+
325
+ def _decode_struct_values_polars(
326
+ self,
327
+ df: "pl.DataFrame",
328
+ schema: Any,
329
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
330
+ ) -> "pl.DataFrame":
331
+ """
332
+ Decode struct-encoded Any-typed columns back to actual values (Polars).
333
+
334
+ Args:
335
+ df: Polars DataFrame
336
+ schema: Schema with field type info
337
+ any_type_strategy: How to decode:
338
+ - "float": Coalesce to Float64, prioritize numeric (default)
339
+ - "string": Convert everything to string (lossless)
340
+ - "keep_struct": Keep raw struct, don't decode
341
+ """
342
+ if not hasattr(schema, "fields"):
343
+ return df
344
+
345
+ # Find Any-typed fields in schema
346
+ for field_name, field_type in schema.fields.items():
347
+ if self._is_any_type(field_type) and field_name in df.columns:
348
+ # Check if column is a struct
349
+ col_dtype = df.schema[field_name]
350
+ if str(col_dtype).startswith("Struct"):
351
+ # Strategy: keep_struct - don't decode at all
352
+ if any_type_strategy == "keep_struct":
353
+ continue
354
+
355
+ try:
356
+ # Get field names from the struct
357
+ struct_fields = (
358
+ col_dtype.fields if hasattr(col_dtype, "fields") else []
359
+ ) # type: ignore[attr-defined]
360
+ field_names = (
361
+ [f.name for f in struct_fields] if struct_fields else []
362
+ )
363
+
364
+ if any_type_strategy == "string":
365
+ # Convert ALL value types to string
366
+ coalesce_exprs = []
367
+
368
+ # String first (already string)
369
+ if "string_value" in field_names:
370
+ coalesce_exprs.append(
371
+ pl.col(field_name).struct.field("string_value")
372
+ )
373
+
374
+ # Float to string
375
+ if "float_value" in field_names:
376
+ coalesce_exprs.append(
377
+ pl.col(field_name)
378
+ .struct.field("float_value")
379
+ .cast(pl.Utf8)
380
+ )
381
+
382
+ # Int to string
383
+ for int_name in ["int64_value", "int32_value"]:
384
+ if int_name in field_names:
385
+ coalesce_exprs.append(
386
+ pl.col(field_name)
387
+ .struct.field(int_name)
388
+ .cast(pl.Utf8)
389
+ )
390
+
391
+ # Bool to string
392
+ if "bool_value" in field_names:
393
+ coalesce_exprs.append(
394
+ pl.col(field_name)
395
+ .struct.field("bool_value")
396
+ .cast(pl.Utf8)
397
+ )
398
+
399
+ # ObjectId, decimal, etc. (already strings)
400
+ for str_field in [
401
+ "objectid_value",
402
+ "decimal128_value",
403
+ "regex_value",
404
+ "binary_value",
405
+ "document_value",
406
+ "array_value",
407
+ ]:
408
+ if str_field in field_names:
409
+ coalesce_exprs.append(
410
+ pl.col(field_name).struct.field(str_field)
411
+ )
412
+
413
+ if coalesce_exprs:
414
+ df = df.with_columns(
415
+ pl.coalesce(coalesce_exprs).alias(field_name)
416
+ )
417
+
418
+ else: # "float" strategy (default)
419
+ # Coalesce to Float64, prioritize numeric
420
+ coalesce_exprs = []
421
+
422
+ # Try float first (highest precision)
423
+ if "float_value" in field_names:
424
+ coalesce_exprs.append(
425
+ pl.col(field_name).struct.field("float_value")
426
+ )
427
+
428
+ # Try various int types, cast to float
429
+ for int_name in ["int64_value", "int32_value"]:
430
+ if int_name in field_names:
431
+ coalesce_exprs.append(
432
+ pl.col(field_name)
433
+ .struct.field(int_name)
434
+ .cast(pl.Float64)
435
+ )
436
+
437
+ # Try bool (as 0.0/1.0)
438
+ if "bool_value" in field_names:
439
+ coalesce_exprs.append(
440
+ pl.col(field_name)
441
+ .struct.field("bool_value")
442
+ .cast(pl.Float64)
443
+ )
444
+
445
+ if coalesce_exprs:
446
+ if len(coalesce_exprs) == 1:
447
+ df = df.with_columns(
448
+ coalesce_exprs[0].alias(field_name)
449
+ )
450
+ else:
451
+ df = df.with_columns(
452
+ pl.coalesce(coalesce_exprs).alias(field_name)
453
+ )
454
+ else:
455
+ logger.warning(
456
+ "Could not decode struct column '%s': "
457
+ "no numeric fields in %s",
458
+ field_name,
459
+ field_names,
460
+ )
461
+ except (AttributeError, KeyError, ValueError) as e:
462
+ logger.warning("Error decoding struct '%s': %s", field_name, e)
463
+
464
+ return df
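The default "float" strategy above is essentially a coalesce over the numeric struct fields; a self-contained Polars sketch with an invented column:

    import polars as pl

    df = pl.DataFrame({"value": [
        {"float_value": 42.5, "int64_value": None, "bool_value": None},
        {"float_value": None, "int64_value": 7, "bool_value": None},
        {"float_value": None, "int64_value": None, "bool_value": True},
    ]})
    df = df.with_columns(
        pl.coalesce([
            pl.col("value").struct.field("float_value"),
            pl.col("value").struct.field("int64_value").cast(pl.Float64),
            pl.col("value").struct.field("bool_value").cast(pl.Float64),
        ]).alias("value")
    )
    print(df["value"].to_list())  # [42.5, 7.0, 1.0]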
465
+
466
+ def _process_dataframe(
467
+ self,
468
+ df: Union[pd.DataFrame, "pl.DataFrame"],
469
+ engine: Literal["pandas", "polars"],
470
+ schema: Optional[Any] = None,
471
+ coerce: Literal["raise", "error"] = "raise",
472
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
473
+ ) -> Union[pd.DataFrame, "pl.DataFrame"]:
474
+ """
475
+ Process DataFrame: decode struct values, flatten structs and
476
+ reconstruct ObjectIds.
477
+
478
+ Args:
479
+ df: DataFrame to process
480
+ engine: "pandas" or "polars"
481
+ schema: Schema for ObjectId reconstruction
482
+ coerce: Error handling mode ("raise" or "error")
483
+ any_type_strategy: How to decode Any() structs in Polars
484
+ (float/string/keep_struct)
485
+
486
+ Returns:
487
+ Processed DataFrame
488
+ """
489
+ if engine == "pandas":
490
+ # First, decode Any-typed struct columns back to actual values
491
+ if schema is not None:
492
+ try:
493
+ df = self._decode_struct_values(df, schema) # type: ignore[arg-type]
494
+ except (AttributeError, KeyError, ValueError, TypeError) as e:
495
+ if coerce == "error":
496
+ logger.error("Error decoding struct values: %s", e)
497
+ else:
498
+ raise
499
+
500
+ # Flatten struct columns (e.g., metadata -> metadata.sensor_id)
501
+ df = self._flatten_struct_columns(df) # type: ignore[arg-type]
502
+
503
+ # Reconstruct ObjectIds from strings
504
+ if schema is not None:
505
+ try:
506
+ df = self._reconstruct_objectids(df, schema)
507
+ except (AttributeError, KeyError, ValueError, TypeError) as e:
508
+ if coerce == "error":
509
+ logger.error("Error reconstructing ObjectIds: %s", e)
510
+ else:
511
+ raise
512
+
513
+ return df
514
+ elif engine == "polars":
515
+ # Polars: decode Any-typed struct columns and keep dotted column names
516
+ if schema is not None:
517
+ try:
518
+ df = self._decode_struct_values_polars(
519
+ df, schema, any_type_strategy
520
+ ) # type: ignore[arg-type]
521
+ except (AttributeError, KeyError, ValueError, TypeError) as e:
522
+ if coerce == "error":
523
+ logger.error("Error decoding struct values (polars): %s", e)
524
+ else:
525
+ raise
526
+ return df
527
+
528
+ def to_dataframe(
529
+ self,
530
+ engine: str = "pandas",
531
+ schema: Optional[Any] = None,
532
+ time_field: Optional[str] = None,
533
+ start_date: Optional[datetime] = None,
534
+ end_date: Optional[datetime] = None,
535
+ coerce: Literal["raise", "error"] = "raise",
536
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
537
+ ) -> Union[pd.DataFrame, "pl.DataFrame"]:
538
+ """
539
+ Load all parquet files into a DataFrame.
540
+
541
+ Args:
542
+ engine: "pandas" or "polars"
543
+             schema: Schema for ObjectId reconstruction and struct flattening (recommended)
544
+ time_field: Name of time field for date filtering (from schema.time_field)
545
+ start_date: Filter data from this date (inclusive, tz-aware datetime)
546
+ end_date: Filter data until this date (exclusive, tz-aware datetime)
547
+ coerce: Error handling mode:
548
+ - "raise": Raise exceptions on schema validation errors (default)
549
+ - "error": Log errors and store None for invalid values
550
+ any_type_strategy: How to decode Types.Any() struct columns in Polars:
551
+ - "float": Coalesce to Float64, prioritize numeric (default)
552
+ - "string": Convert everything to string (lossless)
553
+ - "keep_struct": Keep raw struct, don't decode
554
+
555
+ Returns:
556
+ DataFrame with all documents (structs flattened, ObjectIds reconstructed)
557
+
558
+ Example:
559
+ >>> df = reader.to_dataframe(
560
+ ... schema=schema,
561
+ ... time_field="timestamp",
562
+ ... start_date=datetime(2024, 6, 1, tzinfo=timezone.utc),
563
+ ... end_date=datetime(2024, 6, 15, tzinfo=timezone.utc),
564
+ ... )
565
+ """
566
+ # Build PyArrow filter for date range (predicate pushdown)
567
+ # Convert datetime to PyArrow timestamp[ms] to match Parquet column type (no tz)
568
+ filters = None
569
+ if time_field and (start_date or end_date):
570
+ filter_conditions = []
571
+ if start_date:
572
+ # Convert to ms timestamp (no tz) to match Parquet storage
573
+ start_ts = pa.scalar(start_date, type=pa.timestamp("ms"))
574
+ filter_conditions.append((time_field, ">=", start_ts))
575
+ if end_date:
576
+ end_ts = pa.scalar(end_date, type=pa.timestamp("ms"))
577
+ filter_conditions.append((time_field, "<", end_ts))
578
+ if filter_conditions:
579
+ filters = filter_conditions
580
+
581
+ if engine == "polars":
582
+ # Return empty DataFrame if no parquet files (query returned no results)
583
+ if not self.parquet_files:
584
+ return pl.DataFrame()
585
+
586
+ # Use scan_parquet for lazy evaluation with predicate pushdown
587
+ # This only reads the row groups that match the filter conditions
588
+ lf = pl.scan_parquet(self.parquet_files)
589
+
590
+ # Apply date filter with predicate pushdown (reads only matching data)
591
+ # Convert datetime to naive (no timezone) to match Parquet column dtype
592
+ if time_field and (start_date or end_date):
593
+ if start_date:
594
+ start_naive = (
595
+ start_date.replace(tzinfo=None)
596
+ if start_date.tzinfo
597
+ else start_date
598
+ )
599
+ lf = lf.filter(pl.col(time_field) >= start_naive)
600
+ if end_date:
601
+ end_naive = (
602
+ end_date.replace(tzinfo=None) if end_date.tzinfo else end_date
603
+ )
604
+ lf = lf.filter(pl.col(time_field) < end_naive)
605
+
606
+ # Collect executes the query with predicate pushdown
607
+ df = lf.collect()
608
+
609
+ return self._process_dataframe(
610
+ df, engine, schema, coerce, any_type_strategy
611
+ )
612
+
613
+ elif engine == "pandas":
614
+ # Return empty DataFrame if no parquet files (query returned no results)
615
+ if not self.parquet_files:
616
+ return pd.DataFrame()
617
+
618
+ # Read all files with optional filter (predicate pushdown)
619
+ # Use PyArrow to read, then convert to pandas - this allows
620
+ # struct columns to stay in Arrow format for fast Rust decoding
621
+ tables = []
622
+ for parquet_file in self.parquet_files:
623
+ try:
624
+ # Use PyArrow filters for efficient predicate pushdown
625
+ table = pq.read_table(parquet_file, filters=filters)
626
+ tables.append(table)
627
+ except Exception as e:
628
+ if coerce == "error":
629
+ logger.error(f"Error reading {parquet_file}: {e}")
630
+ continue
631
+ raise
632
+
633
+ if not tables:
634
+ return pd.DataFrame()
635
+
636
+ # Concatenate Arrow tables
637
+ combined_table = pa.concat_tables(tables)
638
+
639
+ # FAST PATH: Decode Any-typed struct columns directly in Arrow
640
+ # This gives us 44x speedup because Rust reads Arrow memory directly
641
+ # without Python iteration over dicts
642
+ any_columns_decoded = {}
643
+ columns_to_drop = []
644
+ if schema and hasattr(schema, "fields"):
645
+ from xlr8.rust_backend import decode_any_struct_arrow
646
+
647
+ for field_name, field_type in schema.fields.items():
648
+ if (
649
+ self._is_any_type(field_type)
650
+ and field_name in combined_table.column_names
651
+ ):
652
+ col = combined_table.column(field_name)
653
+ if pa.types.is_struct(col.type):
654
+ # Decode in Rust - returns Python list of mixed types
655
+ combined = col.combine_chunks()
656
+ decoded_values = decode_any_struct_arrow(combined)
657
+ any_columns_decoded[field_name] = decoded_values
658
+ # Mark for removal to avoid slow dict conversion
659
+ # in to_pandas()
660
+ columns_to_drop.append(field_name)
661
+
662
+ # Drop decoded struct columns before pandas conversion
663
+ # to avoid dict overhead
664
+ if columns_to_drop:
665
+ combined_table = combined_table.drop(columns_to_drop)
666
+
667
+ # Convert to pandas (non-Any columns go through normal path)
668
+ df = combined_table.to_pandas()
669
+
670
+ # Add back Any columns with decoded values
671
+ # (bypassing struct->dict->decode path)
672
+ for field_name, decoded_values in any_columns_decoded.items():
673
+ df[field_name] = decoded_values
674
+
675
+ return self._process_dataframe(df, engine, schema, coerce)
676
+
677
+ else:
678
+ raise ValueError(f"Unknown engine: {engine}. Use 'pandas' or 'polars'")
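Hedged usage sketch for the Polars path of to_dataframe() (the cache directory is a placeholder and `schema` is assumed to be built elsewhere with xlr8's schema types):

    from datetime import datetime, timezone

    reader = ParquetReader(".cache/abc123def")  # placeholder cache directory
    df = reader.to_dataframe(
        engine="polars",
        schema=schema,                          # assumed to exist
        time_field="timestamp",
        start_date=datetime(2024, 6, 1, tzinfo=timezone.utc),
        end_date=datetime(2024, 6, 15, tzinfo=timezone.utc),
        any_type_strategy="string",             # lossless decoding of Types.Any columns
    )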
679
+
680
+ def iter_dataframe_batches(
681
+ self,
682
+ batch_size: int = 10000,
683
+ schema: Optional[Any] = None,
684
+ time_field: Optional[str] = None,
685
+ start_date: Optional[datetime] = None,
686
+ end_date: Optional[datetime] = None,
687
+ coerce: Literal["raise", "error"] = "raise",
688
+ ) -> Generator[pd.DataFrame, None, None]:
689
+ """
690
+ Yield DataFrames in batches without loading all data into memory.
691
+
692
+ This is memory-efficient: only batch_size rows are in memory at a time.
693
+ Uses PyArrow's batch iteration for efficient streaming.
694
+
695
+ Use this when NO sorting is needed. For sorted batches, use
696
+ iter_globally_sorted_batches().
697
+
698
+ Args:
699
+ batch_size: Number of rows per batch (default: 10,000)
700
+ schema: Schema for struct decoding and ObjectId reconstruction
701
+ time_field: Name of time field for date filtering
702
+ start_date: Filter data from this date (inclusive, tz-aware)
703
+ end_date: Filter data until this date (exclusive, tz-aware)
704
+ coerce: Error handling mode ("raise" or "error")
705
+
706
+ Yields:
707
+ pd.DataFrame: Batches of processed rows
708
+
709
+ Example:
710
+ >>> for batch_df in reader.iter_dataframe_batches(batch_size=5000):
711
+ ... process(batch_df)
712
+ """
713
+ import pyarrow.parquet as pq
714
+
715
+ batch_count = 0
716
+ total_rows = 0
717
+
718
+ for parquet_file in self.parquet_files:
719
+ try:
720
+ # Open parquet file for batch iteration
721
+ parquet_file_obj = pq.ParquetFile(parquet_file)
722
+
723
+ for batch in parquet_file_obj.iter_batches(batch_size=batch_size):
724
+ # Convert Arrow batch to pandas
725
+ batch_df = batch.to_pandas()
726
+
727
+ # Apply date filter if specified
728
+ # (in case predicate pushdown not supported)
729
+ if time_field and (start_date or end_date):
730
+ if time_field in batch_df.columns:
731
+ if start_date:
732
+ batch_df = batch_df[batch_df[time_field] >= start_date]
733
+ if end_date:
734
+ batch_df = batch_df[batch_df[time_field] < end_date]
735
+
736
+ if len(batch_df) == 0:
737
+ continue
738
+
739
+ # Process the batch (decode structs, flatten, reconstruct ObjectIds)
740
+ processed_df = self._process_dataframe(
741
+ batch_df, "pandas", schema, coerce
742
+ )
743
+
744
+ batch_count += 1
745
+ total_rows += len(processed_df)
746
+
747
+ yield processed_df
748
+
749
+ except Exception as e:
750
+ if coerce == "error":
751
+ logger.error(f"Error reading batch from {parquet_file}: {e}")
752
+ continue
753
+ raise
754
+
755
+ logger.debug(f"Yielded {batch_count} batches, {total_rows} total rows")
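One possible way to consume the unsorted batch stream while keeping memory flat (the `value` column and the running sum are only an example; `reader` and `schema` as above):

    rows = 0
    running_sum = 0.0
    for batch_df in reader.iter_dataframe_batches(batch_size=5_000, schema=schema):
        rows += len(batch_df)
        if "value" in batch_df.columns:  # hypothetical numeric column
            running_sum += pd.to_numeric(batch_df["value"], errors="coerce").sum()
    print(rows, running_sum)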
756
+
757
+ def get_globally_sorted_dataframe(
758
+ self,
759
+ sort_spec: List[Tuple[str, int]],
760
+ schema: Optional[Any] = None,
761
+ time_field: Optional[str] = None,
762
+ start_date: Optional[datetime] = None,
763
+ end_date: Optional[datetime] = None,
764
+ coerce: Literal["raise", "error"] = "raise",
765
+ memory_limit_mb: Optional[int] = None,
766
+ threads: Optional[int] = None,
767
+ ) -> pd.DataFrame:
768
+ """
769
+ Return entire globally sorted DataFrame using DuckDB K-way merge.
770
+
771
+ More efficient than iter_globally_sorted_batches() when you want
772
+ the full result, as it avoids batch iteration overhead and just
773
+ fetches all rows at once.
774
+         Use iter_globally_sorted_batches() when streaming is required.
775
+
776
+ Args:
777
+ sort_spec: Sort specification as [(field, direction), ...]
778
+ schema: Schema for ObjectId reconstruction and advanced sorting
779
+ time_field: Field for date filtering
780
+ start_date: Filter data from this date (inclusive, tz-aware)
781
+ end_date: Filter data until this date (exclusive, tz-aware)
782
+ coerce: Error handling mode
783
+ memory_limit_mb: DuckDB memory limit
784
+ threads: DuckDB thread count
785
+
786
+ Returns:
787
+ pd.DataFrame: Complete sorted DataFrame
788
+ """
789
+ if not self.parquet_files:
790
+ return pd.DataFrame()
791
+
792
+ # Expand parent fields to children in schema definition order
793
+ sort_spec = self._expand_parent_sort_fields(sort_spec, schema)
794
+
795
+ # Get list of parquet files
796
+ file_paths = [str(f) for f in self.parquet_files]
797
+
798
+ logger.debug(
799
+ f"DuckDB K-way merge (full): {len(file_paths)} files, sort_spec={sort_spec}"
800
+ )
801
+
802
+ try:
803
+ # Create DuckDB connection
804
+ conn = duckdb.connect(":memory:")
805
+
806
+ # Configure DuckDB to use allocated resources
807
+ if memory_limit_mb:
808
+ conn.execute(f"SET memory_limit = '{memory_limit_mb}MB'")
809
+ logger.info(f"DuckDB memory_limit set to {memory_limit_mb} MB")
810
+
811
+ if threads:
812
+ conn.execute(f"SET threads = {threads}")
813
+ logger.info(f"DuckDB threads set to {threads}")
814
+
815
+ # Build ORDER BY with MongoDB type ordering
816
+ # (same logic as iter_globally_sorted_batches)
817
+ order_clauses = []
818
+ for field_name, direction in sort_spec:
819
+ dir_sql = "ASC" if direction == 1 else "DESC"
820
+ if schema and schema.has_field(field_name):
821
+ field_type = schema.get_field_type(field_name)
822
+ else:
823
+ field_type = None
824
+ is_any = self._is_any_type(field_type) if field_type else True
825
+
826
+ if is_any:
827
+ # Complete MongoDB type ordering for Any() fields
828
+ type_clause = f"""CASE
829
+ WHEN "{field_name}" IS NULL OR "{field_name}".null_value IS TRUE
830
+ THEN 0
831
+ WHEN "{field_name}".float_value IS NOT NULL
832
+ OR "{field_name}".int32_value IS NOT NULL
833
+ OR "{field_name}".int64_value IS NOT NULL
834
+ OR "{field_name}".decimal128_value IS NOT NULL
835
+ THEN 1
836
+ WHEN "{field_name}".string_value IS NOT NULL THEN 2
837
+ WHEN "{field_name}".document_value IS NOT NULL THEN 3
838
+ WHEN "{field_name}".array_value IS NOT NULL THEN 4
839
+ WHEN "{field_name}".binary_value IS NOT NULL THEN 5
840
+ WHEN "{field_name}".objectid_value IS NOT NULL THEN 6
841
+ WHEN "{field_name}".bool_value IS NOT NULL THEN 7
842
+ WHEN "{field_name}".datetime_value IS NOT NULL THEN 8
843
+ WHEN "{field_name}".regex_value IS NOT NULL THEN 9
844
+ ELSE 10
845
+ END {dir_sql}"""
846
+
847
+ # Value comparisons for each type
848
+ num_clause = (
849
+ f'COALESCE("{field_name}".float_value, '
850
+ f'CAST("{field_name}".int32_value AS DOUBLE), '
851
+ f'CAST("{field_name}".int64_value AS DOUBLE)) {dir_sql}'
852
+ )
853
+ str_clause = f'"{field_name}".string_value {dir_sql}'
854
+ doc_clause = f'"{field_name}".document_value {dir_sql}'
855
+ arr_clause = f'"{field_name}".array_value {dir_sql}'
856
+ bin_clause = f'"{field_name}".binary_value {dir_sql}'
857
+ oid_clause = f'"{field_name}".objectid_value {dir_sql}'
858
+ bool_clause = f'"{field_name}".bool_value {dir_sql}'
859
+ date_clause = f'"{field_name}".datetime_value {dir_sql}'
860
+ regex_clause = f'"{field_name}".regex_value {dir_sql}'
861
+
862
+ order_clauses.extend(
863
+ [
864
+ type_clause,
865
+ num_clause,
866
+ str_clause,
867
+ doc_clause,
868
+ arr_clause,
869
+ bin_clause,
870
+ oid_clause,
871
+ bool_clause,
872
+ date_clause,
873
+ regex_clause,
874
+ ]
875
+ )
876
+ else:
877
+ # Simple field - use direct comparison
878
+ order_clauses.append(f'"{field_name}" {dir_sql}')
879
+
880
+ order_by = ", ".join(order_clauses)
881
+ files = ", ".join([f"'{f}'" for f in file_paths])
882
+ query = f"SELECT * FROM read_parquet([{files}]) ORDER BY {order_by}"
883
+
884
+ print(f"[DuckDB] K-way merge (full): {len(file_paths)} files")
885
+
886
+ # Fetch entire result at once using df()
887
+ df = conn.execute(query).df()
888
+
889
+ # Ensure time field is UTC
890
+ if time_field and time_field in df.columns:
891
+ if pd.api.types.is_datetime64_any_dtype(df[time_field]):
892
+ if df[time_field].dt.tz is not None:
893
+ df[time_field] = df[time_field].dt.tz_convert("UTC")
894
+ else:
895
+ df[time_field] = df[time_field].dt.tz_localize("UTC")
896
+
897
+ # Apply date filtering if needed
898
+ if time_field and (start_date or end_date):
899
+ if start_date:
900
+ df = df[df[time_field] >= start_date]
901
+ if end_date:
902
+ df = df[df[time_field] < end_date]
903
+
904
+ # Process the DataFrame (decode structs, reconstruct ObjectIds)
905
+ df = self._process_dataframe(df, "pandas", schema, coerce)
906
+
907
+ conn.close()
908
+ print(f"[DuckDB] K-way merge complete: {len(df):,} rows")
909
+ logger.debug(f"DuckDB K-way merge complete: {len(df):,} rows")
910
+
911
+ return df
912
+
913
+ except Exception as e:
914
+ logger.error(f"DuckDB K-way merge failed: {e}")
915
+ raise
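For a plain (non-Any) sort field the query built above reduces to a single ORDER BY across all files; a tiny self-contained DuckDB equivalent using throwaway files:

    import duckdb
    import pandas as pd

    # Two tiny Parquet files so the merge has something to read.
    pd.DataFrame({"timestamp": [3, 1]}).to_parquet("f1.parquet")
    pd.DataFrame({"timestamp": [2, 4]}).to_parquet("f2.parquet")

    conn = duckdb.connect(":memory:")
    files = ", ".join(f"'{p}'" for p in ["f1.parquet", "f2.parquet"])
    query = f'SELECT * FROM read_parquet([{files}]) ORDER BY "timestamp" ASC'
    print(conn.execute(query).df()["timestamp"].tolist())  # [1, 2, 3, 4]
    conn.close()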
916
+
917
+ def _expand_parent_sort_fields(
918
+ self, sort_spec: List[Tuple[str, int]], schema: Optional[Any]
919
+ ) -> List[Tuple[str, int]]:
920
+ """
921
+ Expand parent field sorts to their child fields in schema definition order.
922
+
923
+ When user sorts by a parent field like "metadata" but the schema has
924
+ flattened fields like "metadata.device_id", expand to all children.
925
+
926
+ Args:
927
+ sort_spec: Original [(field, direction), ...]
928
+ schema: XLR8 schema with field definitions
929
+
930
+ Returns:
931
+ Expanded sort spec with parent fields replaced by children
932
+
933
+ Raises:
934
+ ValueError: If field not found and no children exist
935
+ """
936
+ if schema is None:
937
+ return sort_spec
938
+
939
+ expanded = []
940
+ # Schema.fields preserves insertion order (Python 3.7+)
941
+ all_fields = list(schema.fields.keys())
942
+
943
+ for field_name, direction in sort_spec:
944
+ if schema.has_field(field_name):
945
+ # Field exists directly in schema
946
+ expanded.append((field_name, direction))
947
+ else:
948
+ # Look for child fields with this prefix (in schema order)
949
+ prefix = f"{field_name}."
950
+ children = [f for f in all_fields if f.startswith(prefix)]
951
+
952
+ if children:
953
+ logger.info(
954
+ f"Sort field '{field_name}' expanded to children "
955
+ f"(schema order): {children}"
956
+ )
957
+ for child in children:
958
+ expanded.append((child, direction))
959
+ else:
960
+ raise ValueError(
961
+ f"Sort field '{field_name}' not found in schema "
962
+ f"and has no child fields. "
963
+ f"Available fields: {sorted(all_fields)[:10]}"
964
+ + ("..." if len(all_fields) > 10 else "")
965
+ )
966
+
967
+ return expanded
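To make the expansion rule concrete, a condensed restatement with a toy schema stand-in (happy path only; the real schema class provides has_field/fields):

    class _ToySchema:
        # Flattened dotted names in definition order, mimicking schema.fields.
        fields = {"timestamp": None, "metadata.device_id": None, "metadata.sensor_id": None}

        def has_field(self, name):
            return name in self.fields

    schema = _ToySchema()
    expanded = []
    for field, direction in [("metadata", -1)]:
        if schema.has_field(field):
            expanded.append((field, direction))
        else:
            expanded += [(f, direction) for f in schema.fields if f.startswith(field + ".")]
    print(expanded)  # [('metadata.device_id', -1), ('metadata.sensor_id', -1)]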
968
+
969
+ def iter_globally_sorted_batches(
970
+ self,
971
+ sort_field: Optional[str] = None,
972
+ ascending: bool = True,
973
+ batch_size: int = DEFAULT_BATCH_SIZE,
974
+ schema: Optional[Any] = None,
975
+ time_field: Optional[str] = None,
976
+ start_date: Optional[datetime] = None,
977
+ end_date: Optional[datetime] = None,
978
+ coerce: Literal["raise", "error"] = "raise",
979
+ sort_spec: Optional[List[Tuple[str, int]]] = None,
980
+ # DuckDB configuration
981
+ memory_limit_mb: Optional[int] = None,
982
+ threads: Optional[int] = None,
983
+ ) -> Generator[pd.DataFrame, None, None]:
984
+ """
985
+ Yield globally sorted batches using DuckDB K-way merge.
986
+
987
+ This method reads all Parquet files in the cache directory and
988
+         yields batches in globally sorted order. Uses a DuckDB K-way merge
989
+         with MongoDB BSON type ordering for compatible sort semantics.
990
+
991
+ Supports advanced sorting:
992
+ - Parent fields (e.g., "metadata" expands to all child fields)
993
+ - Types.Any() with full MongoDB BSON type ordering (Objects, Arrays, Binary)
994
+
995
+ RAM Usage:
996
+ O(K × batch_size) where K = number of files.
997
+ Already handled by flush_ram_limit_mb.
998
+
999
+ Args:
1000
+ sort_field: Field to sort by (use sort_spec for multi-field sorting).
1001
+ ascending: Sort direction (use sort_spec for mixed directions).
1002
+ batch_size: Number of rows per yielded DataFrame (default: 10,000)
1003
+ schema: Schema for ObjectId reconstruction and advanced sorting
1004
+ time_field: Field for date filtering (usually same as sort_field)
1005
+ start_date: Filter data from this date (inclusive, tz-aware)
1006
+ end_date: Filter data until this date (exclusive, tz-aware)
1007
+ coerce: Error handling mode ("raise" or "error")
1008
+ sort_spec: Sort specification as [(field, direction), ...] where
1009
+ direction is 1 (ASC) or -1 (DESC). Preferred over sort_field.
1010
+
1011
+ Yields:
1012
+ pd.DataFrame: Batches in globally sorted order
1013
+
1014
+ Example:
1015
+ >>> reader = ParquetReader(".cache/abc123def")
1016
+ >>> # Simple sort
1017
+ >>> for batch in reader.iter_globally_sorted_batches(
1018
+ ... sort_spec=[("timestamp", 1)],
1019
+ ... schema=schema,
1020
+ ... batch_size=10_000
1021
+ ... ):
1022
+ ... process(batch)
1023
+ >>>
1024
+ >>> # Advanced: parent field + Any type
1025
+ >>> for batch in reader.iter_globally_sorted_batches(
1026
+ ... sort_spec=[("metadata", -1), ("value", 1)],
1027
+ ... schema=schema,
1028
+ ... ):
1029
+ ... process(batch)
1030
+ """
1031
+
1032
+ if not self.parquet_files:
1033
+ return
1034
+
1035
+ # Handle backwards compatibility
1036
+ if sort_spec is None and sort_field is not None:
1037
+ direction = 1 if ascending else -1
1038
+ sort_spec = [(sort_field, direction)]
1039
+
1040
+ if sort_spec is None:
1041
+ raise ValueError("sort_spec or sort_field is required")
1042
+
1043
+ # Expand parent fields to children in schema definition order
1044
+ sort_spec = self._expand_parent_sort_fields(sort_spec, schema)
1045
+
1046
+ # Get list of parquet files
1047
+ file_paths = [str(f) for f in self.parquet_files]
1048
+
1049
+ logger.debug(
1050
+ f"DuckDB K-way merge: {len(file_paths)} files, sort_spec={sort_spec}"
1051
+ )
1052
+
1053
+ try:
1054
+ # Create DuckDB connection
1055
+ conn = duckdb.connect(":memory:")
1056
+
1057
+ # Configure DuckDB to use allocated resources
1058
+ if memory_limit_mb:
1059
+ conn.execute(f"SET memory_limit = '{memory_limit_mb}MB'")
1060
+ logger.info(f"DuckDB memory_limit set to {memory_limit_mb} MB")
1061
+
1062
+ if threads:
1063
+ conn.execute(f"SET threads = {threads}")
1064
+ logger.info(f"DuckDB threads set to {threads}")
1065
+
1066
+ # Query DuckDB settings to verify
1067
+ memory_result = conn.execute(
1068
+ "SELECT current_setting('memory_limit')"
1069
+ ).fetchone()
1070
+ actual_memory = memory_result[0] if memory_result else "unknown"
1071
+ threads_result = conn.execute(
1072
+ "SELECT current_setting('threads')"
1073
+ ).fetchone()
1074
+ actual_threads = threads_result[0] if threads_result else "unknown"
1075
+ logger.debug(
1076
+ f"DuckDB configured: memory={actual_memory}, threads={actual_threads}"
1077
+ )
1078
+
1079
+ # Build ORDER BY with MongoDB type ordering
1080
+ order_clauses = []
1081
+ for field_name, direction in sort_spec:
1082
+ dir_sql = "ASC" if direction == 1 else "DESC"
1083
+ # Check if field exists in schema before getting type
1084
+ if schema and schema.has_field(field_name):
1085
+ field_type = schema.get_field_type(field_name)
1086
+ else:
1087
+ field_type = None
1088
+ is_any = self._is_any_type(field_type) if field_type else True
1089
+
1090
+ if is_any:
1091
+ # Complete MongoDB type ordering for Any() fields:
1092
+ # Reference: https://www.mongodb.com/docs/manual/reference/bson-type-comparison-order/
1093
+ # 1. MinKey (internal)
1094
+ # 2. Null
1095
+ # 3. Numbers (int, long, double, decimal)
1096
+ # 4. Symbol, String
1097
+ # 5. Object
1098
+ # 6. Array
1099
+ # 7. BinData
1100
+ # 8. ObjectId
1101
+ # 9. Boolean
1102
+ # 10. Date
1103
+ # 11. Timestamp
1104
+ # 12. Regular Expression
1105
+ # 13. MaxKey (internal)
1106
+
1107
+ # Type priority clause
1108
+ type_clause = f"""CASE
1109
+ WHEN "{field_name}" IS NULL OR "{field_name}".null_value IS TRUE
1110
+ THEN 0
1111
+ WHEN "{field_name}".float_value IS NOT NULL
1112
+ OR "{field_name}".int32_value IS NOT NULL
1113
+ OR "{field_name}".int64_value IS NOT NULL
1114
+ OR "{field_name}".decimal128_value IS NOT NULL
1115
+ THEN 1
1116
+ WHEN "{field_name}".string_value IS NOT NULL THEN 2
1117
+ WHEN "{field_name}".document_value IS NOT NULL THEN 3
1118
+ WHEN "{field_name}".array_value IS NOT NULL THEN 4
1119
+ WHEN "{field_name}".binary_value IS NOT NULL THEN 5
1120
+ WHEN "{field_name}".objectid_value IS NOT NULL THEN 6
1121
+ WHEN "{field_name}".bool_value IS NOT NULL THEN 7
1122
+ WHEN "{field_name}".datetime_value IS NOT NULL THEN 8
1123
+ WHEN "{field_name}".regex_value IS NOT NULL THEN 9
1124
+ ELSE 10
1125
+ END {dir_sql}"""
1126
+
1127
+ # Value comparisons for each type
1128
+ num_clause = (
1129
+ f'COALESCE("{field_name}".float_value, '
1130
+ f'CAST("{field_name}".int32_value AS DOUBLE), '
1131
+ f'CAST("{field_name}".int64_value AS DOUBLE)) {dir_sql}'
1132
+ )
1133
+ str_clause = f'"{field_name}".string_value {dir_sql}'
1134
+ # JSON strings compare lexicographically
1135
+ doc_clause = f'"{field_name}".document_value {dir_sql}'
1136
+ # JSON arrays compare lexicographically
1137
+ arr_clause = f'"{field_name}".array_value {dir_sql}'
1138
+ bin_clause = f'"{field_name}".binary_value {dir_sql}'
1139
+ oid_clause = f'"{field_name}".objectid_value {dir_sql}'
1140
+ bool_clause = f'"{field_name}".bool_value {dir_sql}'
1141
+ date_clause = f'"{field_name}".datetime_value {dir_sql}'
1142
+ regex_clause = f'"{field_name}".regex_value {dir_sql}'
1143
+
1144
+ order_clauses.extend(
1145
+ [
1146
+ type_clause,
1147
+ num_clause,
1148
+ str_clause,
1149
+ doc_clause,
1150
+ arr_clause,
1151
+ bin_clause,
1152
+ oid_clause,
1153
+ bool_clause,
1154
+ date_clause,
1155
+ regex_clause,
1156
+ ]
1157
+ )
1158
+ else:
1159
+ # Simple field - use direct comparison
1160
+ order_clauses.append(f'"{field_name}" {dir_sql}')
1161
+
1162
+ order_by = ", ".join(order_clauses)
1163
+ files = ", ".join([f"'{f}'" for f in file_paths])
1164
+ query = f"SELECT * FROM read_parquet([{files}]) ORDER BY {order_by}"
1165
+
1166
+ result = conn.execute(query)
1167
+
1168
+ # Use fetchmany() cursor API - this ACTUALLY streams incrementally
1169
+ # without loading all data into memory (unlike fetch_df_chunk)
1170
+ # NOTE: DuckDB's k-way merge uses internal buffering
1171
+ # separate from batch_size.
1172
+ # batch_size only controls how much we pull at once,
1173
+ # not DuckDB's merge buffer.
1174
+ batch_count = 0
1175
+ total_rows = 0
1176
+ column_names = [desc[0] for desc in result.description]
1177
+
1178
+ print(
1179
+ f"[DuckDB] K-way merge started: {len(file_paths)} files, "
1180
+ f"batch_size={batch_size:,}"
1181
+ )
1182
+
1183
+ while True:
1184
+ # Fetch batch as list of tuples
1185
+ rows = result.fetchmany(batch_size)
1186
+ if not rows:
1187
+ break
1188
+
1189
+ batch_count += 1
1190
+ total_rows += len(rows)
1191
+
1192
+ # Convert to DataFrame
1193
+ batch_df = pd.DataFrame(rows, columns=column_names)
1194
+ logger.debug(
1195
+ f"Streamed batch {batch_count}: {len(batch_df)} rows "
1196
+ f"from DuckDB K-way merge"
1197
+ )
1198
+
1199
+ # Ensure time field is UTC (DuckDB might return naive)
1200
+ if time_field and time_field in batch_df.columns:
1201
+ if pd.api.types.is_datetime64_any_dtype(batch_df[time_field]):
1202
+ if batch_df[time_field].dt.tz is not None:
1203
+ batch_df[time_field] = batch_df[time_field].dt.tz_convert(
1204
+ "UTC"
1205
+ )
1206
+ else:
1207
+ batch_df[time_field] = batch_df[time_field].dt.tz_localize(
1208
+ "UTC"
1209
+ )
1210
+
1211
+ # Apply date filtering if needed
1212
+ if time_field and (start_date or end_date):
1213
+ if start_date:
1214
+ batch_df = batch_df[batch_df[time_field] >= start_date]
1215
+ if end_date:
1216
+ batch_df = batch_df[batch_df[time_field] < end_date]
1217
+ if len(batch_df) == 0:
1218
+ continue
1219
+
1220
+ # Process the batch (decode structs, reconstruct ObjectIds)
1221
+ processed_df = self._process_dataframe(
1222
+ batch_df, "pandas", schema, coerce
1223
+ )
1224
+ yield processed_df
1225
+
1226
+ conn.close()
1227
+ logger.debug("DuckDB K-way merge complete")
1228
+
1229
+ except Exception as e:
1230
+ if coerce == "error":
1231
+ logger.error(f"Error in globally sorted streaming: {e}")
1232
+ return
1233
+ raise
1234
+
1235
+ def get_statistics(self) -> Dict[str, Any]:
1236
+ """
1237
+ Get statistics about cached data.
1238
+
1239
+ Returns:
1240
+ Dict with file count, total rows, size, schema info
1241
+ """
1242
+ total_rows = 0
1243
+ total_size = 0
1244
+ schema = None
1245
+
1246
+ for parquet_file in self.parquet_files:
1247
+ # File size
1248
+ total_size += parquet_file.stat().st_size
1249
+
1250
+ # Read metadata
1251
+ parquet_meta = pq.read_metadata(parquet_file)
1252
+ total_rows += parquet_meta.num_rows
1253
+
1254
+ # Get schema from first file
1255
+ if schema is None:
1256
+ schema = parquet_meta.schema.to_arrow_schema()
1257
+
1258
+ return {
1259
+ "file_count": len(self.parquet_files),
1260
+ "total_rows": total_rows,
1261
+ "total_size_mb": round(total_size / (1024 * 1024), 2),
1262
+ "schema_fields": [field.name for field in schema] if schema else [],
1263
+ "cache_dir": str(self.cache_dir),
1264
+ }
1265
+
1266
+ def __repr__(self) -> str:
1267
+ stats = self.get_statistics()
1268
+ return (
1269
+ f"ParquetReader(files={stats['file_count']}, "
1270
+ f"rows={stats['total_rows']:,}, "
1271
+ f"size={stats['total_size_mb']:.1f}MB)"
1272
+ )
1273
+
1274
+ def __len__(self) -> int:
1275
+ """Return total number of rows across all files."""
1276
+ return self.get_statistics()["total_rows"]