xlr8 0.1.7b3__cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2161 @@
1
+ """
2
+ XLR8 cursor with PyMongo compatibility.
3
+
4
+ ================================================================================
5
+ DATA FLOW - CURSOR (WHERE ACCELERATION HAPPENS)
6
+ ================================================================================
7
+
8
+ This module is where the magic happens. When the user calls cursor.to_dataframe(),
9
+ we decide whether to:
10
+ A) Use regular PyMongo iteration (slow)
11
+ B) Use accelerated parallel fetch + Parquet caching (fast)
12
+
13
+ DECISION FLOW:
14
+ ────────────────────────────────────────────────────────────────────────────────
15
+
16
+ cursor.to_dataframe() called
17
+
18
+
19
+ ┌─────────────────────────────┐
20
+ │ Is schema provided? │─── No ──▶ REGULAR PATH (PyMongo iteration)
21
+ └─────────────────────────────┘
22
+ │ Yes
23
+
24
+ ┌─────────────────────────────┐
25
+ │ Is query chunkable? │─── No ──▶ REGULAR PATH
26
+ │ (has time range, no │ (e.g., has $where or nested $or)
27
+ │ forbidden operators) │
28
+ └─────────────────────────────┘
29
+ │ Yes
30
+
31
+ ┌─────────────────────────────┐
32
+ │ Is data in cache? │─── Yes ─▶ READ FROM CACHE
33
+ │ (.cache/{query_hash}/*.parquet) (instant, ~100ms for 1M rows)
34
+ └─────────────────────────────┘
35
+ │ No
36
+
37
+ ┌─────────────────────────────┐
38
+ │ ACCELERATED PATH: │
39
+ │ 1. Build brackets │ ← analysis/brackets.py
40
+ │ 2. Plan execution │ ← execution/planner.py
41
+ │ 3. Chunk time ranges │ ← analysis/chunker.py
42
+ │ 4. Parallel async fetch │ ← Rust backend (fetch_chunks_bson)
43
+ │ 5. Stream to Parquet │ ← Rust backend writes shards
44
+ │ 6. Read back DataFrame │ ← storage/reader.py
45
+ └─────────────────────────────┘
46
+
47
+ EXAMPLE DATA TRANSFORMATIONS:
48
+ ────────────────────────────────────────────────────────────────────────────────
49
+
50
+ 1. INPUT QUERY (from user):
51
+ {
52
+ "$or": [
53
+ {"metadata.sensor_id": ObjectId("64a...")},
54
+ {"metadata.sensor_id": ObjectId("64b...")},
55
+ ],
56
+ "timestamp": {"$gte": datetime(2024, 1, 1), "$lt": datetime(2024, 7, 1)}
57
+ }
58
+
59
+ 2. AFTER BRACKET ANALYSIS (brackets.py):
60
+ [
61
+ Bracket(static={"metadata.sensor_id": "64a..."}, time=Jan-Jul),
62
+ Bracket(static={"metadata.sensor_id": "64b..."}, time=Jan-Jul),
63
+ ]
64
+
65
+ 3. AFTER CHUNKING (for each bracket):
66
+ Bracket 1 -> 13 chunks (14 days each for 6 months)
67
+ Bracket 2 -> 13 chunks
68
+ Total: 26 work items in queue
69
+
70
+ 4. PARALLEL FETCH (10 workers):
71
+ Worker 0: Chunk 1 -> 45,000 docs, write to part_0000.parquet
72
+ Worker 1: Chunk 2 -> 52,000 docs, write to part_0001.parquet
73
+ ...
74
+ Worker 9: Chunk 10 -> 38,000 docs, write to part_0009.parquet
75
+ (Rust async workers pull chunks as they finish)
76
+
77
+ 5. OUTPUT (DataFrame):
78
+ pandas.DataFrame with columns: [timestamp, metadata.device_id, value, ...]
79
+ 500,000 rows loaded from Parquet in ~0.5s
80
+
81
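+ TYPICAL CALL (illustrative sketch; assumes the collection was wrapped with a
+ schema via xlr8.wrap(collection, schema=...), and sensor_a / sensor_b stand in
+ for real ObjectId values):
+
+     cursor = collection.find({
+         "$or": [
+             {"metadata.sensor_id": sensor_a},
+             {"metadata.sensor_id": sensor_b},
+         ],
+         "timestamp": {"$gte": datetime(2024, 1, 1), "$lt": datetime(2024, 7, 1)},
+     })
+     df = cursor.to_dataframe(
+         chunking_granularity=timedelta(days=14),  # enables the parallel fetch path
+         max_workers=10,
+     )
+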
+ ================================================================================
82
+ """
83
+
84
+ from __future__ import annotations
85
+
86
+ from typing import (
87
+ Any,
88
+ Callable,
89
+ Dict,
90
+ List,
91
+ Optional,
92
+ Union,
93
+ Iterator,
94
+ Literal,
95
+ Generator,
96
+ cast,
97
+ )
98
+ from datetime import datetime, date, timezone, timedelta
99
+ import logging
100
+ import warnings
101
+ import pandas as pd
102
+ import time
103
+ import pyarrow as pa
104
+ import polars as pl
105
+
106
+ logger = logging.getLogger(__name__)
107
+
108
+ # Import after logger to avoid circular imports
109
+ from xlr8.constants import DEFAULT_BATCH_SIZE
110
+ from xlr8.execution.callback import execute_partitioned_callback
111
+ from xlr8.analysis import (
112
+ build_brackets_for_find,
113
+ chunk_time_range,
114
+ get_sort_field_info,
115
+ validate_sort_field,
116
+ )
117
+ from xlr8.schema.types import Any as AnyType, List as ListType
118
+ from xlr8.storage import CacheManager, ParquetReader
119
+ from xlr8.execution import execute_parallel_stream_to_cache
120
+
121
+
122
+ def parse_datetime_tz_aware(
123
+ value: Union[datetime, date, str, None],
124
+ param_name: str = "date",
125
+ ) -> Optional[datetime]:
126
+ """
127
+ Parse a date/datetime value to a timezone-aware datetime.
128
+
129
+ Accepts:
130
+ - datetime (must be tz-aware or will assume UTC)
131
+ - date (converted to midnight UTC)
132
+ - ISO format string with timezone (e.g., "2024-01-15T10:30:00Z", "2024-01-15T10:30:00+00:00")
133
+
134
+ Args:
135
+ value: The date value to parse
136
+ param_name: Name of parameter for error messages
137
+
138
+ Returns:
139
+ Timezone-aware datetime or None if value is None
140
+
141
+ Raises:
142
+ ValueError: If string is not a valid ISO format or missing timezone
143
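+
+     Example (illustrative; outputs shown as Python reprs):
+         >>> parse_datetime_tz_aware("2024-01-15T10:30:00Z")
+         datetime.datetime(2024, 1, 15, 10, 30, tzinfo=datetime.timezone.utc)
+         >>> parse_datetime_tz_aware(date(2024, 1, 15))
+         datetime.datetime(2024, 1, 15, 0, 0, tzinfo=datetime.timezone.utc)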
+ """
144
+ if value is None:
145
+ return None
146
+
147
+ if isinstance(value, datetime):
148
+ if value.tzinfo is None:
149
+ # Assume UTC for naive datetimes
150
+ return value.replace(tzinfo=timezone.utc)
151
+ return value
152
+
153
+ if isinstance(value, date):
154
+ # Convert date to midnight UTC
155
+ return datetime(value.year, value.month, value.day, tzinfo=timezone.utc)
156
+
157
+ if isinstance(value, str):
158
+ # Try parsing ISO format
159
+ try:
160
+ # Python 3.11+ has datetime.fromisoformat with better Z support
161
+ # For compatibility, handle Z suffix manually
162
+ if value.endswith("Z"):
163
+ value = value[:-1] + "+00:00"
164
+
165
+ dt = datetime.fromisoformat(value)
166
+
167
+ if dt.tzinfo is None:
168
+ raise ValueError(
169
+ f"{param_name}: Timezone-aware datetime required. "
170
+ f"Got '{value}' without timezone. "
171
+ f"Use ISO format with timezone like '2024-01-15T10:30:00Z' or '2024-01-15T10:30:00+00:00'"
172
+ )
173
+ return dt
174
+ except ValueError as e:
175
+ if "Timezone-aware" in str(e):
176
+ raise
177
+ raise ValueError(
178
+ f"{param_name}: Invalid datetime string '{value}'. "
179
+ f"Use ISO format with timezone like '2024-01-15T10:30:00Z' or '2024-01-15T10:30:00+00:00'"
180
+ ) from e
181
+
182
+ raise TypeError(
183
+ f"{param_name}: Expected datetime, date, or ISO string, got {type(value).__name__}"
184
+ )
185
+
186
+
187
+ class XLR8Cursor:
188
+ """
189
+ PyMongo-compatible cursor with acceleration support.
190
+
191
+ Acts as drop-in replacement for pymongo.cursor.Cursor but can
192
+ accelerate queries through parallel execution and Parquet caching.
193
+
194
+ Key differences from PyMongo:
195
+ - to_dataframe() / to_polars() for efficient DataFrame conversion
196
+ - Transparent acceleration when query is chunkable
197
+ - Maintains full PyMongo API compatibility for iteration
198
+
199
+ Example:
200
+ >>> cursor = collection.find({"timestamp": {"$gte": start, "$lt": end}})
201
+ >>> df = cursor.to_dataframe() # Accelerated execution
202
+ >>>
203
+ >>> # Or use like regular PyMongo cursor:
204
+ >>> for doc in cursor:
205
+ ... logging.debug(doc)
206
+ """
207
+
208
+ def __init__(
209
+ self,
210
+ collection: Any, # XLR8Collection
211
+ query_filter: Dict[str, Any],
212
+ projection: Optional[Dict[str, Any]] = None,
213
+ skip: int = 0,
214
+ limit: int = 0,
215
+ sort: Optional[List[tuple]] = None,
216
+ batch_size: int = 1000,
217
+ **kwargs: Any,
218
+ ):
219
+ """
220
+ Initialize cursor.
221
+
222
+ Args:
223
+ collection: Parent XLR8Collection
224
+ query_filter: Query filter dict
225
+ projection: Field projection dict
226
+ skip: Number of documents to skip
227
+ limit: Maximum documents to return (0 = unlimited)
228
+ sort: List of (field, direction) tuples
229
+ batch_size: Batch size for iteration
230
+ **kwargs: Additional PyMongo cursor options (no_cursor_timeout,
231
+ cursor_type, collation, hint, max_time_ms, etc.)
232
+ These are passed through to PyMongo when iterating.
233
+ """
234
+ self._collection = collection
235
+ self._filter = query_filter
236
+ self._projection = projection
237
+ self._skip = skip
238
+ self._limit = limit
239
+ self._sort = sort
240
+ self._batch_size = batch_size
241
+ self._cursor_kwargs = kwargs # Store all additional PyMongo options
242
+
243
+ # Iteration state
244
+ self._started = False
245
+ self._pymongo_cursor: Optional[Any] = None
246
+ self._exhausted = False
247
+
248
+ def __iter__(self) -> Iterator[Dict[str, Any]]:
249
+ """Iterate over documents."""
250
+ if not self._started:
251
+ self._started = True
252
+ # Create actual PyMongo cursor for iteration
253
+ self._ensure_pymongo_cursor()
254
+
255
+ if self._pymongo_cursor is None:
256
+ return iter([])
257
+
258
+ return iter(self._pymongo_cursor)
259
+
260
+ def __next__(self) -> Dict[str, Any]:
261
+ """Get next document."""
262
+ if not self._started:
263
+ self.__iter__()
264
+
265
+ if self._pymongo_cursor is None:
266
+ raise StopIteration
267
+
268
+ return next(self._pymongo_cursor)
269
+
270
+ def _ensure_pymongo_cursor(self) -> None:
271
+ """Lazily create PyMongo cursor only when needed for iteration/delegation."""
272
+ if self._pymongo_cursor is None:
273
+ self._pymongo_cursor = self._collection.pymongo_collection.find(
274
+ filter=self._filter,
275
+ projection=self._projection,
276
+ skip=self._skip,
277
+ limit=self._limit,
278
+ sort=self._sort,
279
+ batch_size=self._batch_size,
280
+ **self._cursor_kwargs, # Pass through all PyMongo cursor options
281
+ )
282
+
283
+ def raw_cursor(self):
284
+ """
285
+ Get direct access to underlying PyMongo cursor.
286
+
287
+ This is an escape hatch for power users who need access to PyMongo cursor
288
+ methods not explicitly implemented in XLR8Cursor.
289
+
290
+ Returns:
291
+ pymongo.cursor.Cursor: The underlying PyMongo cursor
292
+
293
+ Example:
294
+ >>> cursor = collection.find(...)
295
+ >>> cursor.raw_cursor().comment("my query").max_time_ms(5000)
296
+ """
297
+ self._ensure_pymongo_cursor()
298
+ return self._pymongo_cursor
299
+
300
+ def __getattr__(self, name: str) -> Any:
301
+ """
302
+ Delegate unknown attributes to underlying PyMongo cursor.
303
+
304
+ This provides transparent access to all PyMongo cursor methods while
305
+ preserving XLR8's accelerated methods.
306
+
307
+ Note: PyMongo cursor is created lazily only when delegation is needed.
308
+ For explicit access, use .raw_cursor()
309
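+
+         Example (illustrative; max_time_ms() is a standard PyMongo cursor method):
+             >>> cursor.max_time_ms(5000)  # delegated; returns this XLR8Cursor for chaining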
+ """
310
+ # Avoid infinite recursion
311
+ if name.startswith("_"):
312
+ raise AttributeError(
313
+ f"'{type(self).__name__}' object has no attribute '{name}'"
314
+ )
315
+
316
+ # Create PyMongo cursor if needed
317
+ self._ensure_pymongo_cursor()
318
+
319
+ # Get attribute from PyMongo cursor
320
+ attr = getattr(self._pymongo_cursor, name)
321
+
322
+ # If it's a method that returns cursor, wrap the result
323
+ if callable(attr):
324
+
325
+ def wrapper(*args, **kwargs):
326
+ result = attr(*args, **kwargs)
327
+ # If PyMongo method returns cursor, it returns self (the PyMongo cursor)
328
+ # We want to return our wrapper instead
329
+ if result is self._pymongo_cursor:
330
+ return self
331
+ return result
332
+
333
+ return wrapper
334
+
335
+ return attr
336
+
337
+ def __enter__(self):
338
+ """Context manager entry."""
339
+ return self
340
+
341
+ def __exit__(self, exc_type, exc_val, exc_tb):
342
+ """Context manager exit."""
343
+ self.close()
344
+
345
+ # PyMongo compatibility methods
346
+
347
+ def skip(self, count: int) -> "XLR8Cursor":
348
+ """
349
+ Skip documents.
350
+
351
+ Args:
352
+ count: Number of documents to skip
353
+
354
+ Returns:
355
+ Self for chaining
356
+ """
357
+ if self._started:
358
+ raise RuntimeError("Cannot modify cursor after iteration started")
359
+
360
+ self._skip = count
361
+ return self
362
+
363
+ def limit(self, count: int) -> "XLR8Cursor":
364
+ """
365
+ Limit result count.
366
+
367
+ Args:
368
+ count: Maximum documents to return
369
+
370
+ Returns:
371
+ Self for chaining
372
+ """
373
+ if self._started:
374
+ raise RuntimeError("Cannot modify cursor after iteration started")
375
+
376
+ self._limit = count
377
+ return self
378
+
379
+ def sort(
380
+ self, key_or_list: Union[str, List[tuple]], direction: int = 1
381
+ ) -> "XLR8Cursor":
382
+ """
383
+ Sort results.
384
+
385
+ Automatically adds _id as final tie-breaker for deterministic ordering
386
+ (matching MongoDB's behavior).
387
+
388
+ Args:
389
+ key_or_list: Field name or list of (field, direction) tuples
390
+ direction: Sort direction (1=ascending, -1=descending)
391
+
392
+ Returns:
393
+ Self for chaining
394
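+
+         Example (illustrative):
+             >>> cursor.sort("timestamp", -1)                    # newest first
+             >>> cursor.sort([("timestamp", 1), ("value", -1)])  # multi-field sort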
+ """
395
+ if self._started:
396
+ raise RuntimeError("Cannot modify cursor after iteration started")
397
+
398
+ if isinstance(key_or_list, str):
399
+ self._sort = [(key_or_list, direction)]
400
+ else:
401
+ self._sort = key_or_list
402
+
403
+ return self
404
+
405
+ def batch_size(self, size: int) -> "XLR8Cursor":
406
+ """
407
+ Set batch size for iteration.
408
+
409
+ Args:
410
+ size: Batch size
411
+
412
+ Returns:
413
+ Self for chaining
414
+ """
415
+ if self._started:
416
+ raise RuntimeError("Cannot modify cursor after iteration started")
417
+
418
+ self._batch_size = size
419
+ return self
420
+
421
+ def close(self) -> None:
422
+ """Close cursor and free resources."""
423
+ if self._pymongo_cursor is not None:
424
+ self._pymongo_cursor.close()
425
+ self._pymongo_cursor = None
426
+ self._exhausted = True
427
+
428
+ # count() and distinct() removed - use __getattr__ delegation to PyMongo
429
+ # These are available via: cursor.count(), cursor.distinct()
430
+ # __getattr__ automatically forwards them to the underlying PyMongo cursor
431
+
432
+ # XLR8-specific acceleration methods
433
+
434
+ def to_dataframe(
435
+ self,
436
+ accelerate: bool = True,
437
+ cache_read: bool = True,
438
+ cache_write: bool = True,
439
+ start_date: Optional[Union[datetime, date, str]] = None,
440
+ end_date: Optional[Union[datetime, date, str]] = None,
441
+ coerce: Literal["raise", "error"] = "raise",
442
+ max_workers: int = 4,
443
+ chunking_granularity: Optional[timedelta] = None,
444
+ row_group_size: Optional[int] = None,
445
+ flush_ram_limit_mb: int = 512,
446
+ ) -> pd.DataFrame:
447
+ """
448
+ Convert results to Pandas DataFrame with optional acceleration.
449
+
450
+ This is the main acceleration entry point. If the query is chunkable
451
+ and acceleration is enabled, uses parallel execution and Parquet caching
452
+         for up to a 4x speedup on large result sets.
453
+
454
+
455
+ DATA FLOW - ACCELERATION DECISION:
456
+
457
+ INPUT: self._filter (the MongoDB query)
458
+ Example: {
459
+ "timestamp": {"$gte": datetime(2024,1,1), "$lt": datetime(...)},
460
+ "$or": [{"metadata.sensor_id": ObjectId("64a...")}]
461
+ }
462
+
463
+ DECISION STEPS:
464
+ 1. Check if schema exists -> No: raise error (schema required)
465
+ 2. Check if query is chunkable -> No: single-worker, still Parquet
466
+ (is_chunkable_query checks for time bounds, forbidden ops)
467
+ 3. If chunkable: use parallel workers based on time span
468
+
469
+ OUTPUT: pandas.DataFrame with columns from schema
470
+ Example columns: [timestamp, metadata.device_id, value]
471
+
472
+         PERFORMANCE (illustrative only; actual figures depend on data size,
473
+         schema, cache state, etc.):
474
+ - Regular path: ~30s for 500K docs (sequential cursor iteration)
475
+ - Accelerated path: ~10s for 500K docs (parallel + caching)
476
+ - Cache hit: ~0.5s for 500K docs (read from Parquet)
477
+
478
+ Args:
479
+ accelerate: Enable acceleration if query is chunkable
480
+ cache_read: Read from Parquet cache if available
481
+ cache_write: Write results to Parquet cache
482
+ start_date: Filter cached data from this date (inclusive).
483
+ Accepts datetime, date, or ISO string with timezone.
484
+ Example: "2024-01-15T00:00:00Z" or datetime with tzinfo
485
+ end_date: Filter cached data until this date (exclusive).
486
+ Accepts datetime, date, or ISO string with timezone.
487
+ coerce: Error handling mode:
488
+ - "raise": Raise exceptions on schema validation errors (default)
489
+ - "error": Log errors and store None for invalid values
490
+ max_workers: Maximum parallel workers (default: 4). More workers use
491
+ more RAM but process faster. Set to 1 for single-threaded.
492
+ Only used when chunking_granularity is provided.
493
+ chunking_granularity: Time granularity for chunking the query.
494
+ Example: timedelta(days=1) chunks by day, timedelta(hours=1) by hour.
495
+ REQUIRED for parallel execution - determines chunk boundaries.
496
+ If None, single-worker mode is used (no parallelization).
497
+ row_group_size: Rows per Parquet row group. If None, Rust default is used.
498
+ flush_ram_limit_mb: RAM limit in MB for buffered data before flushing to
499
+ Parquet. Higher values mean fewer files but more memory usage.
500
+ (default: 512)
501
+
502
+ Returns:
503
+ Pandas DataFrame with results
504
+
505
+ Raises:
506
+ ValueError: If no schema is provided (schema is required for acceleration)
507
+ ValueError: If date strings are not timezone-aware
508
+
509
+ Example:
510
+ >>> cursor = collection.find({
511
+ ... "timestamp": {"$gte": start, "$lt": end},
512
+ ... "status": "active"
513
+ ... })
514
+ >>> df = cursor.to_dataframe() # Accelerated automatically
515
+ >>>
516
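+             >>> # Illustrative: provide a chunking granularity to enable the
+             >>> # parallel fetch path (values here are arbitrary, not tuned):
+             >>> df = cursor.to_dataframe(
+             ...     chunking_granularity=timedelta(days=7),
+             ...     max_workers=8,
+             ... )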
+ """
517
+ # Schema is required for acceleration
518
+ schema = self._collection.schema
519
+ if schema is None:
520
+ raise ValueError(
521
+ "Schema is required for to_dataframe(). "
522
+ "Provide a schema when creating the collection: "
523
+ "xlr8_collection = xlr8.wrap(collection, schema=my_schema)"
524
+ )
525
+
526
+ # CRITICAL: Validate projection doesn't exclude required fields
527
+ if self._projection:
528
+ # Check if projection is inclusion (has 1 values) or exclusion (has 0 values)
529
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
530
+ is_inclusion = any(v == 1 for v in projection_values)
531
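+             # e.g. {"timestamp": 1, "value": 1} is an inclusion projection and must
+             # list the schema time field explicitly; {"raw_payload": 0} is an exclusion
+             # projection and keeps the time field implicitly (field names illustrative).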
+
532
+ # Time field must be included (required for all operations)
533
+ if is_inclusion:
534
+ time_in_projection = (
535
+ schema.time_field in self._projection
536
+ and self._projection[schema.time_field] == 1
537
+ )
538
+ if not time_in_projection:
539
+ raise ValueError(
540
+ f"Projection must include time field '{schema.time_field}'. "
541
+ f"Projection: {self._projection}"
542
+ )
543
+
544
+ # Sort fields must be included
545
+ if self._sort:
546
+ for sort_field, _ in self._sort:
547
+ if is_inclusion:
548
+ if (
549
+ sort_field not in self._projection
550
+ or self._projection[sort_field] != 1
551
+ ):
552
+ raise ValueError(
553
+ f"Projection must include sort field '{sort_field}'. "
554
+ f"Cannot sort by a field that is projected out. "
555
+ f"Projection: {self._projection}"
556
+ )
557
+
558
+ # CRITICAL: If limit() or skip() are used, fall back to PyMongo
559
+ # Reason: Downloading all data just to return a subset is impractical
560
+ # MongoDB can efficiently handle limit/skip operations
561
+ if self._limit > 0 or self._skip > 0:
562
+ logger.info(
563
+ "limit() or skip() detected - falling back to PyMongo iteration "
564
+ "(acceleration would be impractical for subset queries)"
565
+ )
566
+ # Use fresh PyMongo cursor (not self which may be exhausted)
567
+ pymongo_cursor = self._collection.pymongo_collection.find(
568
+ self._filter, self._projection
569
+ )
570
+ if self._sort:
571
+ pymongo_cursor = pymongo_cursor.sort(self._sort)
572
+ if self._skip:
573
+ pymongo_cursor = pymongo_cursor.skip(self._skip)
574
+ if self._limit:
575
+ pymongo_cursor = pymongo_cursor.limit(self._limit)
576
+ if self._batch_size:
577
+ pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
578
+ return pd.json_normalize(list(pymongo_cursor))
579
+
580
+ # Validate sort field if specified
581
+ if self._sort:
582
+ sort_validation = validate_sort_field(self._sort, schema)
583
+ if not sort_validation.is_valid:
584
+ raise ValueError(f"Sort validation failed: {sort_validation.reason}")
585
+
586
+ # Parse and validate date filters
587
+ parsed_start = parse_datetime_tz_aware(start_date, "start_date")
588
+ parsed_end = parse_datetime_tz_aware(end_date, "end_date")
589
+
590
+ if not accelerate:
591
+ # Fallback to regular iteration (ignores date filters)
592
+ if parsed_start or parsed_end:
593
+ logger.warning(
594
+ "start_date/end_date filters are ignored when accelerate=False"
595
+ )
596
+ return self._to_dataframe_regular()
597
+
598
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
599
+ self._filter,
600
+ schema.time_field,
601
+ self._sort, # Pass sort spec for $natural detection
602
+ )
603
+
604
+ # Validate chunking_granularity if provided
605
+ # CRITICAL: If chunking_granularity is None, we CANNOT chunk the query
606
+ # because we don't know the data's time precision (could be ms, us, ns)
607
+ if chunking_granularity is not None:
608
+ if chunking_granularity.total_seconds() <= 0:
609
+ raise ValueError(
610
+ f"chunking_granularity must be positive, got {chunking_granularity}"
611
+ )
612
+
613
+ if not is_chunkable:
614
+ # REJECT mode - invalid query syntax or contradictory constraints
615
+ # This is different from SINGLE mode (where is_chunkable=True, brackets empty)
616
+ if parsed_start or parsed_end:
617
+ logger.warning(
618
+ "start_date/end_date filters are ignored for non-chunkable queries"
619
+ )
620
+ logger.info("Query has invalid syntax (%s) - cannot execute", reason)
621
+ return self._to_dataframe_accelerated(
622
+ cache_read=cache_read,
623
+ cache_write=cache_write,
624
+ start_date=parsed_start,
625
+ end_date=parsed_end,
626
+ coerce=coerce,
627
+ max_workers=1, # Single worker for invalid queries
628
+ chunking_granularity=None, # No chunking
629
+ is_chunkable=False,
630
+ )
631
+
632
+ # Check for SINGLE mode - valid query but single-worker fallback
633
+ # Indicated by: is_chunkable=True AND empty brackets
634
+ if is_chunkable and not brackets:
635
+ # SINGLE mode examples: $natural sort, unbounded $or branches
636
+ logger.info(
637
+ "Query valid but not parallelizable (%s) - using single-worker mode",
638
+ reason,
639
+ )
640
+ return self._to_dataframe_accelerated(
641
+ cache_read=cache_read,
642
+ cache_write=cache_write,
643
+ start_date=parsed_start,
644
+ end_date=parsed_end,
645
+ coerce=coerce,
646
+ max_workers=1, # Single worker for SINGLE mode
647
+ chunking_granularity=None, # No chunking
648
+ is_chunkable=False,
649
+ )
650
+
651
+ # Query IS chunkable, but do we have granularity info?
652
+ if chunking_granularity is None:
653
+ # No chunking_granularity provided - cannot parallelize safely
654
+ # because we don't know how to split the time range
655
+ logger.info(
656
+ "Query is chunkable but chunking_granularity not provided - "
657
+ "using single-worker mode. Provide chunking_granularity=timedelta(...) "
658
+ "to enable parallel execution."
659
+ )
660
+ return self._to_dataframe_accelerated(
661
+ cache_read=cache_read,
662
+ cache_write=cache_write,
663
+ start_date=parsed_start,
664
+ end_date=parsed_end,
665
+ coerce=coerce,
666
+ max_workers=1, # Single worker - no chunking info
667
+ chunking_granularity=None,
668
+ is_chunkable=False, # Treat as non-chunkable since we can't chunk
669
+ flush_ram_limit_mb=flush_ram_limit_mb, # Pass through for cache reading
670
+ row_group_size=row_group_size, # Pass through for DuckDB batch
671
+ )
672
+
673
+ # Use accelerated parallel execution - we have chunking info!
674
+ return self._to_dataframe_accelerated(
675
+ cache_read=cache_read,
676
+ cache_write=cache_write,
677
+ start_date=parsed_start,
678
+ end_date=parsed_end,
679
+ coerce=coerce,
680
+ max_workers=max_workers,
681
+ chunking_granularity=chunking_granularity,
682
+ is_chunkable=True,
683
+ flush_ram_limit_mb=flush_ram_limit_mb,
684
+ row_group_size=row_group_size,
685
+ )
686
+
687
+ def to_dataframe_batches(
688
+ self,
689
+ batch_size: int = DEFAULT_BATCH_SIZE,
690
+ cache_read: bool = True,
691
+ cache_write: bool = True,
692
+ start_date: Optional[Union[datetime, date, str]] = None,
693
+ end_date: Optional[Union[datetime, date, str]] = None,
694
+ coerce: Literal["raise", "error"] = "raise",
695
+ max_workers: int = 4,
696
+ chunking_granularity: Optional[timedelta] = None,
697
+ row_group_size: Optional[int] = None,
698
+ flush_ram_limit_mb: int = 512,
699
+ ) -> Generator[pd.DataFrame, None, None]:
700
+ """
701
+ Yield DataFrames in batches from cache without loading all data into memory.
702
+
703
+ This is a memory-efficient alternative to to_dataframe() for very large
704
+ result sets. Instead of loading the entire result into memory, it yields
705
+ smaller DataFrames that can be processed incrementally.
706
+
707
+
708
+ MEMORY-EFFICIENT BATCH PROCESSING:
709
+
710
+ Instead of:
711
+ df = cursor.to_dataframe() # Loads ALL 10M rows into RAM
712
+
713
+ Use:
714
+ for batch_df in cursor.to_dataframe_batches(batch_size=50000):
715
+ process(batch_df) # Only 50K rows in RAM at a time
716
+
717
+ Memory usage: O(batch_size) instead of O(total_rows)
718
+
719
+
720
+ Args:
721
+ batch_size: Number of rows per DataFrame batch (default: 10,000)
722
+ cache_read: Read from Parquet cache if available
723
+ cache_write: Write results to Parquet cache on cache miss
724
+ start_date: Filter cached data from this date (inclusive).
725
+ Accepts datetime, date, or ISO string with timezone.
726
+ end_date: Filter cached data until this date (exclusive).
727
+ coerce: Error handling mode ("raise" or "error")
728
+ max_workers: Maximum parallel workers for cache population (default: 4)
729
+             chunking_granularity: Time granularity for chunking (required for parallel fetch)
+             row_group_size: Rows per Parquet row group. If None, the Rust default is used.
+             flush_ram_limit_mb: RAM limit in MB for buffered data before flushing to
+                 Parquet (default: 512)
730
+
731
+ Yields:
732
+ pd.DataFrame: Batches of rows as DataFrames
733
+
734
+ Raises:
735
+ ValueError: If no schema is provided
736
+ ValueError: If date strings are not timezone-aware
737
+ ValueError: If cache doesn't exist and cache_write=False
738
+
739
+ Example:
740
+ >>> # Process 10M rows without loading all into RAM
741
+ >>> total = 0
742
+ >>> for batch_df in cursor.to_dataframe_batches(batch_size=50000):
743
+ ... total += len(batch_df)
744
+ ... # Process batch_df...
745
+ >>> logging.debug(f"Processed {total} rows")
746
+ >>>
747
+ >>> # With date filtering:
748
+ >>> for batch_df in cursor.to_dataframe_batches(
749
+ ... batch_size=10000,
750
+ ... start_date="2024-06-01T00:00:00Z",
751
+ ... end_date="2024-06-15T00:00:00Z"
752
+ ... ):
753
+ ... analyze(batch_df)
754
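+             >>>
+             >>> # Illustrative: append each batch to a CSV file so only one batch
+             >>> # is ever held in memory (output path is hypothetical):
+             >>> for i, batch_df in enumerate(cursor.to_dataframe_batches(batch_size=50000)):
+             ...     batch_df.to_csv("results.csv", mode="a", header=(i == 0), index=False)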
+ """
755
+ # Schema is required
756
+ schema = self._collection.schema
757
+ if schema is None:
758
+ raise ValueError(
759
+ "Schema is required for to_dataframe_batches(). "
760
+ "Provide a schema when creating the collection."
761
+ )
762
+
763
+ # CRITICAL: If limit() or skip() are used, fall back to PyMongo
764
+ # Reason: Downloading all data just to return a subset is impractical
765
+ if self._limit > 0 or self._skip > 0:
766
+ logger.info(
767
+ "limit() or skip() detected - falling back to PyMongo iteration "
768
+ "(acceleration would be impractical for subset queries)"
769
+ )
770
+ # Use fresh PyMongo cursor in batches (not self which may be exhausted)
771
+ pymongo_cursor = self._collection.pymongo_collection.find(
772
+ self._filter, self._projection
773
+ )
774
+ if self._sort:
775
+ pymongo_cursor = pymongo_cursor.sort(self._sort)
776
+ if self._skip:
777
+ pymongo_cursor = pymongo_cursor.skip(self._skip)
778
+ if self._limit:
779
+ pymongo_cursor = pymongo_cursor.limit(self._limit)
780
+ if self._batch_size:
781
+ pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
782
+
783
+ batch = []
784
+ for doc in pymongo_cursor:
785
+ batch.append(doc)
786
+ if len(batch) >= batch_size:
787
+ yield pd.DataFrame(batch)
788
+ batch = []
789
+ if batch:
790
+ yield pd.DataFrame(batch)
791
+ return
792
+
793
+ # CRITICAL: Validate projection doesn't exclude required fields
794
+ if self._projection:
795
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
796
+ is_inclusion = any(v == 1 for v in projection_values)
797
+
798
+ # Time field must be included
799
+ if is_inclusion:
800
+ time_in_projection = (
801
+ schema.time_field in self._projection
802
+ and self._projection[schema.time_field] == 1
803
+ )
804
+ if not time_in_projection:
805
+ raise ValueError(
806
+ f"Projection must include time field '{schema.time_field}'. "
807
+ f"Projection: {self._projection}"
808
+ )
809
+
810
+ # Sort fields must be included
811
+ if self._sort:
812
+ for sort_field, _ in self._sort:
813
+ if is_inclusion:
814
+ if (
815
+ sort_field not in self._projection
816
+ or self._projection[sort_field] != 1
817
+ ):
818
+ raise ValueError(
819
+ f"Projection must include sort field '{sort_field}'. "
820
+ f"Cannot sort by a field that is projected out. "
821
+ f"Projection: {self._projection}"
822
+ )
823
+
824
+ time_field = schema.time_field
825
+
826
+ # Validate sort field if specified
827
+ if self._sort:
828
+ sort_validation = validate_sort_field(self._sort, schema)
829
+ if not sort_validation.is_valid:
830
+ raise ValueError(f"Sort validation failed: {sort_validation.reason}")
831
+ logger.info(
832
+ "Sorted streaming enabled - using DuckDB K-way merge for global sort order"
833
+ )
834
+
835
+ # Parse and validate date filters
836
+ parsed_start = parse_datetime_tz_aware(start_date, "start_date")
837
+ parsed_end = parse_datetime_tz_aware(end_date, "end_date")
838
+
839
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
840
+ self._filter,
841
+ time_field,
842
+ self._sort, # Pass sort spec for $natural detection
843
+ )
844
+
845
+ # Handle REJECT mode (is_chunkable=False)
846
+ if not is_chunkable:
847
+ warnings.warn(
848
+ f"Invalid query syntax ({reason}). Cannot execute this query.",
849
+ UserWarning,
850
+ stacklevel=2,
851
+ )
852
+ # Override max_workers to 1 for invalid queries
853
+ max_workers = 1
854
+ chunking_granularity = None
855
+
856
+ # Handle SINGLE mode (is_chunkable=True but empty brackets)
857
+ elif is_chunkable and not brackets:
858
+ warnings.warn(
859
+ f"Query valid but not parallelizable ({reason}). Using single-worker mode.",
860
+ UserWarning,
861
+ stacklevel=2,
862
+ )
863
+ # Override max_workers to 1 for SINGLE mode
864
+ max_workers = 1
865
+ chunking_granularity = None
866
+
867
+ # Mark as started
868
+ if not self._started:
869
+ self._started = True
870
+
871
+ # Create cache manager
872
+ cache = CacheManager(
873
+ filter_dict=self._filter,
874
+ projection=self._projection,
875
+ sort=self._sort,
876
+ )
877
+
878
+ # Ensure cache exists
879
+ if not cache.exists():
880
+ if not cache_write:
881
+ raise ValueError(
882
+ "Cache does not exist and cache_write=False. "
883
+ "Either call to_dataframe() first to populate cache, "
884
+ "or set cache_write=True."
885
+ )
886
+
887
+ # Populate cache first
888
+ logging.debug("[Query] Cache miss - fetching from MongoDB...")
889
+
890
+ # Populate cache via accelerated executor
891
+ result = execute_parallel_stream_to_cache(
892
+ pymongo_collection=self._collection.pymongo_collection,
893
+ filter_dict=self._filter,
894
+ schema=schema,
895
+ cache_manager=cache,
896
+ projection=self._projection,
897
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
898
+ max_workers=max_workers,
899
+ peak_ram_limit_mb=flush_ram_limit_mb,
900
+ chunking_granularity=chunking_granularity,
901
+ mongo_uri=self._collection.mongo_uri,
902
+ sort_spec=self._sort, # Pass sort for pre-sorting during Parquet write
903
+ row_group_size=row_group_size,
904
+ )
905
+
906
+ logging.debug(
907
+ f"\n[Cache] Cache written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
908
+ )
909
+
910
+ elif not cache_read and cache_write:
911
+ # CRITICAL: cache_read=False but cache_write=True and cache exists
912
+ # Clear old cache and re-populate to avoid duplicate data
913
+ logging.debug(
914
+ "[Clean] Clearing existing cache (cache_read=False, starting fresh)..."
915
+ )
916
+ cache.clean()
917
+
918
+ logging.debug("[Query] Re-fetching from MongoDB...")
919
+
920
+ # Re-populate cache via accelerated executor
921
+ result = execute_parallel_stream_to_cache(
922
+ pymongo_collection=self._collection.pymongo_collection,
923
+ filter_dict=self._filter,
924
+ schema=schema,
925
+ cache_manager=cache,
926
+ projection=self._projection,
927
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
928
+ max_workers=max_workers,
929
+ peak_ram_limit_mb=flush_ram_limit_mb,
930
+ chunking_granularity=chunking_granularity,
931
+ mongo_uri=self._collection.mongo_uri,
932
+ sort_spec=self._sort, # Pass sort for pre-sorting during Parquet write
933
+ row_group_size=row_group_size,
934
+ )
935
+
936
+ logging.debug(
937
+ f"\n[Cache] Cache re-written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
938
+ )
939
+
940
+ # Now yield batches from cache
941
+ logging.debug(f"[Cache] Streaming batches from cache: {cache.cache_dir}")
942
+ reader = ParquetReader(cache.cache_dir)
943
+
944
+ # Use globally sorted streaming if sort is specified
945
+ if self._sort:
946
+ logging.debug("[Sort] Using DuckDB K-way merge for globally sorted batches")
947
+ yield from reader.iter_globally_sorted_batches(
948
+ sort_spec=self._sort, # Pass full sort spec for multi-field sorting
949
+ batch_size=batch_size,
950
+ schema=schema,
951
+ time_field=time_field,
952
+ start_date=parsed_start,
953
+ end_date=parsed_end,
954
+ coerce=coerce,
955
+ memory_limit_mb=flush_ram_limit_mb, # Pass RAM limit to DuckDB
956
+ threads=max_workers, # Pass thread count to DuckDB
957
+ )
958
+ else:
959
+ yield from reader.iter_dataframe_batches(
960
+ batch_size=batch_size,
961
+ schema=schema,
962
+ time_field=time_field,
963
+ start_date=parsed_start,
964
+ end_date=parsed_end,
965
+ coerce=coerce,
966
+ )
967
+
968
+ def stream_to_callback(
969
+ self,
970
+ callback: Callable[["pa.Table", Dict[str, Any]], None],
971
+ *,
972
+ partition_time_delta: timedelta,
973
+ partition_by: Optional[Union[str, List[str]]] = None,
974
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
975
+ max_workers: int = 4,
976
+ chunking_granularity: Optional[timedelta] = None,
977
+ row_group_size: Optional[int] = None,
978
+ flush_ram_limit_mb: int = 512,
979
+ cache_read: bool = True,
980
+ cache_write: bool = True,
981
+ ) -> Dict[str, Any]:
982
+ """
983
+ Stream partitioned PyArrow tables to a callback function.
984
+
985
+ This is a two-phase operation:
986
+ 1. Download data from MongoDB to local Parquet cache (reuses Rust backend)
987
+ 2. Partition data and call callback in parallel for each partition
988
+
989
+ Perfect for populating data lakes with partitioned data structures.
990
+
991
+ ┌─────────────────────────────────────────────────────────────────────┐
992
+ │ PARTITION MODES: │
993
+ │ │
994
+ │ TIME ONLY (partition_by=None): │
995
+ │ partition_time_delta=timedelta(weeks=1) │
996
+ │ -> 1 callback per week of data │
997
+ │ │
998
+ │ TIME + FIELD (partition_by="metadata.instrument"): │
999
+ │ partition_time_delta=timedelta(weeks=1) │
1000
+ │ -> 1 callback per (week, instrument) combination │
1001
+ │ │
1002
+ │ Example: 1 year of data, 10 instruments, weekly partitions │
1003
+ │ -> 52 weeks × 10 instruments = 520 callbacks │
1004
+ └─────────────────────────────────────────────────────────────────────┘
1005
+
1006
+ The callback receives:
1007
+ - table: PyArrow Table with data for this partition
1008
+ - metadata: Dict with partition info:
1009
+ {
1010
+ "time_start": datetime, # Start of time bucket
1011
+ "time_end": datetime, # End of time bucket
1012
+ "partition_values": {...}, # Values for partition_by fields
1013
+ "row_count": int, # Rows in this table
1014
+ "partition_index": int, # 0-based partition index
1015
+ "total_partitions": int, # Total partition count
1016
+ }
1017
+
1018
+ Args:
1019
+ callback: Function(table: pa.Table, metadata: dict) -> None
1020
+ Called for each partition. Runs in ThreadPoolExecutor.
1021
+ partition_time_delta: Time bucket size for partitioning.
1022
+ Example: timedelta(weeks=1) creates weekly partitions.
1023
+ REQUIRED - determines how data is grouped.
1024
+ partition_by: Field(s) to partition by, in addition to time.
1025
+ Example: "metadata.instrument" or ["region", "device_id"]
1026
+ Can be any field in schema except time field.
1027
+ None = partition by time only.
1028
+ any_type_strategy: How to decode Types.Any() struct columns:
1029
+ - "float": Coalesce to Float64, prioritize numeric (default)
1030
+ - "string": Convert everything to string (lossless)
1031
+ - "keep_struct": Keep raw struct, don't decode
1032
+ max_workers: Number of parallel callback threads (default: 4).
1033
+                 DuckDB releases the GIL, so threads get true parallelism.
1034
+ chunking_granularity: Time granularity for MongoDB fetch chunks.
1035
+ Used during Phase 1 (download). Example: timedelta(hours=16).
1036
+                 If None, defaults to partition_time_delta.
+             row_group_size: Rows per Parquet row group. If None, the Rust default is used.
1037
+ flush_ram_limit_mb: RAM limit for buffered data (default: 512).
1038
+ Used during both download and partition phases.
1039
+ cache_read: Read from existing cache if available (default: True).
1040
+ cache_write: Write to cache during download (default: True).
1041
+
1042
+ Returns:
1043
+ Dict with:
1044
+ - total_partitions: Number of partitions processed
1045
+ - total_rows: Total rows across all partitions
1046
+ - skipped_partitions: Empty partitions skipped
1047
+ - duration_s: Total execution time
1048
+ - cache_duration_s: Time spent on cache population
1049
+ - partition_duration_s: Time spent on partition callbacks
1050
+
1051
+ Raises:
1052
+ ValueError: If no schema provided
1053
+ ValueError: If query not chunkable (no time bounds)
1054
+ ValueError: If sort specified on non-time field
1055
+ RuntimeError: If callback fails for any partition
1056
+
1057
+ Example:
1058
+ >>> # Upload weekly data per instrument to S3 data lake
1059
+ >>> import pyarrow.parquet as pq
1060
+ >>> import s3fs
1061
+ >>>
1062
+ >>> fs = s3fs.S3FileSystem()
1063
+ >>>
1064
+ >>> def upload_partition(table, metadata):
1065
+ ... instrument = metadata['partition_values'].get('metadata.instrument', 'unknown')
1066
+ ... week = metadata['time_start'].strftime('%Y-%m-%d')
1067
+ ... path = f"s3://bucket/data/instrument={instrument}/week={week}.parquet"
1068
+ ... pq.write_table(table, path, filesystem=fs)
1069
+ >>>
1070
+ >>> cursor.stream_to_callback(
1071
+ ... callback=upload_partition,
1072
+ ... partition_time_delta=timedelta(weeks=1),
1073
+ ... partition_by="metadata.instrument",
1074
+ ... max_workers=8,
1075
+ ... chunking_granularity=timedelta(hours=16),
1076
+ ... )
1077
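+             >>>
+             >>> # Illustrative time-only partitioning (partition_by=None): one callback
+             >>> # per weekly bucket, written to a local path (paths are hypothetical).
+             >>> def save_week(table, metadata):
+             ...     week = metadata["time_start"].strftime("%Y-%m-%d")
+             ...     pq.write_table(table, f"./out/week={week}.parquet")
+             >>>
+             >>> summary = cursor.stream_to_callback(
+             ...     callback=save_week,
+             ...     partition_time_delta=timedelta(weeks=1),
+             ... )
+             >>> summary["total_partitions"], summary["total_rows"]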
+ """
1078
+ total_start = time.time()
1079
+
1080
+ schema = self._collection.schema
1081
+ if schema is None:
1082
+ raise ValueError(
1083
+ "Schema is required for stream_to_callback(). "
1084
+ "Provide a schema when creating the collection."
1085
+ )
1086
+
1087
+ # CRITICAL: limit() and skip() don't make sense for streaming callbacks
1088
+ # These operations require knowing the full result set, which defeats
1089
+ # the purpose of streaming
1090
+ if self._limit > 0 or self._skip > 0:
1091
+ raise ValueError(
1092
+ "stream_to_callback() does not support limit() or skip(). "
1093
+ "These operations require knowing the total result set size upfront, "
1094
+ "which defeats the purpose of streaming. "
1095
+ "Use to_dataframe() or iterate with PyMongo cursor instead."
1096
+ )
1097
+
1098
+ time_field = schema.time_field
1099
+
1100
+ # CRITICAL: Validate projection doesn't exclude partition_by fields
1101
+ if self._projection and partition_by:
1102
+ # Check if projection is inclusion (has 1 values) or exclusion (has 0 values)
1103
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
1104
+ is_inclusion = any(v == 1 for v in projection_values)
1105
+
1106
+ # Time field must be included
1107
+ if is_inclusion:
1108
+ time_in_projection = (
1109
+ time_field in self._projection and self._projection[time_field] == 1
1110
+ )
1111
+ if not time_in_projection:
1112
+ raise ValueError(
1113
+ f"Projection must include time field '{time_field}'. "
1114
+ f"Projection: {self._projection}"
1115
+ )
1116
+
1117
+ # Partition fields must be included
1118
+ partition_by_list = (
1119
+ [partition_by] if isinstance(partition_by, str) else partition_by
1120
+ )
1121
+ for field in partition_by_list:
1122
+ if is_inclusion:
1123
+ # For parent fields like "metadata", check if any child is included
1124
+ field_or_children_included = (
1125
+ field in self._projection and self._projection[field] == 1
1126
+ ) or any(
1127
+ k.startswith(f"{field}.") and self._projection[k] == 1
1128
+ for k in self._projection.keys()
1129
+ )
1130
+ if not field_or_children_included:
1131
+ raise ValueError(
1132
+ f"Projection must include partition field '{field}'. "
1133
+ f"Cannot partition by a field that is projected out. "
1134
+ f"Projection: {self._projection}"
1135
+ )
1136
+
1137
+ # Validate sort fields in projection
1138
+ if self._projection and self._sort:
1139
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
1140
+ is_inclusion = any(v == 1 for v in projection_values)
1141
+ for sort_field, _ in self._sort:
1142
+ if is_inclusion:
1143
+ if (
1144
+ sort_field not in self._projection
1145
+ or self._projection[sort_field] != 1
1146
+ ):
1147
+ raise ValueError(
1148
+ f"Projection must include sort field '{sort_field}'. "
1149
+ f"Projection: {self._projection}"
1150
+ )
1151
+
1152
+ # Validate sort - only allow time field sorting
1153
+ if self._sort:
1154
+ for field, _direction in self._sort:
1155
+ if field != time_field:
1156
+ raise ValueError(
1157
+ f"stream_to_callback() only supports sorting by time field '{time_field}'. "
1158
+ f"Got sort field: '{field}'. "
1159
+ "Remove .sort() or sort only by time field."
1160
+ )
1161
+ # Store sort direction
1162
+ sort_ascending = self._sort[0][1] == 1
1163
+ else:
1164
+ sort_ascending = True # Default to ascending
1165
+
1166
+ # Normalize partition_by to list
1167
+ partition_by_list: Optional[List[str]] = None
1168
+ if partition_by is not None:
1169
+ if isinstance(partition_by, str):
1170
+ partition_by_list = [partition_by]
1171
+ else:
1172
+ partition_by_list = list(partition_by)
1173
+
1174
+ # Validate partition_by fields exist in schema (or are parent fields with children)
1175
+ all_schema_fields = list(schema.fields.keys())
1176
+         for field in partition_by_list or []:  # partition_by=None -> time-only, nothing to validate
1177
+ if field == time_field:
1178
+ raise ValueError(
1179
+ f"Cannot partition by time field '{time_field}'. "
1180
+ "Time partitioning is automatic via partition_time_delta."
1181
+ )
1182
+ # Check if field exists directly OR has children
1183
+ has_direct = schema.has_field(field)
1184
+ has_children = any(f.startswith(f"{field}.") for f in all_schema_fields)
1185
+ if not has_direct and not has_children:
1186
+ raise ValueError(
1187
+ f"Partition field '{field}' not found in schema. "
1188
+ f"Available fields: {all_schema_fields}"
1189
+ )
1190
+
1191
+ # Default chunking_granularity to partition_time_delta
1192
+ if chunking_granularity is None:
1193
+ chunking_granularity = partition_time_delta
1194
+
1195
+ # NEW: build_brackets_for_find internally validates via is_chunkable_query
1196
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
1197
+ self._filter,
1198
+ time_field,
1199
+ self._sort, # Pass sort spec for $natural detection
1200
+ )
1201
+
1202
+ # Handle REJECT mode (is_chunkable=False)
1203
+ if not is_chunkable:
1204
+ warnings.warn(
1205
+ f"Invalid query syntax ({reason}). Cannot execute this query.",
1206
+ UserWarning,
1207
+ stacklevel=2,
1208
+ )
1209
+ # Override max_workers to 1 for invalid queries
1210
+ max_workers = 1
1211
+ chunking_granularity = None
1212
+
1213
+ # Handle SINGLE mode (is_chunkable=True but empty brackets)
1214
+ elif is_chunkable and not brackets:
1215
+ warnings.warn(
1216
+ f"Query valid but not parallelizable ({reason}). Using single-worker mode.",
1217
+ UserWarning,
1218
+ stacklevel=2,
1219
+ )
1220
+ # Override max_workers to 1 for SINGLE mode
1221
+ max_workers = 1
1222
+ chunking_granularity = None
1223
+
1224
+ # Mark as started
1225
+ if not self._started:
1226
+ self._started = True
1227
+
1228
+ # ─────────────────────────────────────────────────────────────────────
1229
+ # PHASE 1: Download to cache (reuses existing Rust backend)
1230
+ # ─────────────────────────────────────────────────────────────────────
1231
+ cache = CacheManager(
1232
+ filter_dict=self._filter,
1233
+ projection=self._projection,
1234
+ sort=self._sort,
1235
+ )
1236
+
1237
+ cache_start = time.time()
1238
+
1239
+ if cache_read and cache.exists():
1240
+ logging.debug(f"[Cache] Using existing cache: {cache.cache_dir}")
1241
+ else:
1242
+ if not cache_write:
1243
+ raise ValueError(
1244
+ "Cache does not exist and cache_write=False. "
1245
+ "Set cache_write=True to download data first."
1246
+ )
1247
+
1248
+ if cache.exists() and not cache_read:
1249
+ logging.debug("[Clean] Clearing existing cache (cache_read=False)...")
1250
+ cache.clean()
1251
+
1252
+ logging.debug("[Query] Downloading from MongoDB to cache...")
1253
+ result = execute_parallel_stream_to_cache(
1254
+ pymongo_collection=self._collection.pymongo_collection,
1255
+ filter_dict=self._filter,
1256
+ schema=schema,
1257
+ cache_manager=cache,
1258
+ projection=self._projection,
1259
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
1260
+ max_workers=max_workers,
1261
+ peak_ram_limit_mb=flush_ram_limit_mb,
1262
+ chunking_granularity=chunking_granularity,
1263
+ mongo_uri=self._collection.mongo_uri,
1264
+ row_group_size=row_group_size,
1265
+ )
1266
+ logging.debug(
1267
+ f"[Cache] Downloaded: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
1268
+ )
1269
+
1270
+ cache_duration = time.time() - cache_start
1271
+
1272
+ # ─────────────────────────────────────────────────────────────────────
1273
+ # PHASE 2: Partition and stream to callbacks
1274
+ # ─────────────────────────────────────────────────────────────────────
1275
+
1276
+ partition_result = execute_partitioned_callback(
1277
+ cache_dir=str(cache.cache_dir),
1278
+ schema=schema,
1279
+ callback=callback,
1280
+ partition_time_delta=partition_time_delta,
1281
+ partition_by=partition_by_list,
1282
+ any_type_strategy=any_type_strategy,
1283
+ max_workers=max_workers,
1284
+ sort_ascending=sort_ascending,
1285
+ memory_limit_mb=flush_ram_limit_mb,
1286
+ )
1287
+
1288
+ total_duration = time.time() - total_start
1289
+
1290
+ return {
1291
+ "total_partitions": partition_result["total_partitions"],
1292
+ "total_rows": partition_result["total_rows"],
1293
+ "skipped_partitions": partition_result["skipped_partitions"],
1294
+ "duration_s": total_duration,
1295
+ "cache_duration_s": cache_duration,
1296
+ "partition_duration_s": partition_result["duration_s"],
1297
+ }
1298
+
1299
+ def to_polars(
1300
+ self,
1301
+ accelerate: bool = True,
1302
+ cache_read: bool = True,
1303
+ cache_write: bool = True,
1304
+ start_date: Optional[Union[datetime, date, str]] = None,
1305
+ end_date: Optional[Union[datetime, date, str]] = None,
1306
+ coerce: Literal["raise", "error"] = "raise",
1307
+ max_workers: int = 4,
1308
+ chunking_granularity: Optional[timedelta] = None,
1309
+ row_group_size: Optional[int] = None,
1310
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
1311
+ flush_ram_limit_mb: int = 512,
1312
+ ) -> pl.DataFrame:
1313
+ """
1314
+ Convert results to Polars DataFrame with optional acceleration.
1315
+
1316
+ This mirrors to_dataframe() but returns a Polars DataFrame.
1317
+ Uses ParquetReader with engine="polars" for efficient native reading.
1318
+
1319
+ Args:
1320
+ accelerate: Enable acceleration if query is chunkable
1321
+ cache_read: Read from Parquet cache if available
1322
+ cache_write: Write results to Parquet cache
1323
+ start_date: Filter cached data from this date (inclusive).
1324
+ Accepts datetime, date, or ISO string with timezone.
1325
+ end_date: Filter cached data until this date (exclusive).
1326
+ coerce: Error handling mode ("raise" or "error")
1327
+ max_workers: Maximum parallel workers (default: 4)
1328
+ chunking_granularity: Time granularity for chunking (e.g., timedelta(days=1))
1329
+ row_group_size: Rows per parquet row group. If None, Rust default is used.
1330
+ any_type_strategy: How to decode Types.Any() struct columns:
1331
+ - "float": Coalesce to Float64, prioritize numeric (default)
1332
+ - "string": Convert everything to string (lossless)
1333
+ - "keep_struct": Keep raw struct, don't decode
1334
+ flush_ram_limit_mb: RAM limit in MB for buffered data before flushing.
1335
+ (default: 512)
1336
+
1337
+ Returns:
1338
+ Polars DataFrame with results
1339
+
1340
+ Raises:
1341
+ ValueError: If no schema is provided
1342
+
1343
+ Example:
1344
+ >>> cursor = collection.find({...}).sort("timestamp", 1)
1345
+ >>> df = cursor.to_polars(
1346
+ ... max_workers=8,
1347
+ ... chunking_granularity=timedelta(days=7),
1348
+ ... flush_ram_limit_mb=2000,
1349
+ ... )
1350
+ """
1351
+ schema = self._collection.schema
1352
+ if schema is None:
1353
+ raise ValueError(
1354
+ "Schema is required for to_polars(). "
1355
+ "Provide a schema when creating the collection."
1356
+ )
1357
+
1358
+ # CRITICAL: If limit() or skip() are used, fall back to PyMongo
1359
+ # Reason: Downloading all data just to return a subset is impractical
1360
+ if self._limit > 0 or self._skip > 0:
1361
+ logger.info(
1362
+ "limit() or skip() detected - falling back to PyMongo iteration "
1363
+ "(acceleration would be impractical for subset queries)"
1364
+ )
1365
+ # Use fresh PyMongo cursor (not self which may be exhausted)
1366
+ pymongo_cursor = self._collection.pymongo_collection.find(
1367
+ self._filter, self._projection
1368
+ )
1369
+ if self._sort:
1370
+ pymongo_cursor = pymongo_cursor.sort(self._sort)
1371
+ if self._skip:
1372
+ pymongo_cursor = pymongo_cursor.skip(self._skip)
1373
+ if self._limit:
1374
+ pymongo_cursor = pymongo_cursor.limit(self._limit)
1375
+ if self._batch_size:
1376
+ pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
1377
+ docs = list(pymongo_cursor)
1378
+ if not docs:
1379
+ return pl.DataFrame()
1380
+ return pl.DataFrame(docs)
1381
+
1382
+ # CRITICAL: Validate projection doesn't exclude required fields
1383
+ if self._projection:
1384
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
1385
+ is_inclusion = any(v == 1 for v in projection_values)
1386
+
1387
+ # Time field must be included
1388
+ if is_inclusion:
1389
+ time_in_projection = (
1390
+ schema.time_field in self._projection
1391
+ and self._projection[schema.time_field] == 1
1392
+ )
1393
+ if not time_in_projection:
1394
+ raise ValueError(
1395
+ f"Projection must include time field '{schema.time_field}'. "
1396
+ f"Projection: {self._projection}"
1397
+ )
1398
+
1399
+ # Sort fields must be included
1400
+ if self._sort:
1401
+ for sort_field, _ in self._sort:
1402
+ if is_inclusion:
1403
+ if (
1404
+ sort_field not in self._projection
1405
+ or self._projection[sort_field] != 1
1406
+ ):
1407
+ raise ValueError(
1408
+ f"Projection must include sort field '{sort_field}'. "
1409
+ f"Cannot sort by a field that is projected out. "
1410
+ f"Projection: {self._projection}"
1411
+ )
1412
+
1413
+ time_field = schema.time_field
1414
+
1415
+ # Validate sort field if specified
1416
+ if self._sort:
1417
+ sort_validation = validate_sort_field(self._sort, schema)
1418
+ if not sort_validation.is_valid:
1419
+ raise ValueError(f"Sort validation failed: {sort_validation.reason}")
1420
+
1421
+ # Parse and validate date filters
1422
+ parsed_start = parse_datetime_tz_aware(start_date, "start_date")
1423
+ parsed_end = parse_datetime_tz_aware(end_date, "end_date")
1424
+
1425
+ if not accelerate:
1426
+ if parsed_start or parsed_end:
1427
+ logger.warning(
1428
+ "start_date/end_date filters are ignored when accelerate=False"
1429
+ )
1430
+ # Fallback to regular iteration (native Polars from dicts)
1431
+ return self._to_polars_regular()
1432
+
1433
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
1434
+ self._filter,
1435
+ schema.time_field,
1436
+ self._sort, # Pass sort spec for $natural detection
1437
+ )
1438
+
1439
+ # Handle REJECT mode (is_chunkable=False)
1440
+ if not is_chunkable:
1441
+ if parsed_start or parsed_end:
1442
+ logger.warning(
1443
+ "start_date/end_date filters are ignored for non-chunkable queries"
1444
+ )
1445
+ logger.info("Invalid query syntax (%s) - cannot execute", reason)
1446
+ # Fall back to single-worker mode
1447
+ max_workers = 1
1448
+ chunking_granularity = None
1449
+
1450
+ # Handle SINGLE mode (is_chunkable=True but empty brackets)
1451
+ elif is_chunkable and not brackets:
1452
+ logger.info(
1453
+ "Query valid but not parallelizable (%s) - using single-worker mode",
1454
+ reason,
1455
+ )
1456
+ # Fall back to single-worker mode
1457
+ max_workers = 1
1458
+ chunking_granularity = None
1459
+
1460
+ # Create cache manager
1461
+ cache = CacheManager(
1462
+ filter_dict=self._filter,
1463
+ projection=self._projection,
1464
+ sort=self._sort,
1465
+ )
1466
+
1467
+ # Check if cache exists
1468
+ if cache_read and cache.exists():
1469
+ logging.debug(f"[Cache] Reading from cache (polars): {cache.cache_dir}")
1470
+ reader = ParquetReader(cache.cache_dir)
1471
+ df = cast(
1472
+ pl.DataFrame,
1473
+ reader.to_dataframe(
1474
+ engine="polars",
1475
+ schema=schema,
1476
+ time_field=time_field,
1477
+ start_date=parsed_start,
1478
+ end_date=parsed_end,
1479
+ coerce=coerce,
1480
+ any_type_strategy=any_type_strategy,
1481
+ ),
1482
+ )
1483
+
1484
+ # Check if we need DuckDB sorting (Any types or List types)
1485
+ need_duckdb_sort = False
1486
+ sort_infos: List[Dict[str, Any]] = []
1487
+ if self._sort:
1488
+ sort_infos = get_sort_field_info(self._sort, schema)
1489
+
1490
+ # Expand parent fields to children and collect all fields to check
1491
+ fields_to_check = []
1492
+ for info in sort_infos:
1493
+ if info["is_parent"]:
1494
+ # Parent field - check all children
1495
+ fields_to_check.extend(info["child_fields"])
1496
+ else:
1497
+ # Direct field
1498
+ fields_to_check.append(info["field"])
1499
+
1500
+ # Check if any of the actual sort fields (after expansion) are Any/List types
1501
+ for field in fields_to_check:
1502
+ if field in schema.fields:
1503
+ field_type = schema.fields[field]
1504
+ if isinstance(field_type, (AnyType, ListType)):
1505
+ need_duckdb_sort = True
1506
+ break
1507
+
1508
+ if self._sort and need_duckdb_sort:
1509
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
1510
+ logging.debug(
1511
+ "[Sort] Using DuckDB for Types.Any()/Types.List() sorting..."
1512
+ )
1513
+
1514
+ warnings.warn(
1515
+ "Sorting by Types.Any() field in to_polars returns raw struct columns "
1516
+ "(e.g., 'value.float_value', 'value.int64_value'). "
1517
+ "Use to_dataframe() for decoded Any() values.",
1518
+ UserWarning,
1519
+ )
1520
+
1521
+ # Use get_globally_sorted_dataframe() - more efficient than batching
1522
+ combined_df = reader.get_globally_sorted_dataframe(
1523
+ sort_spec=self._sort,
1524
+ schema=schema,
1525
+ time_field=time_field,
1526
+ start_date=parsed_start,
1527
+ end_date=parsed_end,
1528
+ coerce=coerce,
1529
+ )
1530
+
1531
+ if not combined_df.empty:
1532
+ for col in combined_df.columns:
1533
+ if combined_df[col].dtype == object:
1534
+ first_val = (
1535
+ combined_df[col].dropna().iloc[0]
1536
+ if not combined_df[col].dropna().empty
1537
+ else None
1538
+ )
1539
+ if (
1540
+ first_val is not None
1541
+ and type(first_val).__name__ == "ObjectId"
1542
+ ):
1543
+ combined_df[col] = combined_df[col].astype(str)
1544
+ df = pl.from_pandas(combined_df)
1545
+ else:
1546
+ df = pl.DataFrame()
1547
+
1548
+ elif self._sort:
1549
+ # Native Polars sort - expand parent fields to children
1550
+ expanded_sort = []
1551
+ for info in sort_infos:
1552
+ if info["is_parent"]:
1553
+ # Expand parent field to all children
1554
+ for child in info["child_fields"]:
1555
+ expanded_sort.append((child, info["direction"]))
1556
+ else:
1557
+ expanded_sort.append((info["field"], info["direction"]))
1558
+
1559
+ sort_fields = [
1560
+ field for field, _ in expanded_sort if field in df.columns
1561
+ ]
1562
+ descending = [
1563
+ direction == -1
1564
+ for field, direction in expanded_sort
1565
+ if field in df.columns
1566
+ ]
1567
+ if sort_fields:
1568
+ df = df.sort(sort_fields, descending=descending)
1569
+
1570
+ # Apply skip/limit
1571
+ if self._skip:
1572
+ df = df.slice(self._skip)
1573
+ if self._limit:
1574
+ df = df.head(self._limit)
1575
+
1576
+ logging.debug(
1577
+ f"[OK] Loaded {len(df):,} documents from cache ({reader.get_statistics()['total_size_mb']:.1f} MB)"
1578
+ )
1579
+ return df
1580
+
1581
+ # Cache miss - need to fetch and write
1582
+ if not cache_write:
1583
+ raise ValueError(
1584
+ "Cache does not exist and cache_write=False. "
1585
+ "Either enable cache_write or call to_dataframe() first."
1586
+ )
1587
+
1588
+ # Fetch data (uses same logic as to_dataframe)
1589
+ mode_str = (
1590
+ "parallel" if is_chunkable and chunking_granularity else "single-worker"
1591
+ )
1592
+ logging.debug(
1593
+ f"[Query] Cache miss - fetching from MongoDB ({mode_str} mode)..."
1594
+ )
1595
+
1596
+ result = execute_parallel_stream_to_cache(
1597
+ pymongo_collection=self._collection.pymongo_collection,
1598
+ filter_dict=self._filter,
1599
+ schema=schema,
1600
+ cache_manager=cache,
1601
+ projection=self._projection,
1602
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
1603
+ max_workers=max_workers if is_chunkable else 1,
1604
+ peak_ram_limit_mb=flush_ram_limit_mb,
1605
+ chunking_granularity=chunking_granularity if is_chunkable else None,
1606
+ mongo_uri=self._collection.mongo_uri,
1607
+ row_group_size=row_group_size,
1608
+ )
1609
+
1610
+ logging.debug(
1611
+ f"\n[Cache] Cache written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
1612
+ )
1613
+
1614
+ # Read from cache as Polars
1615
+ logging.debug("[Cache] Reading from cache to build Polars DataFrame...")
1616
+ reader = ParquetReader(cache.cache_dir)
1617
+
1618
+ # Check if we need DuckDB sorting (Any types or List types)
1619
+ need_duckdb_sort = False
1620
+ sort_infos: List[Dict[str, Any]] = []
1621
+ if self._sort:
1622
+ sort_infos = get_sort_field_info(self._sort, schema)
1623
+
1624
+ # Expand parent fields to children and collect all fields to check
1625
+ fields_to_check = []
1626
+ for info in sort_infos:
1627
+ if info["is_parent"]:
1628
+ # Parent field - check all children
1629
+ fields_to_check.extend(info["child_fields"])
1630
+ else:
1631
+ # Direct field
1632
+ fields_to_check.append(info["field"])
1633
+
1634
+ # Check if any of the actual sort fields (after expansion) are Any/List types
1635
+ for field in fields_to_check:
1636
+ if field in schema.fields:
1637
+ field_type = schema.fields[field]
1638
+ if isinstance(field_type, (AnyType, ListType)):
1639
+ need_duckdb_sort = True
1640
+ break
1641
+
1642
+ if self._sort and need_duckdb_sort:
1643
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
1644
+ logging.debug("[Sort] Using DuckDB for Types.Any()/Types.List() sorting...")
1645
+
1646
+ warnings.warn(
1647
+ "Sorting by Types.Any() field in to_polars returns raw struct columns "
1648
+ "(e.g., 'value.float_value', 'value.int64_value'). "
1649
+ "Use to_dataframe() for decoded Any() values.",
1650
+ UserWarning,
1651
+ )
1652
+
1653
+ # Use get_globally_sorted_dataframe() - more efficient than batching
1654
+ combined_df = reader.get_globally_sorted_dataframe(
1655
+ sort_spec=self._sort,
1656
+ schema=schema,
1657
+ time_field=time_field,
1658
+ start_date=parsed_start,
1659
+ end_date=parsed_end,
1660
+ coerce=coerce,
1661
+ )
1662
+
1663
+ if not combined_df.empty:
1664
+ for col in combined_df.columns:
1665
+ if combined_df[col].dtype == object:
1666
+ first_val = (
1667
+ combined_df[col].dropna().iloc[0]
1668
+ if not combined_df[col].dropna().empty
1669
+ else None
1670
+ )
1671
+ if (
1672
+ first_val is not None
1673
+ and type(first_val).__name__ == "ObjectId"
1674
+ ):
1675
+ combined_df[col] = combined_df[col].astype(str)
1676
+ df = pl.from_pandas(combined_df)
1677
+ else:
1678
+ df = pl.DataFrame()
1679
+ else:
1680
+ df = cast(
1681
+ pl.DataFrame,
1682
+ reader.to_dataframe(
1683
+ engine="polars",
1684
+ schema=schema,
1685
+ time_field=time_field,
1686
+ start_date=parsed_start,
1687
+ end_date=parsed_end,
1688
+ coerce=coerce,
1689
+ any_type_strategy=any_type_strategy,
1690
+ ),
1691
+ )
1692
+
1693
+ # Native Polars sort - expand parent fields to children
1694
+ if self._sort:
1695
+ expanded_sort = []
1696
+ for info in sort_infos:
1697
+ if info["is_parent"]:
1698
+ for child in info["child_fields"]:
1699
+ expanded_sort.append((child, info["direction"]))
1700
+ else:
1701
+ expanded_sort.append((info["field"], info["direction"]))
1702
+
1703
+ sort_fields = [
1704
+ field for field, _ in expanded_sort if field in df.columns
1705
+ ]
1706
+ descending = [
1707
+ direction == -1
1708
+ for field, direction in expanded_sort
1709
+ if field in df.columns
1710
+ ]
1711
+ if sort_fields:
1712
+ # Note: very old Polars versions used `reverse=` instead of `descending=`.
1713
+ df = df.sort(sort_fields, descending=descending)
1714
+
1715
+ # Apply skip/limit
1716
+ if self._skip:
1717
+ df = df.slice(self._skip)
1718
+ if self._limit:
1719
+ df = df.head(self._limit)
1720
+
1721
+ return df
1722
+
1723
+ def _to_dataframe_regular(self) -> pd.DataFrame:
1724
+ """
1725
+ Convert to DataFrame without acceleration.
1726
+
1727
+ Uses regular PyMongo iteration. Fallback for:
1728
+ - Non-chunkable queries
1729
+ - No schema provided
1730
+ - Acceleration disabled
1731
+
1732
+ Returns:
1733
+ Pandas DataFrame
1734
+ """
1735
+ # Collect all documents - __iter__ will set _started
1736
+ # Convert to DataFrame
1737
+ return pd.json_normalize(list(self))
1738
+
1739
+ def _to_polars_regular(self) -> "pl.DataFrame":
1740
+ """
1741
+ Convert to Polars DataFrame without acceleration.
1742
+
1743
+ Uses regular PyMongo iteration with native Polars conversion.
1744
+ Fallback for:
1745
+ - Non-chunkable queries
1746
+ - No schema provided
1747
+ - Acceleration disabled
1748
+
1749
+ Returns:
1750
+ Polars DataFrame
1751
+
1752
+ Note:
1753
+ Uses pl.from_dicts() which handles nested documents by creating
1754
+ struct columns. For flattened column names like pandas json_normalize,
1755
+ you would need to unnest() afterwards.
1756
+ """
1757
+ # Collect all documents - __iter__ will set _started
1758
+ docs = list(self)
1759
+
1760
+ if not docs:
1761
+ return pl.DataFrame()
1762
+
1763
+ return pl.from_dicts(docs)
1764
+
1765
+ def _to_dataframe_accelerated(
1766
+ self,
1767
+ cache_read: bool,
1768
+ cache_write: bool,
1769
+ start_date: Optional[datetime] = None,
1770
+ end_date: Optional[datetime] = None,
1771
+ coerce: Literal["raise", "error"] = "raise",
1772
+ max_workers: int = 4,
1773
+ chunking_granularity: Optional[timedelta] = None,
1774
+ is_chunkable: bool = True,
1775
+ flush_ram_limit_mb: int = 512,
1776
+ row_group_size: Optional[int] = None,
1777
+ ) -> pd.DataFrame:
1778
+ """
1779
+ Convert to DataFrame using parallel execution with Parquet caching.
1780
+
1781
+ ┌─────────────────────────────────────────────────────────────────────┐
1782
+ │ DATA FLOW - ACCELERATED EXECUTION: │
1783
+ │ │
1784
+ │ This is where the XLR8 magic happens. The flow is: │
1785
+ │ │
1786
+ │ 1. CACHE CHECK │
1787
+ │ Input: self._filter hashed to "abc123def" │
1788
+ │ Check: Does .cache/abc123def/*.parquet exist? │
1789
+ │ If yes -> Read directly from Parquet (instant!) │
1790
+ │ │
1791
+ │ 2. CACHE MISS -> PARALLEL FETCH (if chunkable) │
1792
+ │ Calls: execute_parallel_stream_to_cache() │
1793
+ │ Which does: │
1794
+ │ a) Build brackets from query (analysis/brackets.py) │
1795
+ │ Query -> [Bracket(static_filter, time_range), ...] │
1796
+ │ b) Plan execution (execution/planner.py) │
1797
+ │ Time range + RAM -> workers=N, batch_size=M │
1798
+ │ c) Chunk time ranges (analysis/chunker.py) │
1799
+ │ 6 months -> X chunks based on granularity │
1800
+ │ d) Parallel fetch (Rust backend fetch_chunks_bson) │
1801
+ │ N async workers pull chunks from queue │
1802
+ │ e) Stream to Parquet (Rust backend) │
1803
+ │ Each worker writes part files: part_0000.parquet, etc. │
1804
+ │ │
1805
+ │ 2b. CACHE MISS -> SINGLE-WORKER FETCH (if not chunkable) │
1806
+ │ - Single worker fetches all data │
1807
+ │ - No async, no chunking │
1808
+ │ - Still writes to Parquet for caching │
1809
+ │ │
1810
+ │ 3. READ FROM CACHE │
1811
+ │ After fetch, read the Parquet files we just wrote │
1812
+ │ Optionally filter by start_date/end_date │
1813
+ │ Returns: pandas DataFrame with original values │
1814
+ │ │
1815
+ │ EXAMPLE TIMING (500K docs): │
1816
+ │ - Cache hit: 0.5s (read Parquet) │
1817
+ │ - Cache miss: 10-15s (parallel fetch + write + read) │
1818
+ │ - Without XLR8: 30-40s (sequential cursor iteration) │
1819
+ └─────────────────────────────────────────────────────────────────────┘
1820
+
1821
+ Args:
1822
+ cache_read: Read from cache if available
1823
+ cache_write: Write to cache after fetching
1824
+ start_date: Filter cached data from this date (inclusive, tz-aware)
1825
+ end_date: Filter cached data until this date (exclusive, tz-aware)
1826
+ coerce: Error handling mode ("raise" or "error")
1827
+ max_workers: Maximum parallel workers (passed from to_dataframe)
1828
+ chunking_granularity: Time granularity for chunking (passed from to_dataframe)
1829
+ is_chunkable: Whether query is chunkable (determines parallel vs single-worker)
+ flush_ram_limit_mb: Peak RAM budget (MB) before worker output is flushed to Parquet
+ row_group_size: Optional Parquet row group size passed through to the cache writer
1830
+
1831
+ Returns:
1832
+ Pandas DataFrame with accelerated query results
1833
+ """
1834
+ schema = self._collection.schema
1835
+ time_field = schema.time_field
1836
+
1837
+ # Mark as started
1838
+ if not self._started:
1839
+ self._started = True
1840
+
1841
+ # ─────────────────────────────────────────────────────────────────────
1842
+ # STEP 1: Create cache manager (hashes query to unique directory)
1843
+ # Example: filter_dict hashes to "abc123def" -> .cache/abc123def/
1844
+ # ─────────────────────────────────────────────────────────────────────
1845
+ cache = CacheManager(
1846
+ filter_dict=self._filter,
1847
+ projection=self._projection,
1848
+ sort=self._sort,
1849
+ )
1850
+
1851
+ # ─────────────────────────────────────────────────────────────────────
1852
+ # STEP 2: Check cache - if hit, read directly from Parquet
1853
+ # Example: .cache/abc123def/ts_1704067200_1704070800_part_0000.parquet
1854
+ # ─────────────────────────────────────────────────────────────────────
1855
+ if cache_read and cache.exists():
1856
+ logging.debug(f"[Cache] Reading from cache: {cache.cache_dir}")
1857
+ reader = ParquetReader(cache.cache_dir)
1858
+
1859
+ # Check if we need DuckDB sorting (Any types or List types)
1860
+ need_duckdb_sort = False
1861
+ sort_infos: List[Dict[str, Any]] = []
1862
+ if self._sort:
1863
+ sort_infos = get_sort_field_info(self._sort, schema)
1864
+
1865
+ # Expand parent fields to children and collect all fields to check
1866
+ fields_to_check = []
1867
+ for info in sort_infos:
1868
+ if info["is_parent"]:
1869
+ # Parent field - check all children
1870
+ fields_to_check.extend(info["child_fields"])
1871
+ else:
1872
+ # Direct field
1873
+ fields_to_check.append(info["field"])
1874
+
1875
+ # Check if any of the actual sort fields (after expansion) are Any/List types
1876
+ for field in fields_to_check:
1877
+ if field in schema.fields:
1878
+ field_type = schema.fields[field]
1879
+ if isinstance(field_type, (AnyType, ListType)):
1880
+ need_duckdb_sort = True
1881
+ break
1882
+
1883
+ if self._sort and need_duckdb_sort:
1884
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
1885
+ logging.debug(
1886
+ "[Sort] Using DuckDB for Types.Any()/Types.List() sorting..."
1887
+ )
1888
+ df = cast(
1889
+ pd.DataFrame,
1890
+ reader.get_globally_sorted_dataframe(
1891
+ sort_spec=self._sort,
1892
+ schema=schema,
1893
+ time_field=time_field,
1894
+ start_date=start_date,
1895
+ end_date=end_date,
1896
+ coerce=coerce,
1897
+ memory_limit_mb=flush_ram_limit_mb,
1898
+ threads=max_workers,
1899
+ ),
1900
+ )
1901
+ else:
1902
+ # Normal read + native pandas sort
1903
+ df = cast(
1904
+ pd.DataFrame,
1905
+ reader.to_dataframe(
1906
+ engine="pandas",
1907
+ schema=schema,
1908
+ time_field=time_field,
1909
+ start_date=start_date,
1910
+ end_date=end_date,
1911
+ coerce=coerce,
1912
+ ),
1913
+ )
1914
+
1915
+ # Native pandas sort - expand parent fields to children
1916
+ if self._sort:
1917
+ expanded_sort = []
1918
+ for info in sort_infos:
1919
+ if info["is_parent"]:
1920
+ for child in info["child_fields"]:
1921
+ expanded_sort.append((child, info["direction"]))
1922
+ else:
1923
+ expanded_sort.append((info["field"], info["direction"]))
1924
+
1925
+ sort_fields = [
1926
+ field for field, _ in expanded_sort if field in df.columns
1927
+ ]
1928
+ ascending = [
1929
+ direction == 1
1930
+ for field, direction in expanded_sort
1931
+ if field in df.columns
1932
+ ]
1933
+ if sort_fields:
1934
+ df = df.sort_values(
1935
+ by=sort_fields, ascending=ascending, na_position="last"
1936
+ )
1937
+ logger.debug("Sorted DataFrame by %s", sort_fields)
1938
+
1939
+ # Apply skip/limit if set
1940
+ if self._skip:
1941
+ df = df.iloc[self._skip :]
1942
+ if self._limit:
1943
+ df = df.iloc[: self._limit]
1944
+
1945
+ filter_info = ""
1946
+ if start_date or end_date:
1947
+ filter_info = f" (filtered: {start_date} to {end_date})"
1948
+ logging.debug(
1949
+ f"[OK] Loaded {len(df):,} documents from cache{filter_info} ({reader.get_statistics()['total_size_mb']:.1f} MB)"
1950
+ )
1951
+ return cast(pd.DataFrame, df)
1952
+
1953
+ # ─────────────────────────────────────────────────────────────────────
1954
+ # STEP 3: Cache miss - execute fetch and stream to Parquet
1955
+ # This is where the heavy lifting happens
1956
+ # ─────────────────────────────────────────────────────────────────────
1957
+ mode_str = "parallel" if is_chunkable else "single-worker"
1958
+ logging.debug(
1959
+ f"[Query] Cache miss - fetching from MongoDB ({mode_str} mode)..."
1960
+ )
1961
+
1962
+ if cache_write:
1963
+ # CRITICAL: If cache_read=False but cache_write=True and cache exists,
1964
+ # we need to clear the old cache first to avoid duplicate data
1965
+ if not cache_read and cache.exists():
1966
+ logging.debug(
1967
+ "Clearing existing cache (cache_read=False, starting fresh)..."
1968
+ )
1969
+ cache.clean()
1970
+ # chunking_granularity is passed from to_dataframe()
1971
+ # If None, execute_parallel_stream_to_cache will use single-worker mode
1972
+
1973
+ # Streaming path: fetch -> encode -> write Parquet (memory efficient)
1974
+ result = execute_parallel_stream_to_cache(
1975
+ pymongo_collection=self._collection.pymongo_collection,
1976
+ filter_dict=self._filter,
1977
+ schema=schema,
1978
+ cache_manager=cache,
1979
+ projection=self._projection,
1980
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
1981
+ max_workers=max_workers, # From to_dataframe() parameter
1982
+ peak_ram_limit_mb=flush_ram_limit_mb,
1983
+ chunking_granularity=chunking_granularity, # None = single-worker mode
1984
+ mongo_uri=self._collection.mongo_uri,
1985
+ row_group_size=row_group_size,
1986
+ )
1987
+
1988
+ logging.debug("\n[Cache] Cache written:")
1989
+ logging.debug(f" - Total docs: {result['total_docs']:,}")
1990
+ logging.debug(f" - Total files: {result['total_files']}")
1991
+ logging.debug(f" - Workers: {result['workers']}")
1992
+ logging.debug(f" - Duration: {result['duration_s']:.2f}s")
1993
+ logging.debug(f" - Cache dir: {cache.cache_dir}")
1994
+
1995
+ # Now read from cache to build DataFrame (with optional date filter)
1996
+ logging.debug("\n[Cache] Reading from cache to build DataFrame...")
1997
+ reader = ParquetReader(cache.cache_dir)
1998
+
1999
+ # Check if we need DuckDB sorting (Any types or List types)
2000
+ need_duckdb_sort = False
2001
+ sort_infos: List[Dict[str, Any]] = []
2002
+ if self._sort:
2003
+ sort_infos = get_sort_field_info(self._sort, schema)
2004
+
2005
+ # Expand parent fields to children and collect all fields to check
2006
+ fields_to_check = []
2007
+ for info in sort_infos:
2008
+ if info["is_parent"]:
2009
+ # Parent field - check all children
2010
+ fields_to_check.extend(info["child_fields"])
2011
+ else:
2012
+ # Direct field
2013
+ fields_to_check.append(info["field"])
2014
+
2015
+ # Check if any of the actual sort fields (after expansion) are Any/List types
2016
+ for field in fields_to_check:
2017
+ if field in schema.fields:
2018
+ field_type = schema.fields[field]
2019
+ if isinstance(field_type, (AnyType, ListType)):
2020
+ need_duckdb_sort = True
2021
+ break
2022
+
2023
+ if self._sort and need_duckdb_sort:
2024
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
2025
+ logging.debug(
2026
+ "[Sort] Using DuckDB for Types.Any()/Types.List() sorting..."
2027
+ )
2028
+ df = cast(
2029
+ pd.DataFrame,
2030
+ reader.get_globally_sorted_dataframe(
2031
+ sort_spec=self._sort,
2032
+ schema=schema,
2033
+ time_field=time_field,
2034
+ start_date=start_date,
2035
+ end_date=end_date,
2036
+ coerce=coerce,
2037
+ memory_limit_mb=flush_ram_limit_mb,
2038
+ threads=max_workers,
2039
+ ),
2040
+ )
2041
+ else:
2042
+ # Normal read + native pandas sort
2043
+ df = cast(
2044
+ pd.DataFrame,
2045
+ reader.to_dataframe(
2046
+ engine="pandas",
2047
+ schema=schema,
2048
+ time_field=time_field,
2049
+ start_date=start_date,
2050
+ end_date=end_date,
2051
+ coerce=coerce,
2052
+ ),
2053
+ )
2054
+
2055
+ # Native pandas sort - expand parent fields to children
2056
+ if self._sort:
2057
+ expanded_sort = []
2058
+ for info in sort_infos:
2059
+ if info["is_parent"]:
2060
+ for child in info["child_fields"]:
2061
+ expanded_sort.append((child, info["direction"]))
2062
+ else:
2063
+ expanded_sort.append((info["field"], info["direction"]))
2064
+
2065
+ sort_fields = [
2066
+ field for field, _ in expanded_sort if field in df.columns
2067
+ ]
2068
+ ascending = [
2069
+ direction == 1
2070
+ for field, direction in expanded_sort
2071
+ if field in df.columns
2072
+ ]
2073
+ if sort_fields:
2074
+ df = df.sort_values(
2075
+ by=sort_fields, ascending=ascending, na_position="last"
2076
+ )
2077
+ logger.debug("Sorted DataFrame by %s", sort_fields)
2078
+
2079
+ else:
2080
+ # cache_write=False is not supported on a cache miss (any mode)
2081
+ # Always write to cache for consistency and performance
2082
+ raise ValueError(
2083
+ "cache_write=False is not supported. "
2084
+ "XLR8 always writes to Parquet cache for memory efficiency. "
2085
+ "Set cache_read=False if you don't want to read from existing cache."
2086
+ )
2087
+
2088
+ # Apply skip/limit if set
2089
+ if self._skip:
2090
+ df = df.iloc[self._skip :]
2091
+ if self._limit:
2092
+ df = df.iloc[: self._limit]
2093
+
2094
+ return cast(pd.DataFrame, df)
2095
+
2096
+ def explain_acceleration(self) -> Dict[str, Any]:
2097
+ """
2098
+ Get query execution plan.
2099
+
2100
+ Returns explanation of how query will be executed:
2101
+ - Whether acceleration is possible
2102
+ - Time bounds extracted
2103
+ - Estimated chunk count
2104
+ - Execution mode (reject / single / parallel)
2105
+
2106
+ Returns:
2107
+ Dict with execution plan details
2108
+ """
2109
+ schema = self._collection.schema
2110
+
2111
+ result: Dict[str, Any] = {
2112
+ "filter": self._filter,
2113
+ "projection": self._projection,
2114
+ "skip": self._skip,
2115
+ "limit": self._limit,
2116
+ "sort": self._sort,
2117
+ "accelerated": False,
2118
+ }
2119
+
2120
+ if schema is None:
2121
+ result["reason"] = "No schema provided"
2122
+ return result
2123
+
2124
+ # build_brackets_for_find internally validates via is_chunkable_query
2125
+ is_chunkable, reason, brackets, bounds = build_brackets_for_find(
2126
+ self._filter,
2127
+ schema.time_field,
2128
+ self._sort, # Pass sort spec for $natural detection
2129
+ )
2130
+
2131
+ result["is_chunkable"] = is_chunkable
2132
+ result["reason"] = reason
2133
+
2134
+ # Distinguish REJECT vs SINGLE modes
2135
+ if not is_chunkable:
2136
+ # REJECT mode
2137
+ result["mode"] = "reject"
2138
+ elif is_chunkable and not brackets:
2139
+ # SINGLE mode - valid but not parallelizable
2140
+ result["mode"] = "single"
2141
+ else:
2142
+ # PARALLEL mode
2143
+ result["mode"] = "parallel"
2144
+
2145
+ if is_chunkable and bounds and bounds[0] and bounds[1]:
2146
+ start_bound = bounds[0]
2147
+ end_bound = bounds[1]
2148
+
2149
+ result["time_bounds"] = {
2150
+ "start": start_bound.isoformat(),
2151
+ "end": end_bound.isoformat(),
2152
+ }
2153
+
2154
+ chunks = chunk_time_range(
2155
+ start_bound, end_bound, chunk_size=timedelta(days=1)
2156
+ )
2157
+ result["estimated_chunks"] = len(chunks)
2158
+
2159
+ result["accelerated"] = True
2160
+
2161
+ return result
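+ # Illustrative plan for a chunkable 6-month query (values hypothetical):
+ # {
+ # "filter": {...}, "projection": None, "skip": 0, "limit": 0, "sort": None,
+ # "is_chunkable": True, "reason": "...", "mode": "parallel",
+ # "time_bounds": {"start": "2024-01-01T00:00:00+00:00",
+ # "end": "2024-07-01T00:00:00+00:00"},
+ # "estimated_chunks": 182, # ~182 one-day chunks between the bounds
+ # "accelerated": True,
+ # }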