xlr8-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2145 @@
1
+ """
2
+ XLR8 cursor with PyMongo compatibility.
3
+
4
+ ================================================================================
5
+ DATA FLOW - CURSOR (WHERE ACCELERATION HAPPENS)
6
+ ================================================================================
7
+
8
+ This module is where the acceleration decision is made. When the user calls cursor.to_dataframe(),
9
+ we decide whether to:
10
+ A) Use regular PyMongo iteration (slow)
11
+ B) Use accelerated parallel fetch + Parquet caching (fast)
12
+
13
+ DECISION FLOW:
14
+ ────────────────────────────────────────────────────────────────────────────────
15
+
16
+ cursor.to_dataframe() called
17
+
18
+
19
+ ┌─────────────────────────────┐
20
+ │ Is schema provided? │─── No ──▶ raise ValueError (schema required)
21
+ └─────────────────────────────┘
22
+ │ Yes
23
+
24
+ ┌─────────────────────────────┐
25
+ │ Is query chunkable? │─── No ──▶ SINGLE-WORKER PATH (still writes Parquet)
26
+ │ (has time range, no │ (e.g., has $where or nested $or)
27
+ │ forbidden operators) │
28
+ └─────────────────────────────┘
29
+ │ Yes
30
+
31
+ ┌─────────────────────────────┐
32
+ │ Is data in cache? │─── Yes ─▶ READ FROM CACHE
33
+ │ (.cache/{query_hash}/*.parquet) (instant, ~100ms for 1M rows)
34
+ └─────────────────────────────┘
35
+ │ No
36
+
37
+ ┌─────────────────────────────┐
38
+ │ ACCELERATED PATH: │
39
+ │ 1. Build brackets │ ← analysis/brackets.py
40
+ │ 2. Plan execution │ ← execution/planner.py
41
+ │ 3. Chunk time ranges │ ← analysis/chunker.py
42
+ │ 4. Parallel async fetch │ ← Rust backend (fetch_chunks_bson)
43
+ │ 5. Stream to Parquet │ ← Rust backend writes shards
44
+ │ 6. Read back DataFrame │ ← storage/reader.py
45
+ └─────────────────────────────┘
46
+
47
+ EXAMPLE DATA TRANSFORMATIONS:
48
+ ────────────────────────────────────────────────────────────────────────────────
49
+
50
+ 1. INPUT QUERY (from user):
51
+ {
52
+ "$or": [
53
+ {"metadata.sensor_id": ObjectId("64a...")},
54
+ {"metadata.sensor_id": ObjectId("64b...")},
55
+ ],
56
+ "timestamp": {"$gte": datetime(2024, 1, 1), "$lt": datetime(2024, 7, 1)}
57
+ }
58
+
59
+ 2. AFTER BRACKET ANALYSIS (brackets.py):
60
+ [
61
+ Bracket(static={"metadata.sensor_id": "64a..."}, time=Jan-Jul),
62
+ Bracket(static={"metadata.sensor_id": "64b..."}, time=Jan-Jul),
63
+ ]
64
+
65
+ 3. AFTER CHUNKING (for each bracket):
66
+ Bracket 1 -> 13 chunks (14 days each for 6 months)
67
+ Bracket 2 -> 13 chunks
68
+ Total: 26 work items in queue
69
+
70
+ 4. PARALLEL FETCH (10 workers):
71
+ Worker 0: Chunk 1 -> 45,000 docs, write to part_0000.parquet
72
+ Worker 1: Chunk 2 -> 52,000 docs, write to part_0001.parquet
73
+ ...
74
+ Worker 9: Chunk 10 -> 38,000 docs, write to part_0009.parquet
75
+ (Rust async workers pull chunks as they finish)
76
+
77
+ 5. OUTPUT (DataFrame):
78
+ pandas.DataFrame with columns: [timestamp, metadata.device_id, value, ...]
79
+ 500,000 rows loaded from Parquet in ~0.5s
80
+
81
+ ================================================================================
82
+ """
83
+
84
+ from __future__ import annotations
85
+
86
+ from typing import (
87
+ Any,
88
+ Callable,
89
+ Dict,
90
+ List,
91
+ Optional,
92
+ Union,
93
+ Iterator,
94
+ Literal,
95
+ Generator,
96
+ cast,
97
+ )
98
+ from datetime import datetime, date, timezone, timedelta
99
+ import logging
100
+ import warnings
101
+ import pandas as pd
102
+ import time
103
+ import pyarrow as pa
104
+ import polars as pl
105
+
106
+ logger = logging.getLogger(__name__)
107
+
108
+ # Import after logger to avoid circular imports
109
+ from xlr8.constants import DEFAULT_BATCH_SIZE
110
+ from xlr8.execution.callback import execute_partitioned_callback
111
+ from xlr8.analysis import (
112
+ build_brackets_for_find,
113
+ chunk_time_range,
114
+ get_sort_field_info,
115
+ validate_sort_field,
116
+ )
117
+ from xlr8.schema.types import Any as AnyType, List as ListType
118
+ from xlr8.storage import CacheManager, ParquetReader
119
+ from xlr8.execution import execute_parallel_stream_to_cache
120
+
121
+
122
+ def parse_datetime_tz_aware(
123
+ value: Union[datetime, date, str, None],
124
+ param_name: str = "date",
125
+ ) -> Optional[datetime]:
126
+ """
127
+ Parse a date/datetime value to a timezone-aware datetime.
128
+
129
+ Accepts:
130
+ - datetime (must be tz-aware or will assume UTC)
131
+ - date (converted to midnight UTC)
132
+ - ISO format string with timezone (e.g., "2024-01-15T10:30:00Z", "2024-01-15T10:30:00+00:00")
133
+
134
+ Args:
135
+ value: The date value to parse
136
+ param_name: Name of parameter for error messages
137
+
138
+ Returns:
139
+ Timezone-aware datetime or None if value is None
140
+
141
+ Raises:
142
+ ValueError: If string is not a valid ISO format or missing timezone
143
+ """
144
+ if value is None:
145
+ return None
146
+
147
+ if isinstance(value, datetime):
148
+ if value.tzinfo is None:
149
+ # Assume UTC for naive datetimes
150
+ return value.replace(tzinfo=timezone.utc)
151
+ return value
152
+
153
+ if isinstance(value, date):
154
+ # Convert date to midnight UTC
155
+ return datetime(value.year, value.month, value.day, tzinfo=timezone.utc)
156
+
157
+ if isinstance(value, str):
158
+ # Try parsing ISO format
159
+ try:
160
+ # Python 3.11+ has datetime.fromisoformat with better Z support
161
+ # For compatibility, handle Z suffix manually
162
+ if value.endswith("Z"):
163
+ value = value[:-1] + "+00:00"
164
+
165
+ dt = datetime.fromisoformat(value)
166
+
167
+ if dt.tzinfo is None:
168
+ raise ValueError(
169
+ f"{param_name}: Timezone-aware datetime required. "
170
+ f"Got '{value}' without timezone. "
171
+ f"Use ISO format with timezone like '2024-01-15T10:30:00Z' or '2024-01-15T10:30:00+00:00'"
172
+ )
173
+ return dt
174
+ except ValueError as e:
175
+ if "Timezone-aware" in str(e):
176
+ raise
177
+ raise ValueError(
178
+ f"{param_name}: Invalid datetime string '{value}'. "
179
+ f"Use ISO format with timezone like '2024-01-15T10:30:00Z' or '2024-01-15T10:30:00+00:00'"
180
+ ) from e
181
+
182
+ raise TypeError(
183
+ f"{param_name}: Expected datetime, date, or ISO string, got {type(value).__name__}"
184
+ )
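
A few illustrative calls matching the cases documented above (results shown as comments):

    from datetime import date, datetime

    parse_datetime_tz_aware("2024-01-15T10:30:00Z")        # -> 2024-01-15 10:30:00+00:00
    parse_datetime_tz_aware("2024-01-15T10:30:00+02:00")   # -> keeps the +02:00 offset
    parse_datetime_tz_aware(date(2024, 1, 15))             # -> midnight UTC
    parse_datetime_tz_aware(datetime(2024, 1, 15))         # naive -> assumed UTC
    parse_datetime_tz_aware("2024-01-15T10:30:00")         # raises ValueError (no timezone)
    parse_datetime_tz_aware(None)                          # -> None
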
185
+
186
+
187
+ class XLR8Cursor:
188
+ """
189
+ PyMongo-compatible cursor with acceleration support.
190
+
191
+ Acts as drop-in replacement for pymongo.cursor.Cursor but can
192
+ accelerate queries through parallel execution and Parquet caching.
193
+
194
+ Key differences from PyMongo:
195
+ - to_dataframe() / to_polars() for efficient DataFrame conversion
196
+ - Transparent acceleration when query is chunkable
197
+ - Maintains full PyMongo API compatibility for iteration
198
+
199
+ Example:
200
+ >>> cursor = collection.find({"timestamp": {"$gte": start, "$lt": end}})
201
+ >>> df = cursor.to_dataframe() # Accelerated execution
202
+ >>>
203
+ >>> # Or use like regular PyMongo cursor:
204
+ >>> for doc in cursor:
205
+ ... print(doc)
206
+ """
207
+
208
+ def __init__(
209
+ self,
210
+ collection: Any, # XLR8Collection
211
+ query_filter: Dict[str, Any],
212
+ projection: Optional[Dict[str, Any]] = None,
213
+ skip: int = 0,
214
+ limit: int = 0,
215
+ sort: Optional[List[tuple]] = None,
216
+ batch_size: int = 1000,
217
+ ):
218
+ """
219
+ Initialize cursor.
220
+
221
+ Args:
222
+ collection: Parent XLR8Collection
223
+ query_filter: Query filter dict
224
+ projection: Field projection dict
225
+ skip: Number of documents to skip
226
+ limit: Maximum documents to return (0 = unlimited)
227
+ sort: List of (field, direction) tuples
228
+ batch_size: Batch size for iteration
229
+ """
230
+ self._collection = collection
231
+ self._filter = query_filter
232
+ self._projection = projection
233
+ self._skip = skip
234
+ self._limit = limit
235
+ self._sort = sort
236
+ self._batch_size = batch_size
237
+
238
+ # Iteration state
239
+ self._started = False
240
+ self._pymongo_cursor: Optional[Any] = None
241
+ self._exhausted = False
242
+
243
+ def __iter__(self) -> Iterator[Dict[str, Any]]:
244
+ """Iterate over documents."""
245
+ if not self._started:
246
+ self._started = True
247
+ # Create actual PyMongo cursor for iteration
248
+ self._ensure_pymongo_cursor()
249
+
250
+ if self._pymongo_cursor is None:
251
+ return iter([])
252
+
253
+ return iter(self._pymongo_cursor)
254
+
255
+ def __next__(self) -> Dict[str, Any]:
256
+ """Get next document."""
257
+ if not self._started:
258
+ self.__iter__()
259
+
260
+ if self._pymongo_cursor is None:
261
+ raise StopIteration
262
+
263
+ return next(self._pymongo_cursor)
264
+
265
+ def _ensure_pymongo_cursor(self) -> None:
266
+ """Lazily create PyMongo cursor only when needed for iteration/delegation."""
267
+ if self._pymongo_cursor is None:
268
+ self._pymongo_cursor = self._collection.pymongo_collection.find(
269
+ filter=self._filter,
270
+ projection=self._projection,
271
+ skip=self._skip,
272
+ limit=self._limit,
273
+ sort=self._sort,
274
+ batch_size=self._batch_size,
275
+ )
276
+
277
+ def raw_cursor(self):
278
+ """
279
+ Get direct access to underlying PyMongo cursor.
280
+
281
+ This is an escape hatch for power users who need access to PyMongo cursor
282
+ methods not explicitly implemented in XLR8Cursor.
283
+
284
+ Returns:
285
+ pymongo.cursor.Cursor: The underlying PyMongo cursor
286
+
287
+ Example:
288
+ >>> cursor = collection.find(...)
289
+ >>> cursor.raw_cursor().comment("my query").max_time_ms(5000)
290
+ """
291
+ self._ensure_pymongo_cursor()
292
+ return self._pymongo_cursor
293
+
294
+ def __getattr__(self, name: str) -> Any:
295
+ """
296
+ Delegate unknown attributes to underlying PyMongo cursor.
297
+
298
+ This provides transparent access to all PyMongo cursor methods while
299
+ preserving XLR8's accelerated methods.
300
+
301
+ Note: PyMongo cursor is created lazily only when delegation is needed.
302
+ For explicit access, use .raw_cursor()
303
+ """
304
+ # Avoid infinite recursion
305
+ if name.startswith("_"):
306
+ raise AttributeError(
307
+ f"'{type(self).__name__}' object has no attribute '{name}'"
308
+ )
309
+
310
+ # Create PyMongo cursor if needed
311
+ self._ensure_pymongo_cursor()
312
+
313
+ # Get attribute from PyMongo cursor
314
+ attr = getattr(self._pymongo_cursor, name)
315
+
316
+ # If it's a method that returns cursor, wrap the result
317
+ if callable(attr):
318
+
319
+ def wrapper(*args, **kwargs):
320
+ result = attr(*args, **kwargs)
321
+ # If PyMongo method returns cursor, it returns self (the PyMongo cursor)
322
+ # We want to return our wrapper instead
323
+ if result is self._pymongo_cursor:
324
+ return self
325
+ return result
326
+
327
+ return wrapper
328
+
329
+ return attr
330
+
331
+ def __enter__(self):
332
+ """Context manager entry."""
333
+ return self
334
+
335
+ def __exit__(self, exc_type, exc_val, exc_tb):
336
+ """Context manager exit."""
337
+ self.close()
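
A short sketch of the delegation and context-manager behavior described above (the query is illustrative; max_time_ms() and comment() are standard PyMongo cursor methods reached via __getattr__):

    with collection.find({"status": "active"}) as cursor:
        # Unknown attributes fall through to the underlying PyMongo cursor; methods
        # that return the PyMongo cursor come back as this wrapper, so chaining works.
        cursor.max_time_ms(5000).comment("xlr8 delegated call")
        for doc in cursor:
            ...
    # __exit__ closes the underlying PyMongo cursor.
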
338
+
339
+ # PyMongo compatibility methods
340
+
341
+ def skip(self, count: int) -> "XLR8Cursor":
342
+ """
343
+ Skip documents.
344
+
345
+ Args:
346
+ count: Number of documents to skip
347
+
348
+ Returns:
349
+ Self for chaining
350
+ """
351
+ if self._started:
352
+ raise RuntimeError("Cannot modify cursor after iteration started")
353
+
354
+ self._skip = count
355
+ return self
356
+
357
+ def limit(self, count: int) -> "XLR8Cursor":
358
+ """
359
+ Limit result count.
360
+
361
+ Args:
362
+ count: Maximum documents to return
363
+
364
+ Returns:
365
+ Self for chaining
366
+ """
367
+ if self._started:
368
+ raise RuntimeError("Cannot modify cursor after iteration started")
369
+
370
+ self._limit = count
371
+ return self
372
+
373
+ def sort(
374
+ self, key_or_list: Union[str, List[tuple]], direction: int = 1
375
+ ) -> "XLR8Cursor":
376
+ """
377
+ Sort results.
378
+
379
+ Automatically adds _id as final tie-breaker for deterministic ordering
380
+ (matching MongoDB's behavior).
381
+
382
+ Args:
383
+ key_or_list: Field name or list of (field, direction) tuples
384
+ direction: Sort direction (1=ascending, -1=descending)
385
+
386
+ Returns:
387
+ Self for chaining
388
+ """
389
+ if self._started:
390
+ raise RuntimeError("Cannot modify cursor after iteration started")
391
+
392
+ if isinstance(key_or_list, str):
393
+ self._sort = [(key_or_list, direction)]
394
+ else:
395
+ self._sort = key_or_list
396
+
397
+ return self
398
+
399
+ def batch_size(self, size: int) -> "XLR8Cursor":
400
+ """
401
+ Set batch size for iteration.
402
+
403
+ Args:
404
+ size: Batch size
405
+
406
+ Returns:
407
+ Self for chaining
408
+ """
409
+ if self._started:
410
+ raise RuntimeError("Cannot modify cursor after iteration started")
411
+
412
+ self._batch_size = size
413
+ return self
414
+
415
+ def close(self) -> None:
416
+ """Close cursor and free resources."""
417
+ if self._pymongo_cursor is not None:
418
+ self._pymongo_cursor.close()
419
+ self._pymongo_cursor = None
420
+ self._exhausted = True
421
+
422
+ # count() and distinct() removed - use __getattr__ delegation to PyMongo
423
+ # They remain reachable as cursor.distinct() (and cursor.count() on PyMongo < 4.0,
424
+ # where Cursor.count() still exists); __getattr__ forwards them to the PyMongo cursor
425
+
426
+ # XLR8-specific acceleration methods
427
+
428
+ def to_dataframe(
429
+ self,
430
+ accelerate: bool = True,
431
+ cache_read: bool = True,
432
+ cache_write: bool = True,
433
+ start_date: Optional[Union[datetime, date, str]] = None,
434
+ end_date: Optional[Union[datetime, date, str]] = None,
435
+ coerce: Literal["raise", "error"] = "raise",
436
+ max_workers: int = 4,
437
+ chunking_granularity: Optional[timedelta] = None,
438
+ row_group_size: Optional[int] = None,
439
+ flush_ram_limit_mb: int = 512,
440
+ ) -> pd.DataFrame:
441
+ """
442
+ Convert results to Pandas DataFrame with optional acceleration.
443
+
444
+ This is the main acceleration entry point. If the query is chunkable
445
+ and acceleration is enabled, uses parallel execution and Parquet caching
446
+ for 2-5x speedup on large result sets.
447
+
448
+ ┌─────────────────────────────────────────────────────────────────────┐
449
+ │ DATA FLOW - ACCELERATION DECISION: │
450
+ │ │
451
+ │ INPUT: self._filter (the MongoDB query) │
452
+ │ Example: { │
453
+ │ "timestamp": {"$gte": datetime(2024,1,1), "$lt": datetime(...)},│
454
+ │ "$or": [{"metadata.sensor_id": ObjectId("64a...")}] │
455
+ │ } │
456
+ │ │
457
+ │ DECISION STEPS: │
458
+ │ 1. Check if schema exists -> No: raise error (schema required)│
459
+ │ 2. Check if query is chunkable -> No: single-worker, still Parquet│
460
+ │ (is_chunkable_query checks for time bounds, forbidden ops) │
461
+ │ 3. If chunkable: use parallel workers based on time span │
462
+ │ │
463
+ │ OUTPUT: pandas.DataFrame with columns from schema │
464
+ │ Example columns: [timestamp, metadata.device_id, value] │
465
+ │ │
466
+ │ PERFORMANCE: │
467
+ │ - Regular path: ~30s for 500K docs (sequential cursor iteration) │
468
+ │ - Accelerated path: ~10s for 500K docs (parallel + caching) │
469
+ │ - Cache hit: ~0.5s for 500K docs (read from Parquet) │
470
+ └─────────────────────────────────────────────────────────────────────┘
471
+
472
+ Args:
473
+ accelerate: Enable acceleration if query is chunkable
474
+ cache_read: Read from Parquet cache if available
475
+ cache_write: Write results to Parquet cache
476
+ start_date: Filter cached data from this date (inclusive).
477
+ Accepts datetime, date, or ISO string with timezone.
478
+ Example: "2024-01-15T00:00:00Z" or datetime with tzinfo
479
+ end_date: Filter cached data until this date (exclusive).
480
+ Accepts datetime, date, or ISO string with timezone.
481
+ coerce: Error handling mode:
482
+ - "raise": Raise exceptions on schema validation errors (default)
483
+ - "error": Log errors and store None for invalid values
484
+ max_workers: Maximum parallel workers (default: 4). More workers use
485
+ more RAM but process faster. Set to 1 for single-threaded.
486
+ Only used when chunking_granularity is provided.
487
+ chunking_granularity: Time granularity for chunking the query.
488
+ Example: timedelta(days=1) chunks by day, timedelta(hours=1) by hour.
489
+ REQUIRED for parallel execution - determines chunk boundaries.
490
+ If None, single-worker mode is used (no parallelization).
491
+ row_group_size: Rows per Parquet row group. If None, Rust default is used.
492
+ flush_ram_limit_mb: RAM limit in MB for buffered data before flushing to
493
+ Parquet. Higher values mean fewer files but more memory usage.
494
+ (default: 512)
495
+
496
+ Returns:
497
+ Pandas DataFrame with results
498
+
499
+ Raises:
500
+ ValueError: If no schema is provided (schema is required for acceleration)
501
+ ValueError: If date strings are not timezone-aware
502
+
503
+ Example:
504
+ >>> cursor = collection.find({
505
+ ... "timestamp": {"$gte": start, "$lt": end},
506
+ ... "status": "active"
507
+ ... })
508
+ >>> df = cursor.to_dataframe() # Accelerated automatically
509
+ >>>
510
+ """
511
+ # Schema is required for acceleration
512
+ schema = self._collection.schema
513
+ if schema is None:
514
+ raise ValueError(
515
+ "Schema is required for to_dataframe(). "
516
+ "Provide a schema when creating the collection: "
517
+ "xlr8_collection = xlr8.wrap(collection, schema=my_schema)"
518
+ )
519
+
520
+ # CRITICAL: Validate projection doesn't exclude required fields
521
+ if self._projection:
522
+ # Check if projection is inclusion (has 1 values) or exclusion (has 0 values)
523
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
524
+ is_inclusion = any(v == 1 for v in projection_values)
525
+
526
+ # Time field must be included (required for all operations)
527
+ if is_inclusion:
528
+ time_in_projection = (
529
+ schema.time_field in self._projection
530
+ and self._projection[schema.time_field] == 1
531
+ )
532
+ if not time_in_projection:
533
+ raise ValueError(
534
+ f"Projection must include time field '{schema.time_field}'. "
535
+ f"Projection: {self._projection}"
536
+ )
537
+
538
+ # Sort fields must be included
539
+ if self._sort:
540
+ for sort_field, _ in self._sort:
541
+ if is_inclusion:
542
+ if (
543
+ sort_field not in self._projection
544
+ or self._projection[sort_field] != 1
545
+ ):
546
+ raise ValueError(
547
+ f"Projection must include sort field '{sort_field}'. "
548
+ f"Cannot sort by a field that is projected out. "
549
+ f"Projection: {self._projection}"
550
+ )
551
+
552
+ # CRITICAL: If limit() or skip() are used, fall back to PyMongo
553
+ # Reason: Downloading all data just to return a subset is impractical
554
+ # MongoDB can efficiently handle limit/skip operations
555
+ if self._limit > 0 or self._skip > 0:
556
+ logger.info(
557
+ "limit() or skip() detected - falling back to PyMongo iteration "
558
+ "(acceleration would be impractical for subset queries)"
559
+ )
560
+ # Use fresh PyMongo cursor (not self which may be exhausted)
561
+ pymongo_cursor = self._collection.pymongo_collection.find(
562
+ self._filter, self._projection
563
+ )
564
+ if self._sort:
565
+ pymongo_cursor = pymongo_cursor.sort(self._sort)
566
+ if self._skip:
567
+ pymongo_cursor = pymongo_cursor.skip(self._skip)
568
+ if self._limit:
569
+ pymongo_cursor = pymongo_cursor.limit(self._limit)
570
+ if self._batch_size:
571
+ pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
572
+ return pd.json_normalize(list(pymongo_cursor))
573
+
574
+ # Validate sort field if specified
575
+ if self._sort:
576
+ sort_validation = validate_sort_field(self._sort, schema)
577
+ if not sort_validation.is_valid:
578
+ raise ValueError(f"Sort validation failed: {sort_validation.reason}")
579
+
580
+ # Parse and validate date filters
581
+ parsed_start = parse_datetime_tz_aware(start_date, "start_date")
582
+ parsed_end = parse_datetime_tz_aware(end_date, "end_date")
583
+
584
+ if not accelerate:
585
+ # Fallback to regular iteration (ignores date filters)
586
+ if parsed_start or parsed_end:
587
+ logger.warning(
588
+ "start_date/end_date filters are ignored when accelerate=False"
589
+ )
590
+ return self._to_dataframe_regular()
591
+
592
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
593
+ self._filter,
594
+ schema.time_field,
595
+ self._sort, # Pass sort spec for $natural detection
596
+ )
597
+
598
+ # Validate chunking_granularity if provided
599
+ # CRITICAL: If chunking_granularity is None, we CANNOT chunk the query
600
+ # because we don't know the data's time precision (could be ms, us, ns)
601
+ if chunking_granularity is not None:
602
+ if chunking_granularity.total_seconds() <= 0:
603
+ raise ValueError(
604
+ f"chunking_granularity must be positive, got {chunking_granularity}"
605
+ )
606
+
607
+ if not is_chunkable:
608
+ # REJECT mode - invalid query syntax or contradictory constraints
609
+ # This is different from SINGLE mode (where is_chunkable=True, brackets empty)
610
+ if parsed_start or parsed_end:
611
+ logger.warning(
612
+ "start_date/end_date filters are ignored for non-chunkable queries"
613
+ )
614
+ logger.info("Query has invalid syntax (%s) - cannot execute", reason)
615
+ return self._to_dataframe_accelerated(
616
+ cache_read=cache_read,
617
+ cache_write=cache_write,
618
+ start_date=parsed_start,
619
+ end_date=parsed_end,
620
+ coerce=coerce,
621
+ max_workers=1, # Single worker for invalid queries
622
+ chunking_granularity=None, # No chunking
623
+ is_chunkable=False,
624
+ )
625
+
626
+ # Check for SINGLE mode - valid query but single-worker fallback
627
+ # Indicated by: is_chunkable=True AND empty brackets
628
+ if is_chunkable and not brackets:
629
+ # SINGLE mode examples: $natural sort, unbounded $or branches
630
+ logger.info(
631
+ "Query valid but not parallelizable (%s) - using single-worker mode",
632
+ reason,
633
+ )
634
+ return self._to_dataframe_accelerated(
635
+ cache_read=cache_read,
636
+ cache_write=cache_write,
637
+ start_date=parsed_start,
638
+ end_date=parsed_end,
639
+ coerce=coerce,
640
+ max_workers=1, # Single worker for SINGLE mode
641
+ chunking_granularity=None, # No chunking
642
+ is_chunkable=False,
643
+ )
644
+
645
+ # Query IS chunkable, but do we have granularity info?
646
+ if chunking_granularity is None:
647
+ # No chunking_granularity provided - cannot parallelize safely
648
+ # because we don't know how to split the time range
649
+ logger.info(
650
+ "Query is chunkable but chunking_granularity not provided - "
651
+ "using single-worker mode. Provide chunking_granularity=timedelta(...) "
652
+ "to enable parallel execution."
653
+ )
654
+ return self._to_dataframe_accelerated(
655
+ cache_read=cache_read,
656
+ cache_write=cache_write,
657
+ start_date=parsed_start,
658
+ end_date=parsed_end,
659
+ coerce=coerce,
660
+ max_workers=1, # Single worker - no chunking info
661
+ chunking_granularity=None,
662
+ is_chunkable=False, # Treat as non-chunkable since we can't chunk
663
+ flush_ram_limit_mb=flush_ram_limit_mb, # Pass through for cache reading
664
+ row_group_size=row_group_size, # Pass through for DuckDB batch
665
+ )
666
+
667
+ # Use accelerated parallel execution - we have chunking info!
668
+ return self._to_dataframe_accelerated(
669
+ cache_read=cache_read,
670
+ cache_write=cache_write,
671
+ start_date=parsed_start,
672
+ end_date=parsed_end,
673
+ coerce=coerce,
674
+ max_workers=max_workers,
675
+ chunking_granularity=chunking_granularity,
676
+ is_chunkable=True,
677
+ flush_ram_limit_mb=flush_ram_limit_mb,
678
+ row_group_size=row_group_size,
679
+ )
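
A usage sketch of the parallel path described above (the collection, time bounds, and filter fields are illustrative):

    from datetime import timedelta

    df = collection.find({
        "timestamp": {"$gte": start, "$lt": end},
        "status": "active",
    }).to_dataframe(
        chunking_granularity=timedelta(days=1),  # required to enable parallel chunked fetch
        max_workers=8,
        cache_read=True,
        cache_write=True,
    )
    # A second call with the same filter/projection/sort is served from the Parquet cache.
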
680
+
681
+ def to_dataframe_batches(
682
+ self,
683
+ batch_size: int = DEFAULT_BATCH_SIZE,
684
+ cache_read: bool = True,
685
+ cache_write: bool = True,
686
+ start_date: Optional[Union[datetime, date, str]] = None,
687
+ end_date: Optional[Union[datetime, date, str]] = None,
688
+ coerce: Literal["raise", "error"] = "raise",
689
+ max_workers: int = 4,
690
+ chunking_granularity: Optional[timedelta] = None,
691
+ row_group_size: Optional[int] = None,
692
+ flush_ram_limit_mb: int = 512,
693
+ ) -> Generator[pd.DataFrame, None, None]:
694
+ """
695
+ Yield DataFrames in batches from cache without loading all data into memory.
696
+
697
+ This is a memory-efficient alternative to to_dataframe() for very large
698
+ result sets. Instead of loading the entire result into memory, it yields
699
+ smaller DataFrames that can be processed incrementally.
700
+
701
+ ┌─────────────────────────────────────────────────────────────────────┐
702
+ │ MEMORY-EFFICIENT BATCH PROCESSING: │
703
+ │ │
704
+ │ Instead of: │
705
+ │ df = cursor.to_dataframe() # Loads ALL 10M rows into RAM │
706
+ │ │
707
+ │ Use: │
708
+ │ for batch_df in cursor.to_dataframe_batches(batch_size=50000): │
709
+ │ process(batch_df) # Only 50K rows in RAM at a time │
710
+ │ │
711
+ │ Memory usage: O(batch_size) instead of O(total_rows) │
712
+ └─────────────────────────────────────────────────────────────────────┘
713
+
714
+ Args:
715
+ batch_size: Number of rows per DataFrame batch (default: 10,000)
716
+ cache_read: Read from Parquet cache if available
717
+ cache_write: Write results to Parquet cache on cache miss
718
+ start_date: Filter cached data from this date (inclusive).
719
+ Accepts datetime, date, or ISO string with timezone.
720
+ end_date: Filter cached data until this date (exclusive).
721
+ coerce: Error handling mode ("raise" or "error")
722
+ max_workers: Maximum parallel workers for cache population (default: 4)
723
+ chunking_granularity: Time granularity for chunking (required for parallel fetch)
724
+
725
+ Yields:
726
+ pd.DataFrame: Batches of rows as DataFrames
727
+
728
+ Raises:
729
+ ValueError: If no schema is provided
730
+ ValueError: If date strings are not timezone-aware
731
+ ValueError: If cache doesn't exist and cache_write=False
732
+
733
+ Example:
734
+ >>> # Process 10M rows without loading all into RAM
735
+ >>> total = 0
736
+ >>> for batch_df in cursor.to_dataframe_batches(batch_size=50000):
737
+ ... total += len(batch_df)
738
+ ... # Process batch_df...
739
+ >>> print(f"Processed {total} rows")
740
+ >>>
741
+ >>> # With date filtering:
742
+ >>> for batch_df in cursor.to_dataframe_batches(
743
+ ... batch_size=10000,
744
+ ... start_date="2024-06-01T00:00:00Z",
745
+ ... end_date="2024-06-15T00:00:00Z"
746
+ ... ):
747
+ ... analyze(batch_df)
748
+ """
749
+ # Schema is required
750
+ schema = self._collection.schema
751
+ if schema is None:
752
+ raise ValueError(
753
+ "Schema is required for to_dataframe_batches(). "
754
+ "Provide a schema when creating the collection."
755
+ )
756
+
757
+ # CRITICAL: If limit() or skip() are used, fall back to PyMongo
758
+ # Reason: Downloading all data just to return a subset is impractical
759
+ if self._limit > 0 or self._skip > 0:
760
+ logger.info(
761
+ "limit() or skip() detected - falling back to PyMongo iteration "
762
+ "(acceleration would be impractical for subset queries)"
763
+ )
764
+ # Use fresh PyMongo cursor in batches (not self which may be exhausted)
765
+ pymongo_cursor = self._collection.pymongo_collection.find(
766
+ self._filter, self._projection
767
+ )
768
+ if self._sort:
769
+ pymongo_cursor = pymongo_cursor.sort(self._sort)
770
+ if self._skip:
771
+ pymongo_cursor = pymongo_cursor.skip(self._skip)
772
+ if self._limit:
773
+ pymongo_cursor = pymongo_cursor.limit(self._limit)
774
+ if self._batch_size:
775
+ pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
776
+
777
+ batch = []
778
+ for doc in pymongo_cursor:
779
+ batch.append(doc)
780
+ if len(batch) >= batch_size:
781
+ yield pd.DataFrame(batch)
782
+ batch = []
783
+ if batch:
784
+ yield pd.DataFrame(batch)
785
+ return
786
+
787
+ # CRITICAL: Validate projection doesn't exclude required fields
788
+ if self._projection:
789
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
790
+ is_inclusion = any(v == 1 for v in projection_values)
791
+
792
+ # Time field must be included
793
+ if is_inclusion:
794
+ time_in_projection = (
795
+ schema.time_field in self._projection
796
+ and self._projection[schema.time_field] == 1
797
+ )
798
+ if not time_in_projection:
799
+ raise ValueError(
800
+ f"Projection must include time field '{schema.time_field}'. "
801
+ f"Projection: {self._projection}"
802
+ )
803
+
804
+ # Sort fields must be included
805
+ if self._sort:
806
+ for sort_field, _ in self._sort:
807
+ if is_inclusion:
808
+ if (
809
+ sort_field not in self._projection
810
+ or self._projection[sort_field] != 1
811
+ ):
812
+ raise ValueError(
813
+ f"Projection must include sort field '{sort_field}'. "
814
+ f"Cannot sort by a field that is projected out. "
815
+ f"Projection: {self._projection}"
816
+ )
817
+
818
+ time_field = schema.time_field
819
+
820
+ # Validate sort field if specified
821
+ if self._sort:
822
+ sort_validation = validate_sort_field(self._sort, schema)
823
+ if not sort_validation.is_valid:
824
+ raise ValueError(f"Sort validation failed: {sort_validation.reason}")
825
+ logger.info(
826
+ "Sorted streaming enabled - using DuckDB K-way merge for global sort order"
827
+ )
828
+
829
+ # Parse and validate date filters
830
+ parsed_start = parse_datetime_tz_aware(start_date, "start_date")
831
+ parsed_end = parse_datetime_tz_aware(end_date, "end_date")
832
+
833
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
834
+ self._filter,
835
+ time_field,
836
+ self._sort, # Pass sort spec for $natural detection
837
+ )
838
+
839
+ # Handle REJECT mode (is_chunkable=False)
840
+ if not is_chunkable:
841
+ warnings.warn(
842
+ f"Invalid query syntax ({reason}). Cannot execute this query.",
843
+ UserWarning,
844
+ stacklevel=2,
845
+ )
846
+ # Override max_workers to 1 for invalid queries
847
+ max_workers = 1
848
+ chunking_granularity = None
849
+
850
+ # Handle SINGLE mode (is_chunkable=True but empty brackets)
851
+ elif is_chunkable and not brackets:
852
+ warnings.warn(
853
+ f"Query valid but not parallelizable ({reason}). Using single-worker mode.",
854
+ UserWarning,
855
+ stacklevel=2,
856
+ )
857
+ # Override max_workers to 1 for SINGLE mode
858
+ max_workers = 1
859
+ chunking_granularity = None
860
+
861
+ # Mark as started
862
+ if not self._started:
863
+ self._started = True
864
+
865
+ # Create cache manager
866
+ cache = CacheManager(
867
+ filter_dict=self._filter,
868
+ projection=self._projection,
869
+ sort=self._sort,
870
+ )
871
+
872
+ # Ensure cache exists
873
+ if not cache.exists():
874
+ if not cache_write:
875
+ raise ValueError(
876
+ "Cache does not exist and cache_write=False. "
877
+ "Either call to_dataframe() first to populate cache, "
878
+ "or set cache_write=True."
879
+ )
880
+
881
+ # Populate cache first
882
+ print("[Query] Cache miss - fetching from MongoDB...")
883
+
884
+ # Populate cache via accelerated executor
885
+ result = execute_parallel_stream_to_cache(
886
+ pymongo_collection=self._collection.pymongo_collection,
887
+ filter_dict=self._filter,
888
+ schema=schema,
889
+ cache_manager=cache,
890
+ projection=self._projection,
891
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
892
+ max_workers=max_workers,
893
+ peak_ram_limit_mb=flush_ram_limit_mb,
894
+ chunking_granularity=chunking_granularity,
895
+ mongo_uri=self._collection.mongo_uri,
896
+ sort_spec=self._sort, # Pass sort for pre-sorting during Parquet write
897
+ row_group_size=row_group_size,
898
+ )
899
+
900
+ print(
901
+ f"\n[Cache] Cache written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
902
+ )
903
+
904
+ elif not cache_read and cache_write:
905
+ # CRITICAL: cache_read=False but cache_write=True and cache exists
906
+ # Clear old cache and re-populate to avoid duplicate data
907
+ print(
908
+ "[Clean] Clearing existing cache (cache_read=False, starting fresh)..."
909
+ )
910
+ cache.clean()
911
+
912
+ print("[Query] Re-fetching from MongoDB...")
913
+
914
+ # Re-populate cache via accelerated executor
915
+ result = execute_parallel_stream_to_cache(
916
+ pymongo_collection=self._collection.pymongo_collection,
917
+ filter_dict=self._filter,
918
+ schema=schema,
919
+ cache_manager=cache,
920
+ projection=self._projection,
921
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
922
+ max_workers=max_workers,
923
+ peak_ram_limit_mb=flush_ram_limit_mb,
924
+ chunking_granularity=chunking_granularity,
925
+ mongo_uri=self._collection.mongo_uri,
926
+ sort_spec=self._sort, # Pass sort for pre-sorting during Parquet write
927
+ row_group_size=row_group_size,
928
+ )
929
+
930
+ print(
931
+ f"\n[Cache] Cache re-written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
932
+ )
933
+
934
+ # Now yield batches from cache
935
+ print(f"[Cache] Streaming batches from cache: {cache.cache_dir}")
936
+ reader = ParquetReader(cache.cache_dir)
937
+
938
+ # Use globally sorted streaming if sort is specified
939
+ if self._sort:
940
+ print("[Sort] Using DuckDB K-way merge for globally sorted batches")
941
+ yield from reader.iter_globally_sorted_batches(
942
+ sort_spec=self._sort, # Pass full sort spec for multi-field sorting
943
+ batch_size=batch_size,
944
+ schema=schema,
945
+ time_field=time_field,
946
+ start_date=parsed_start,
947
+ end_date=parsed_end,
948
+ coerce=coerce,
949
+ memory_limit_mb=flush_ram_limit_mb, # Pass RAM limit to DuckDB
950
+ threads=max_workers, # Pass thread count to DuckDB
951
+ )
952
+ else:
953
+ yield from reader.iter_dataframe_batches(
954
+ batch_size=batch_size,
955
+ schema=schema,
956
+ time_field=time_field,
957
+ start_date=parsed_start,
958
+ end_date=parsed_end,
959
+ coerce=coerce,
960
+ )
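
A sketch of incremental processing with the generator above (the "value" column and date bounds are illustrative):

    from datetime import timedelta

    total_rows = 0
    running_sum = 0.0
    for batch_df in cursor.to_dataframe_batches(
        batch_size=50_000,
        chunking_granularity=timedelta(days=1),
        start_date="2024-06-01T00:00:00Z",
        end_date="2024-06-15T00:00:00Z",
    ):
        total_rows += len(batch_df)
        running_sum += batch_df["value"].sum()
    print(f"{total_rows} rows processed, sum={running_sum}")
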
961
+
962
+ def stream_to_callback(
963
+ self,
964
+ callback: Callable[["pa.Table", Dict[str, Any]], None],
965
+ *,
966
+ partition_time_delta: timedelta,
967
+ partition_by: Optional[Union[str, List[str]]] = None,
968
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
969
+ max_workers: int = 4,
970
+ chunking_granularity: Optional[timedelta] = None,
971
+ row_group_size: Optional[int] = None,
972
+ flush_ram_limit_mb: int = 512,
973
+ cache_read: bool = True,
974
+ cache_write: bool = True,
975
+ ) -> Dict[str, Any]:
976
+ """
977
+ Stream partitioned PyArrow tables to a callback function.
978
+
979
+ This is a two-phase operation:
980
+ 1. Download data from MongoDB to local Parquet cache (reuses Rust backend)
981
+ 2. Partition data and call callback in parallel for each partition
982
+
983
+ Perfect for populating data lakes with partitioned data structures.
984
+
985
+ ┌─────────────────────────────────────────────────────────────────────┐
986
+ │ PARTITION MODES: │
987
+ │ │
988
+ │ TIME ONLY (partition_by=None): │
989
+ │ partition_time_delta=timedelta(weeks=1) │
990
+ │ -> 1 callback per week of data │
991
+ │ │
992
+ │ TIME + FIELD (partition_by="metadata.instrument"): │
993
+ │ partition_time_delta=timedelta(weeks=1) │
994
+ │ -> 1 callback per (week, instrument) combination │
995
+ │ │
996
+ │ Example: 1 year of data, 10 instruments, weekly partitions │
997
+ │ -> 52 weeks × 10 instruments = 520 callbacks │
998
+ └─────────────────────────────────────────────────────────────────────┘
999
+
1000
+ The callback receives:
1001
+ - table: PyArrow Table with data for this partition
1002
+ - metadata: Dict with partition info:
1003
+ {
1004
+ "time_start": datetime, # Start of time bucket
1005
+ "time_end": datetime, # End of time bucket
1006
+ "partition_values": {...}, # Values for partition_by fields
1007
+ "row_count": int, # Rows in this table
1008
+ "partition_index": int, # 0-based partition index
1009
+ "total_partitions": int, # Total partition count
1010
+ }
1011
+
1012
+ Args:
1013
+ callback: Function(table: pa.Table, metadata: dict) -> None
1014
+ Called for each partition. Runs in ThreadPoolExecutor.
1015
+ partition_time_delta: Time bucket size for partitioning.
1016
+ Example: timedelta(weeks=1) creates weekly partitions.
1017
+ REQUIRED - determines how data is grouped.
1018
+ partition_by: Field(s) to partition by, in addition to time.
1019
+ Example: "metadata.instrument" or ["region", "device_id"]
1020
+ Can be any field in schema except time field.
1021
+ None = partition by time only.
1022
+ any_type_strategy: How to decode Types.Any() struct columns:
1023
+ - "float": Coalesce to Float64, prioritize numeric (default)
1024
+ - "string": Convert everything to string (lossless)
1025
+ - "keep_struct": Keep raw struct, don't decode
1026
+ max_workers: Number of parallel callback threads (default: 4).
1027
+ DuckDB releases GIL, so threads get true parallelism.
1028
+ chunking_granularity: Time granularity for MongoDB fetch chunks.
1029
+ Used during Phase 1 (download). Example: timedelta(hours=16).
1030
+ If None, defaults to partition_time_delta.
1031
+ flush_ram_limit_mb: RAM limit for buffered data (default: 512).
1032
+ Used during both download and partition phases.
1033
+ cache_read: Read from existing cache if available (default: True).
1034
+ cache_write: Write to cache during download (default: True).
1035
+
1036
+ Returns:
1037
+ Dict with:
1038
+ - total_partitions: Number of partitions processed
1039
+ - total_rows: Total rows across all partitions
1040
+ - skipped_partitions: Empty partitions skipped
1041
+ - duration_s: Total execution time
1042
+ - cache_duration_s: Time spent on cache population
1043
+ - partition_duration_s: Time spent on partition callbacks
1044
+
1045
+ Raises:
1046
+ ValueError: If no schema provided
1047
+ ValueError: If query not chunkable (no time bounds)
1048
+ ValueError: If sort specified on non-time field
1049
+ RuntimeError: If callback fails for any partition
1050
+
1051
+ Example:
1052
+ >>> # Upload weekly data per instrument to S3 data lake
1053
+ >>> import pyarrow.parquet as pq
1054
+ >>> import s3fs
1055
+ >>>
1056
+ >>> fs = s3fs.S3FileSystem()
1057
+ >>>
1058
+ >>> def upload_partition(table, metadata):
1059
+ ... instrument = metadata['partition_values'].get('metadata.instrument', 'unknown')
1060
+ ... week = metadata['time_start'].strftime('%Y-%m-%d')
1061
+ ... path = f"s3://bucket/data/instrument={instrument}/week={week}.parquet"
1062
+ ... pq.write_table(table, path, filesystem=fs)
1063
+ >>>
1064
+ >>> cursor.stream_to_callback(
1065
+ ... callback=upload_partition,
1066
+ ... partition_time_delta=timedelta(weeks=1),
1067
+ ... partition_by="metadata.instrument",
1068
+ ... max_workers=8,
1069
+ ... chunking_granularity=timedelta(hours=16),
1070
+ ... )
1071
+ """
1072
+ total_start = time.time()
1073
+
1074
+ schema = self._collection.schema
1075
+ if schema is None:
1076
+ raise ValueError(
1077
+ "Schema is required for stream_to_callback(). "
1078
+ "Provide a schema when creating the collection."
1079
+ )
1080
+
1081
+ # CRITICAL: limit() and skip() don't make sense for streaming callbacks
1082
+ # These operations require knowing the full result set, which defeats
1083
+ # the purpose of streaming
1084
+ if self._limit > 0 or self._skip > 0:
1085
+ raise ValueError(
1086
+ "stream_to_callback() does not support limit() or skip(). "
1087
+ "These operations require knowing the total result set size upfront, "
1088
+ "which defeats the purpose of streaming. "
1089
+ "Use to_dataframe() or iterate with PyMongo cursor instead."
1090
+ )
1091
+
1092
+ time_field = schema.time_field
1093
+
1094
+ # CRITICAL: Validate projection doesn't exclude partition_by fields
1095
+ if self._projection and partition_by:
1096
+ # Check if projection is inclusion (has 1 values) or exclusion (has 0 values)
1097
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
1098
+ is_inclusion = any(v == 1 for v in projection_values)
1099
+
1100
+ # Time field must be included
1101
+ if is_inclusion:
1102
+ time_in_projection = (
1103
+ time_field in self._projection and self._projection[time_field] == 1
1104
+ )
1105
+ if not time_in_projection:
1106
+ raise ValueError(
1107
+ f"Projection must include time field '{time_field}'. "
1108
+ f"Projection: {self._projection}"
1109
+ )
1110
+
1111
+ # Partition fields must be included
1112
+ partition_by_list = (
1113
+ [partition_by] if isinstance(partition_by, str) else partition_by
1114
+ )
1115
+ for field in partition_by_list:
1116
+ if is_inclusion:
1117
+ # For parent fields like "metadata", check if any child is included
1118
+ field_or_children_included = (
1119
+ field in self._projection and self._projection[field] == 1
1120
+ ) or any(
1121
+ k.startswith(f"{field}.") and self._projection[k] == 1
1122
+ for k in self._projection.keys()
1123
+ )
1124
+ if not field_or_children_included:
1125
+ raise ValueError(
1126
+ f"Projection must include partition field '{field}'. "
1127
+ f"Cannot partition by a field that is projected out. "
1128
+ f"Projection: {self._projection}"
1129
+ )
1130
+
1131
+ # Validate sort fields in projection
1132
+ if self._projection and self._sort:
1133
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
1134
+ is_inclusion = any(v == 1 for v in projection_values)
1135
+ for sort_field, _ in self._sort:
1136
+ if is_inclusion:
1137
+ if (
1138
+ sort_field not in self._projection
1139
+ or self._projection[sort_field] != 1
1140
+ ):
1141
+ raise ValueError(
1142
+ f"Projection must include sort field '{sort_field}'. "
1143
+ f"Projection: {self._projection}"
1144
+ )
1145
+
1146
+ # Validate sort - only allow time field sorting
1147
+ if self._sort:
1148
+ for field, _direction in self._sort:
1149
+ if field != time_field:
1150
+ raise ValueError(
1151
+ f"stream_to_callback() only supports sorting by time field '{time_field}'. "
1152
+ f"Got sort field: '{field}'. "
1153
+ "Remove .sort() or sort only by time field."
1154
+ )
1155
+ # Store sort direction
1156
+ sort_ascending = self._sort[0][1] == 1
1157
+ else:
1158
+ sort_ascending = True # Default to ascending
1159
+
1160
+ # Normalize partition_by to list
1161
+ partition_by_list: Optional[List[str]] = None
1162
+ if partition_by is not None:
1163
+ if isinstance(partition_by, str):
1164
+ partition_by_list = [partition_by]
1165
+ else:
1166
+ partition_by_list = list(partition_by)
1167
+
1168
+ # Validate partition_by fields exist in schema (or are parent fields with children)
1169
+ all_schema_fields = list(schema.fields.keys())
1170
+ for field in partition_by_list:
1171
+ if field == time_field:
1172
+ raise ValueError(
1173
+ f"Cannot partition by time field '{time_field}'. "
1174
+ "Time partitioning is automatic via partition_time_delta."
1175
+ )
1176
+ # Check if field exists directly OR has children
1177
+ has_direct = schema.has_field(field)
1178
+ has_children = any(f.startswith(f"{field}.") for f in all_schema_fields)
1179
+ if not has_direct and not has_children:
1180
+ raise ValueError(
1181
+ f"Partition field '{field}' not found in schema. "
1182
+ f"Available fields: {all_schema_fields}"
1183
+ )
1184
+
1185
+ # Default chunking_granularity to partition_time_delta
1186
+ if chunking_granularity is None:
1187
+ chunking_granularity = partition_time_delta
1188
+
1189
+ # NEW: build_brackets_for_find internally validates via is_chunkable_query
1190
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
1191
+ self._filter,
1192
+ time_field,
1193
+ self._sort, # Pass sort spec for $natural detection
1194
+ )
1195
+
1196
+ # Handle REJECT mode (is_chunkable=False)
1197
+ if not is_chunkable:
1198
+ warnings.warn(
1199
+ f"Invalid query syntax ({reason}). Cannot execute this query.",
1200
+ UserWarning,
1201
+ stacklevel=2,
1202
+ )
1203
+ # Override max_workers to 1 for invalid queries
1204
+ max_workers = 1
1205
+ chunking_granularity = None
1206
+
1207
+ # Handle SINGLE mode (is_chunkable=True but empty brackets)
1208
+ elif is_chunkable and not brackets:
1209
+ warnings.warn(
1210
+ f"Query valid but not parallelizable ({reason}). Using single-worker mode.",
1211
+ UserWarning,
1212
+ stacklevel=2,
1213
+ )
1214
+ # Override max_workers to 1 for SINGLE mode
1215
+ max_workers = 1
1216
+ chunking_granularity = None
1217
+
1218
+ # Mark as started
1219
+ if not self._started:
1220
+ self._started = True
1221
+
1222
+ # ─────────────────────────────────────────────────────────────────────
1223
+ # PHASE 1: Download to cache (reuses existing Rust backend)
1224
+ # ─────────────────────────────────────────────────────────────────────
1225
+ cache = CacheManager(
1226
+ filter_dict=self._filter,
1227
+ projection=self._projection,
1228
+ sort=self._sort,
1229
+ )
1230
+
1231
+ cache_start = time.time()
1232
+
1233
+ if cache_read and cache.exists():
1234
+ print(f"[Cache] Using existing cache: {cache.cache_dir}")
1235
+ else:
1236
+ if not cache_write:
1237
+ raise ValueError(
1238
+ "Cache does not exist and cache_write=False. "
1239
+ "Set cache_write=True to download data first."
1240
+ )
1241
+
1242
+ if cache.exists() and not cache_read:
1243
+ print("[Clean] Clearing existing cache (cache_read=False)...")
1244
+ cache.clean()
1245
+
1246
+ print("[Query] Downloading from MongoDB to cache...")
1247
+ result = execute_parallel_stream_to_cache(
1248
+ pymongo_collection=self._collection.pymongo_collection,
1249
+ filter_dict=self._filter,
1250
+ schema=schema,
1251
+ cache_manager=cache,
1252
+ projection=self._projection,
1253
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
1254
+ max_workers=max_workers,
1255
+ peak_ram_limit_mb=flush_ram_limit_mb,
1256
+ chunking_granularity=chunking_granularity,
1257
+ mongo_uri=self._collection.mongo_uri,
1258
+ row_group_size=row_group_size,
1259
+ )
1260
+ print(
1261
+ f"[Cache] Downloaded: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
1262
+ )
1263
+
1264
+ cache_duration = time.time() - cache_start
1265
+
1266
+ # ─────────────────────────────────────────────────────────────────────
1267
+ # PHASE 2: Partition and stream to callbacks
1268
+ # ─────────────────────────────────────────────────────────────────────
1269
+
1270
+ partition_result = execute_partitioned_callback(
1271
+ cache_dir=str(cache.cache_dir),
1272
+ schema=schema,
1273
+ callback=callback,
1274
+ partition_time_delta=partition_time_delta,
1275
+ partition_by=partition_by_list,
1276
+ any_type_strategy=any_type_strategy,
1277
+ max_workers=max_workers,
1278
+ sort_ascending=sort_ascending,
1279
+ memory_limit_mb=flush_ram_limit_mb,
1280
+ )
1281
+
1282
+ total_duration = time.time() - total_start
1283
+
1284
+ return {
1285
+ "total_partitions": partition_result["total_partitions"],
1286
+ "total_rows": partition_result["total_rows"],
1287
+ "skipped_partitions": partition_result["skipped_partitions"],
1288
+ "duration_s": total_duration,
1289
+ "cache_duration_s": cache_duration,
1290
+ "partition_duration_s": partition_result["duration_s"],
1291
+ }
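
A local-filesystem variant of the docstring's S3 example, as a sketch (the output directory and partition field are illustrative):

    from datetime import timedelta
    from pathlib import Path
    import pyarrow.parquet as pq

    out_root = Path("./lake")

    def write_partition(table, metadata):
        week = metadata["time_start"].strftime("%Y-%m-%d")
        instrument = metadata["partition_values"].get("metadata.instrument", "all")
        path = out_root / f"instrument={instrument}" / f"week={week}.parquet"
        path.parent.mkdir(parents=True, exist_ok=True)
        pq.write_table(table, path)

    stats = cursor.stream_to_callback(
        callback=write_partition,
        partition_time_delta=timedelta(weeks=1),
        partition_by="metadata.instrument",
        max_workers=4,
    )
    print(stats["total_partitions"], "partitions,", stats["total_rows"], "rows")
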
1292
+
1293
+ def to_polars(
1294
+ self,
1295
+ accelerate: bool = True,
1296
+ cache_read: bool = True,
1297
+ cache_write: bool = True,
1298
+ start_date: Optional[Union[datetime, date, str]] = None,
1299
+ end_date: Optional[Union[datetime, date, str]] = None,
1300
+ coerce: Literal["raise", "error"] = "raise",
1301
+ max_workers: int = 4,
1302
+ chunking_granularity: Optional[timedelta] = None,
1303
+ row_group_size: Optional[int] = None,
1304
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
1305
+ flush_ram_limit_mb: int = 512,
1306
+ ) -> pl.DataFrame:
1307
+ """
1308
+ Convert results to Polars DataFrame with optional acceleration.
1309
+
1310
+ This mirrors to_dataframe() but returns a Polars DataFrame.
1311
+ Uses ParquetReader with engine="polars" for efficient native reading.
1312
+
1313
+ Args:
1314
+ accelerate: Enable acceleration if query is chunkable
1315
+ cache_read: Read from Parquet cache if available
1316
+ cache_write: Write results to Parquet cache
1317
+ start_date: Filter cached data from this date (inclusive).
1318
+ Accepts datetime, date, or ISO string with timezone.
1319
+ end_date: Filter cached data until this date (exclusive).
1320
+ coerce: Error handling mode ("raise" or "error")
1321
+ max_workers: Maximum parallel workers (default: 4)
1322
+ chunking_granularity: Time granularity for chunking (e.g., timedelta(days=1))
1323
+ row_group_size: Rows per parquet row group. If None, Rust default is used.
1324
+ any_type_strategy: How to decode Types.Any() struct columns:
1325
+ - "float": Coalesce to Float64, prioritize numeric (default)
1326
+ - "string": Convert everything to string (lossless)
1327
+ - "keep_struct": Keep raw struct, don't decode
1328
+ flush_ram_limit_mb: RAM limit in MB for buffered data before flushing.
1329
+ (default: 512)
1330
+
1331
+ Returns:
1332
+ Polars DataFrame with results
1333
+
1334
+ Raises:
1335
+ ValueError: If no schema is provided
1336
+
1337
+ Example:
1338
+ >>> cursor = collection.find({...}).sort("timestamp", 1)
1339
+ >>> df = cursor.to_polars(
1340
+ ... max_workers=8,
1341
+ ... chunking_granularity=timedelta(days=7),
1342
+ ... flush_ram_limit_mb=2000,
1343
+ ... )
1344
+ """
1345
+ schema = self._collection.schema
1346
+ if schema is None:
1347
+ raise ValueError(
1348
+ "Schema is required for to_polars(). "
1349
+ "Provide a schema when creating the collection."
1350
+ )
1351
+
1352
+ # CRITICAL: If limit() or skip() are used, fall back to PyMongo
1353
+ # Reason: Downloading all data just to return a subset is impractical
1354
+ if self._limit > 0 or self._skip > 0:
1355
+ logger.info(
1356
+ "limit() or skip() detected - falling back to PyMongo iteration "
1357
+ "(acceleration would be impractical for subset queries)"
1358
+ )
1359
+ # Use fresh PyMongo cursor (not self which may be exhausted)
1360
+ pymongo_cursor = self._collection.pymongo_collection.find(
1361
+ self._filter, self._projection
1362
+ )
1363
+ if self._sort:
1364
+ pymongo_cursor = pymongo_cursor.sort(self._sort)
1365
+ if self._skip:
1366
+ pymongo_cursor = pymongo_cursor.skip(self._skip)
1367
+ if self._limit:
1368
+ pymongo_cursor = pymongo_cursor.limit(self._limit)
1369
+ if self._batch_size:
1370
+ pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
1371
+ docs = list(pymongo_cursor)
1372
+ if not docs:
1373
+ return pl.DataFrame()
1374
+ return pl.DataFrame(docs)
1375
+
1376
+ # CRITICAL: Validate projection doesn't exclude required fields
1377
+ if self._projection:
1378
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
1379
+ is_inclusion = any(v == 1 for v in projection_values)
1380
+
1381
+ # Time field must be included
1382
+ if is_inclusion:
1383
+ time_in_projection = (
1384
+ schema.time_field in self._projection
1385
+ and self._projection[schema.time_field] == 1
1386
+ )
1387
+ if not time_in_projection:
1388
+ raise ValueError(
1389
+ f"Projection must include time field '{schema.time_field}'. "
1390
+ f"Projection: {self._projection}"
1391
+ )
1392
+
1393
+ # Sort fields must be included
1394
+ if self._sort:
1395
+ for sort_field, _ in self._sort:
1396
+ if is_inclusion:
1397
+ if (
1398
+ sort_field not in self._projection
1399
+ or self._projection[sort_field] != 1
1400
+ ):
1401
+ raise ValueError(
1402
+ f"Projection must include sort field '{sort_field}'. "
1403
+ f"Cannot sort by a field that is projected out. "
1404
+ f"Projection: {self._projection}"
1405
+ )
1406
+
1407
+ time_field = schema.time_field
1408
+
1409
+ # Validate sort field if specified
1410
+ if self._sort:
1411
+ sort_validation = validate_sort_field(self._sort, schema)
1412
+ if not sort_validation.is_valid:
1413
+ raise ValueError(f"Sort validation failed: {sort_validation.reason}")
1414
+
1415
+ # Parse and validate date filters
1416
+ parsed_start = parse_datetime_tz_aware(start_date, "start_date")
1417
+ parsed_end = parse_datetime_tz_aware(end_date, "end_date")
1418
+
1419
+ if not accelerate:
1420
+ if parsed_start or parsed_end:
1421
+ logger.warning(
1422
+ "start_date/end_date filters are ignored when accelerate=False"
1423
+ )
1424
+ # Fallback to regular iteration (native Polars from dicts)
1425
+ return self._to_polars_regular()
1426
+
1427
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
1428
+ self._filter,
1429
+ schema.time_field,
1430
+ self._sort, # Pass sort spec for $natural detection
1431
+ )
1432
+
1433
+ # Handle REJECT mode (is_chunkable=False)
1434
+ if not is_chunkable:
1435
+ if parsed_start or parsed_end:
1436
+ logger.warning(
1437
+ "start_date/end_date filters are ignored for non-chunkable queries"
1438
+ )
1439
+ logger.info("Invalid query syntax (%s) - cannot execute", reason)
1440
+ # Fall back to single-worker mode
1441
+ max_workers = 1
1442
+ chunking_granularity = None
1443
+
1444
+ # Handle SINGLE mode (is_chunkable=True but empty brackets)
1445
+ elif is_chunkable and not brackets:
1446
+ logger.info(
1447
+ "Query valid but not parallelizable (%s) - using single-worker mode",
1448
+ reason,
1449
+ )
1450
+ # Fall back to single-worker mode
1451
+ max_workers = 1
1452
+ chunking_granularity = None
1453
+
1454
+ # Create cache manager
1455
+ cache = CacheManager(
1456
+ filter_dict=self._filter,
1457
+ projection=self._projection,
1458
+ sort=self._sort,
1459
+ )
1460
+
1461
+ # Check if cache exists
1462
+ if cache_read and cache.exists():
1463
+ print(f"[Cache] Reading from cache (polars): {cache.cache_dir}")
1464
+ reader = ParquetReader(cache.cache_dir)
1465
+ df = cast(
1466
+ pl.DataFrame,
1467
+ reader.to_dataframe(
1468
+ engine="polars",
1469
+ schema=schema,
1470
+ time_field=time_field,
1471
+ start_date=parsed_start,
1472
+ end_date=parsed_end,
1473
+ coerce=coerce,
1474
+ any_type_strategy=any_type_strategy,
1475
+ ),
1476
+ )
1477
+
1478
+ # Check if we need DuckDB sorting (Any types or List types)
1479
+ need_duckdb_sort = False
1480
+ sort_infos: List[Dict[str, Any]] = []
1481
+ if self._sort:
1482
+ sort_infos = get_sort_field_info(self._sort, schema)
1483
+
1484
+ # Expand parent fields to children and collect all fields to check
1485
+ fields_to_check = []
1486
+ for info in sort_infos:
1487
+ if info["is_parent"]:
1488
+ # Parent field - check all children
1489
+ fields_to_check.extend(info["child_fields"])
1490
+ else:
1491
+ # Direct field
1492
+ fields_to_check.append(info["field"])
1493
+
1494
+ # Check if any of the actual sort fields (after expansion) are Any/List types
1495
+ for field in fields_to_check:
1496
+ if field in schema.fields:
1497
+ field_type = schema.fields[field]
1498
+ if isinstance(field_type, (AnyType, ListType)):
1499
+ need_duckdb_sort = True
1500
+ break
1501
+
1502
+ if self._sort and need_duckdb_sort:
1503
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
1504
+ print("[Sort] Using DuckDB for Types.Any()/Types.List() sorting...")
1505
+
1506
+ warnings.warn(
1507
+ "Sorting by Types.Any() field in to_polars returns raw struct columns "
1508
+ "(e.g., 'value.float_value', 'value.int64_value'). "
1509
+ "Use to_dataframe() for decoded Any() values.",
1510
+ UserWarning,
1511
+ )
1512
+
1513
+ # Use get_globally_sorted_dataframe() - more efficient than batching
1514
+ combined_df = reader.get_globally_sorted_dataframe(
1515
+ sort_spec=self._sort,
1516
+ schema=schema,
1517
+ time_field=time_field,
1518
+ start_date=parsed_start,
1519
+ end_date=parsed_end,
1520
+ coerce=coerce,
1521
+ )
1522
+
1523
+ if not combined_df.empty:
1524
+ for col in combined_df.columns:
1525
+ if combined_df[col].dtype == object:
1526
+ first_val = (
1527
+ combined_df[col].dropna().iloc[0]
1528
+ if not combined_df[col].dropna().empty
1529
+ else None
1530
+ )
1531
+ if (
1532
+ first_val is not None
1533
+ and type(first_val).__name__ == "ObjectId"
1534
+ ):
1535
+ combined_df[col] = combined_df[col].astype(str)
1536
+ df = pl.from_pandas(combined_df)
1537
+ else:
1538
+ df = pl.DataFrame()
1539
+
1540
+ elif self._sort:
1541
+ # Native Polars sort - expand parent fields to children
1542
+ expanded_sort = []
1543
+ for info in sort_infos:
1544
+ if info["is_parent"]:
1545
+ # Expand parent field to all children
1546
+ for child in info["child_fields"]:
1547
+ expanded_sort.append((child, info["direction"]))
1548
+ else:
1549
+ expanded_sort.append((info["field"], info["direction"]))
1550
+
1551
+ sort_fields = [
1552
+ field for field, _ in expanded_sort if field in df.columns
1553
+ ]
1554
+ descending = [
1555
+ direction == -1
1556
+ for field, direction in expanded_sort
1557
+ if field in df.columns
1558
+ ]
1559
+ if sort_fields:
1560
+ df = df.sort(sort_fields, descending=descending)
1561
+
1562
+ # Apply skip/limit
1563
+ if self._skip:
1564
+ df = df.slice(self._skip)
1565
+ if self._limit:
1566
+ df = df.head(self._limit)
1567
+
1568
+ print(
1569
+ f"[OK] Loaded {len(df):,} documents from cache ({reader.get_statistics()['total_size_mb']:.1f} MB)"
1570
+ )
1571
+ return df
1572
+
1573
+ # Cache miss - need to fetch and write
1574
+ if not cache_write:
1575
+ raise ValueError(
1576
+ "Cache does not exist and cache_write=False. "
1577
+ "Either enable cache_write or call to_dataframe() first."
1578
+ )
1579
+
1580
+ # Fetch data (uses same logic as to_dataframe)
1581
+ mode_str = (
1582
+ "parallel" if is_chunkable and chunking_granularity else "single-worker"
1583
+ )
1584
+ print(f"[Query] Cache miss - fetching from MongoDB ({mode_str} mode)...")
1585
+
1586
+ result = execute_parallel_stream_to_cache(
1587
+ pymongo_collection=self._collection.pymongo_collection,
1588
+ filter_dict=self._filter,
1589
+ schema=schema,
1590
+ cache_manager=cache,
1591
+ projection=self._projection,
1592
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
1593
+ max_workers=max_workers if is_chunkable else 1,
1594
+ peak_ram_limit_mb=flush_ram_limit_mb,
1595
+ chunking_granularity=chunking_granularity if is_chunkable else None,
1596
+ mongo_uri=self._collection.mongo_uri,
1597
+ row_group_size=row_group_size,
1598
+ )
1599
+
1600
+ print(
1601
+ f"\n[Cache] Cache written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
1602
+ )
1603
+
1604
+ # Read from cache as Polars
1605
+ print("[Cache] Reading from cache to build Polars DataFrame...")
1606
+ reader = ParquetReader(cache.cache_dir)
1607
+
1608
+ # Check if we need DuckDB sorting (Any types or List types)
1609
+ need_duckdb_sort = False
1610
+ sort_infos: List[Dict[str, Any]] = []
1611
+ if self._sort:
1612
+ sort_infos = get_sort_field_info(self._sort, schema)
1613
+
1614
+ # Expand parent fields to children and collect all fields to check
1615
+ fields_to_check = []
1616
+ for info in sort_infos:
1617
+ if info["is_parent"]:
1618
+ # Parent field - check all children
1619
+ fields_to_check.extend(info["child_fields"])
1620
+ else:
1621
+ # Direct field
1622
+ fields_to_check.append(info["field"])
1623
+
1624
+ # Check if any of the actual sort fields (after expansion) are Any/List types
1625
+ for field in fields_to_check:
1626
+ if field in schema.fields:
1627
+ field_type = schema.fields[field]
1628
+ if isinstance(field_type, (AnyType, ListType)):
1629
+ need_duckdb_sort = True
1630
+ break
1631
+
1632
+ if self._sort and need_duckdb_sort:
1633
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
1634
+ print("[Sort] Using DuckDB for Types.Any()/Types.List() sorting...")
1635
+
1636
+ warnings.warn(
1637
+ "Sorting by Types.Any() field in to_polars returns raw struct columns "
1638
+ "(e.g., 'value.float_value', 'value.int64_value'). "
1639
+ "Use to_dataframe() for decoded Any() values.",
1640
+ UserWarning,
1641
+ )
1642
+
1643
+ # Use get_globally_sorted_dataframe() - more efficient than batching
1644
+ combined_df = reader.get_globally_sorted_dataframe(
1645
+ sort_spec=self._sort,
1646
+ schema=schema,
1647
+ time_field=time_field,
1648
+ start_date=parsed_start,
1649
+ end_date=parsed_end,
1650
+ coerce=coerce,
1651
+ )
1652
+
1653
+ if not combined_df.empty:
1654
+ for col in combined_df.columns:
1655
+ if combined_df[col].dtype == object:
1656
+ first_val = (
1657
+ combined_df[col].dropna().iloc[0]
1658
+ if not combined_df[col].dropna().empty
1659
+ else None
1660
+ )
1661
+ if (
1662
+ first_val is not None
1663
+ and type(first_val).__name__ == "ObjectId"
1664
+ ):
1665
+ combined_df[col] = combined_df[col].astype(str)
1666
+ df = pl.from_pandas(combined_df)
1667
+ else:
1668
+ df = pl.DataFrame()
1669
+ else:
1670
+ df = cast(
1671
+ pl.DataFrame,
1672
+ reader.to_dataframe(
1673
+ engine="polars",
1674
+ schema=schema,
1675
+ time_field=time_field,
1676
+ start_date=parsed_start,
1677
+ end_date=parsed_end,
1678
+ coerce=coerce,
1679
+ any_type_strategy=any_type_strategy,
1680
+ ),
1681
+ )
1682
+
1683
+ # Native Polars sort - expand parent fields to children
1684
+ if self._sort:
1685
+ expanded_sort = []
1686
+ for info in sort_infos:
1687
+ if info["is_parent"]:
1688
+ for child in info["child_fields"]:
1689
+ expanded_sort.append((child, info["direction"]))
1690
+ else:
1691
+ expanded_sort.append((info["field"], info["direction"]))
1692
+
1693
+ sort_fields = [
1694
+ field for field, _ in expanded_sort if field in df.columns
1695
+ ]
1696
+ descending = [
1697
+ direction == -1
1698
+ for field, direction in expanded_sort
1699
+ if field in df.columns
1700
+ ]
1701
+ if sort_fields:
1702
+ # Polars uses `reverse` (not `descending`) in older versions.
1703
+ df = df.sort(sort_fields, descending=descending)
1704
+
1705
+ # Apply skip/limit
1706
+ if self._skip:
1707
+ df = df.slice(self._skip)
1708
+ if self._limit:
1709
+ df = df.head(self._limit)
1710
+
1711
+ return df
1712
+
1713
+ def _to_dataframe_regular(self) -> pd.DataFrame:
1714
+ """
1715
+ Convert to DataFrame without acceleration.
1716
+
1717
+ Uses regular PyMongo iteration. Fallback for:
1718
+ - Non-chunkable queries
1719
+ - No schema provided
1720
+ - Acceleration disabled
1721
+
1722
+ Returns:
1723
+ Pandas DataFrame
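+
+ Example (illustrative; hypothetical document shape):
+ A document {"metadata": {"sensor_id": 1}, "value": 2.5} becomes one row
+ with columns "metadata.sensor_id" and "value", because
+ pd.json_normalize() flattens nested documents into dot-separated
+ column names.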
1724
+ """
1725
+ # Collect all documents - __iter__ will set _started
1726
+ # Convert to DataFrame
1727
+ return pd.json_normalize(list(self))
1728
+
1729
+ def _to_polars_regular(self) -> "pl.DataFrame":
1730
+ """
1731
+ Convert to Polars DataFrame without acceleration.
1732
+
1733
+ Uses regular PyMongo iteration with native Polars conversion.
1734
+ Fallback for:
1735
+ - Non-chunkable queries
1736
+ - No schema provided
1737
+ - Acceleration disabled
1738
+
1739
+ Returns:
1740
+ Polars DataFrame
1741
+
1742
+ Note:
1743
+ Uses pl.from_dicts() which handles nested documents by creating
1744
+ struct columns. For flattened column names like pandas json_normalize,
1745
+ you would need to unnest() afterwards.
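+
+ Example (illustrative; hypothetical field names):
+ raw = pl.from_dicts([{"metadata": {"sensor_id": 1}, "value": 2.5}])
+ # raw.columns == ["metadata", "value"]; "metadata" is a struct column
+ flat = raw.unnest("metadata")
+ # flat.columns == ["sensor_id", "value"]; the parent prefix is dropped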
1746
+ """
1747
+ # Collect all documents - __iter__ will set _started
1748
+ docs = list(self)
1749
+
1750
+ if not docs:
1751
+ return pl.DataFrame()
1752
+
1753
+ return pl.from_dicts(docs)
1754
+
1755
+ def _to_dataframe_accelerated(
1756
+ self,
1757
+ cache_read: bool,
1758
+ cache_write: bool,
1759
+ start_date: Optional[datetime] = None,
1760
+ end_date: Optional[datetime] = None,
1761
+ coerce: Literal["raise", "error"] = "raise",
1762
+ max_workers: int = 4,
1763
+ chunking_granularity: Optional[timedelta] = None,
1764
+ is_chunkable: bool = True,
1765
+ flush_ram_limit_mb: int = 512,
1766
+ row_group_size: Optional[int] = None,
1767
+ ) -> pd.DataFrame:
1768
+ """
1769
+ Convert to DataFrame using parallel execution with Parquet caching.
1770
+
1771
+ ┌─────────────────────────────────────────────────────────────────────┐
1772
+ │ DATA FLOW - ACCELERATED EXECUTION: │
1773
+ │ │
1774
+ │ This is where the XLR8 magic happens. The flow is: │
1775
+ │ │
1776
+ │ 1. CACHE CHECK │
1777
+ │ Input: self._filter hashed to "abc123def" │
1778
+ │ Check: Does .cache/abc123def/*.parquet exist? │
1779
+ │ If yes -> Read directly from Parquet (instant!) │
1780
+ │ │
1781
+ │ 2. CACHE MISS -> PARALLEL FETCH (if chunkable) │
1782
+ │ Calls: execute_parallel_stream_to_cache() │
1783
+ │ Which does: │
1784
+ │ a) Build brackets from query (analysis/brackets.py) │
1785
+ │ Query -> [Bracket(static_filter, time_range), ...] │
1786
+ │ b) Plan execution (execution/planner.py) │
1787
+ │ Time range + RAM -> workers=N, batch_size=M │
1788
+ │ c) Chunk time ranges (analysis/chunker.py) │
1789
+ │ 6 months -> X chunks based on granularity │
1790
+ │ d) Parallel fetch (Rust backend fetch_chunks_bson) │
1791
+ │ N async workers pull chunks from queue │
1792
+ │ e) Stream to Parquet (Rust backend) │
1793
+ │ Each worker writes part files: part_0000.parquet, etc. │
1794
+ │ │
1795
+ │ 2b. CACHE MISS -> SINGLE-WORKER FETCH (if not chunkable) │
1796
+ │ - Single worker fetches all data │
1797
+ │ - No async, no chunking │
1798
+ │ - Still writes to Parquet for caching │
1799
+ │ │
1800
+ │ 3. READ FROM CACHE │
1801
+ │ After fetch, read the Parquet files we just wrote │
1802
+ │ Optionally filter by start_date/end_date │
1803
+ │ Returns: pandas DataFrame with original values │
1804
+ │ │
1805
+ │ EXAMPLE TIMING (500K docs): │
1806
+ │ - Cache hit: 0.5s (read Parquet) │
1807
+ │ - Cache miss: 10-15s (parallel fetch + write + read) │
1808
+ │ - Without XLR8: 30-40s (sequential cursor iteration) │
1809
+ └─────────────────────────────────────────────────────────────────────┘
1810
+
1811
+ Args:
1812
+ cache_read: Read from cache if available
1813
+ cache_write: Write to cache after fetching
1814
+ start_date: Filter cached data from this date (inclusive, tz-aware)
1815
+ end_date: Filter cached data until this date (exclusive, tz-aware)
1816
+ coerce: Error handling mode ("raise" or "error")
1817
+ max_workers: Maximum parallel workers (passed from to_dataframe)
1818
+ chunking_granularity: Time granularity for chunking (passed from to_dataframe)
1819
+ is_chunkable: Whether query is chunkable (determines parallel vs single-worker)
+ flush_ram_limit_mb: Peak RAM budget in MB for streaming writes and DuckDB sorting
+ row_group_size: Optional Parquet row group size used when writing the cache
1820
+
1821
+ Returns:
1822
+ Pandas DataFrame with accelerated query results
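+
+ Example (illustrative; cache layout as described above):
+ The first call for a given filter misses the cache: documents are
+ fetched, streamed to .cache/<query_hash>/*.parquet, then read back.
+ A later call with the same filter/projection/sort is served from
+ those shards directly; start_date/end_date only narrow what is read.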
1823
+ """
1824
+ schema = self._collection.schema
1825
+ time_field = schema.time_field
1826
+
1827
+ # Mark as started
1828
+ if not self._started:
1829
+ self._started = True
1830
+
1831
+ # ─────────────────────────────────────────────────────────────────────
1832
+ # STEP 1: Create cache manager (hashes query to unique directory)
1833
+ # Example: filter_dict hashes to "abc123def" -> .cache/abc123def/
1834
+ # ─────────────────────────────────────────────────────────────────────
1835
+ cache = CacheManager(
1836
+ filter_dict=self._filter,
1837
+ projection=self._projection,
1838
+ sort=self._sort,
1839
+ )
1840
+
1841
+ # ─────────────────────────────────────────────────────────────────────
1842
+ # STEP 2: Check cache - if hit, read directly from Parquet
1843
+ # Example: .cache/abc123def/ts_1704067200_1704070800_part_0000.parquet
1844
+ # ─────────────────────────────────────────────────────────────────────
1845
+ if cache_read and cache.exists():
1846
+ print(f"[Cache] Reading from cache: {cache.cache_dir}")
1847
+ reader = ParquetReader(cache.cache_dir)
1848
+
1849
+ # Check if we need DuckDB sorting (Any types or List types)
1850
+ need_duckdb_sort = False
1851
+ sort_infos: List[Dict[str, Any]] = []
1852
+ if self._sort:
1853
+ sort_infos = get_sort_field_info(self._sort, schema)
1854
+
1855
+ # Expand parent fields to children and collect all fields to check
1856
+ fields_to_check = []
1857
+ for info in sort_infos:
1858
+ if info["is_parent"]:
1859
+ # Parent field - check all children
1860
+ fields_to_check.extend(info["child_fields"])
1861
+ else:
1862
+ # Direct field
1863
+ fields_to_check.append(info["field"])
1864
+
1865
+ # Check if any of the actual sort fields (after expansion) are Any/List types
1866
+ for field in fields_to_check:
1867
+ if field in schema.fields:
1868
+ field_type = schema.fields[field]
1869
+ if isinstance(field_type, (AnyType, ListType)):
1870
+ need_duckdb_sort = True
1871
+ break
1872
+
1873
+ if self._sort and need_duckdb_sort:
1874
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
1875
+ print("[Sort] Using DuckDB for Types.Any()/Types.List() sorting...")
1876
+ df = cast(
1877
+ pd.DataFrame,
1878
+ reader.get_globally_sorted_dataframe(
1879
+ sort_spec=self._sort,
1880
+ schema=schema,
1881
+ time_field=time_field,
1882
+ start_date=start_date,
1883
+ end_date=end_date,
1884
+ coerce=coerce,
1885
+ memory_limit_mb=flush_ram_limit_mb,
1886
+ threads=max_workers,
1887
+ ),
1888
+ )
1889
+ else:
1890
+ # Normal read + native pandas sort
1891
+ df = cast(
1892
+ pd.DataFrame,
1893
+ reader.to_dataframe(
1894
+ engine="pandas",
1895
+ schema=schema,
1896
+ time_field=time_field,
1897
+ start_date=start_date,
1898
+ end_date=end_date,
1899
+ coerce=coerce,
1900
+ ),
1901
+ )
1902
+
1903
+ # Native pandas sort - expand parent fields to children
1904
+ if self._sort:
1905
+ expanded_sort = []
1906
+ for info in sort_infos:
1907
+ if info["is_parent"]:
1908
+ for child in info["child_fields"]:
1909
+ expanded_sort.append((child, info["direction"]))
1910
+ else:
1911
+ expanded_sort.append((info["field"], info["direction"]))
1912
+
1913
+ sort_fields = [
1914
+ field for field, _ in expanded_sort if field in df.columns
1915
+ ]
1916
+ ascending = [
1917
+ direction == 1
1918
+ for field, direction in expanded_sort
1919
+ if field in df.columns
1920
+ ]
1921
+ if sort_fields:
1922
+ df = df.sort_values(
1923
+ by=sort_fields, ascending=ascending, na_position="last"
1924
+ )
1925
+ logger.debug("Sorted DataFrame by %s", sort_fields)
1926
+
1927
+ # Apply skip/limit if set
1928
+ if self._skip:
1929
+ df = df.iloc[self._skip :]
1930
+ if self._limit:
1931
+ df = df.iloc[: self._limit]
1932
+
1933
+ filter_info = ""
1934
+ if start_date or end_date:
1935
+ filter_info = f" (filtered: {start_date} to {end_date})"
1936
+ print(
1937
+ f"[OK] Loaded {len(df):,} documents from cache{filter_info} ({reader.get_statistics()['total_size_mb']:.1f} MB)"
1938
+ )
1939
+ return cast(pd.DataFrame, df)
1940
+
1941
+ # ─────────────────────────────────────────────────────────────────────
1942
+ # STEP 3: Cache miss - execute fetch and stream to Parquet
1943
+ # This is where the heavy lifting happens
1944
+ # ─────────────────────────────────────────────────────────────────────
1945
+ mode_str = "parallel" if is_chunkable else "single-worker"
1946
+ print(f"[Query] Cache miss - fetching from MongoDB ({mode_str} mode)...")
1947
+
1948
+ if cache_write:
1949
+ # CRITICAL: If cache_read=False but cache_write=True and cache exists,
1950
+ # we need to clear the old cache first to avoid duplicate data
1951
+ if not cache_read and cache.exists():
1952
+ print(
1953
+ "🧹 Clearing existing cache (cache_read=False, starting fresh)..."
1954
+ )
1955
+ cache.clean()
1956
+ # chunking_granularity is passed from to_dataframe()
1957
+ # If None, execute_parallel_stream_to_cache will use single-worker mode
1958
+
1959
+ # Streaming path: fetch -> encode -> write Parquet (memory efficient)
1960
+ result = execute_parallel_stream_to_cache(
1961
+ pymongo_collection=self._collection.pymongo_collection,
1962
+ filter_dict=self._filter,
1963
+ schema=schema,
1964
+ cache_manager=cache,
1965
+ projection=self._projection,
1966
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
1967
+ max_workers=max_workers, # From to_dataframe() parameter
1968
+ peak_ram_limit_mb=flush_ram_limit_mb,
1969
+ chunking_granularity=chunking_granularity, # None = single-worker mode
1970
+ mongo_uri=self._collection.mongo_uri,
1971
+ row_group_size=row_group_size,
1972
+ )
1973
+
1974
+ print("\n[Cache] Cache written:")
1975
+ print(f" - Total docs: {result['total_docs']:,}")
1976
+ print(f" - Total files: {result['total_files']}")
1977
+ print(f" - Workers: {result['workers']}")
1978
+ print(f" - Duration: {result['duration_s']:.2f}s")
1979
+ print(f" - Cache dir: {cache.cache_dir}")
1980
+
1981
+ # Now read from cache to build DataFrame (with optional date filter)
1982
+ print("\n[Cache] Reading from cache to build DataFrame...")
1983
+ reader = ParquetReader(cache.cache_dir)
1984
+
1985
+ # Check if we need DuckDB sorting (Any types or List types)
1986
+ need_duckdb_sort = False
1987
+ sort_infos: List[Dict[str, Any]] = []
1988
+ if self._sort:
1989
+ sort_infos = get_sort_field_info(self._sort, schema)
1990
+
1991
+ # Expand parent fields to children and collect all fields to check
1992
+ fields_to_check = []
1993
+ for info in sort_infos:
1994
+ if info["is_parent"]:
1995
+ # Parent field - check all children
1996
+ fields_to_check.extend(info["child_fields"])
1997
+ else:
1998
+ # Direct field
1999
+ fields_to_check.append(info["field"])
2000
+
2001
+ # Check if any of the actual sort fields (after expansion) are Any/List types
2002
+ for field in fields_to_check:
2003
+ if field in schema.fields:
2004
+ field_type = schema.fields[field]
2005
+ if isinstance(field_type, (AnyType, ListType)):
2006
+ need_duckdb_sort = True
2007
+ break
2008
+
2009
+ if self._sort and need_duckdb_sort:
2010
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
2011
+ print("[Sort] Using DuckDB for Types.Any()/Types.List() sorting...")
2012
+ df = cast(
2013
+ pd.DataFrame,
2014
+ reader.get_globally_sorted_dataframe(
2015
+ sort_spec=self._sort,
2016
+ schema=schema,
2017
+ time_field=time_field,
2018
+ start_date=start_date,
2019
+ end_date=end_date,
2020
+ coerce=coerce,
2021
+ memory_limit_mb=flush_ram_limit_mb,
2022
+ threads=max_workers,
2023
+ ),
2024
+ )
2025
+ else:
2026
+ # Normal read + native pandas sort
2027
+ df = cast(
2028
+ pd.DataFrame,
2029
+ reader.to_dataframe(
2030
+ engine="pandas",
2031
+ schema=schema,
2032
+ time_field=time_field,
2033
+ start_date=start_date,
2034
+ end_date=end_date,
2035
+ coerce=coerce,
2036
+ ),
2037
+ )
2038
+
2039
+ # Native pandas sort - expand parent fields to children
2040
+ if self._sort:
2041
+ expanded_sort = []
2042
+ for info in sort_infos:
2043
+ if info["is_parent"]:
2044
+ for child in info["child_fields"]:
2045
+ expanded_sort.append((child, info["direction"]))
2046
+ else:
2047
+ expanded_sort.append((info["field"], info["direction"]))
2048
+
2049
+ sort_fields = [
2050
+ field for field, _ in expanded_sort if field in df.columns
2051
+ ]
2052
+ ascending = [
2053
+ direction == 1
2054
+ for field, direction in expanded_sort
2055
+ if field in df.columns
2056
+ ]
2057
+ if sort_fields:
2058
+ df = df.sort_values(
2059
+ by=sort_fields, ascending=ascending, na_position="last"
2060
+ )
2061
+ logger.debug("Sorted DataFrame by %s", sort_fields)
2062
+
2063
+ else:
2064
+ # cache_write=False is not supported in the accelerated path.
2065
+ # Always write to cache for consistency and performance
2066
+ raise ValueError(
2067
+ "cache_write=False is not supported. "
2068
+ "XLR8 always writes to Parquet cache for memory efficiency. "
2069
+ "Set cache_read=False if you don't want to read from existing cache."
2070
+ )
2071
+
2072
+ # Apply skip/limit if set
2073
+ if self._skip:
2074
+ df = df.iloc[self._skip :]
2075
+ if self._limit:
2076
+ df = df.iloc[: self._limit]
2077
+
2078
+ return cast(pd.DataFrame, df)
2079
+
2080
+ def explain_acceleration(self) -> Dict[str, Any]:
2081
+ """
2082
+ Get query execution plan.
2083
+
2084
+ Returns an explanation of how the query will be executed:
2085
+ - Whether acceleration is possible
2086
+ - Time bounds extracted
2087
+ - Estimated chunk count
2088
+ - Worker configuration
2089
+
2090
+ Returns:
2091
+ Dict with execution plan details
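+
+ Example (illustrative output for a chunkable time-range query;
+ values are made up):
+ {
+ "filter": {...}, "projection": None, "skip": 0, "limit": 0,
+ "sort": None, "accelerated": True, "is_chunkable": True,
+ "reason": "...", "mode": "parallel",
+ "time_bounds": {"start": "2024-01-01T00:00:00+00:00",
+ "end": "2024-07-01T00:00:00+00:00"},
+ "estimated_chunks": 182,
+ }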
2092
+ """
2093
+ schema = self._collection.schema
2094
+
2095
+ result: Dict[str, Any] = {
2096
+ "filter": self._filter,
2097
+ "projection": self._projection,
2098
+ "skip": self._skip,
2099
+ "limit": self._limit,
2100
+ "sort": self._sort,
2101
+ "accelerated": False,
2102
+ }
2103
+
2104
+ if schema is None:
2105
+ result["reason"] = "No schema provided"
2106
+ return result
2107
+
2108
+ # build_brackets_for_find internally validates via is_chunkable_query
2109
+ is_chunkable, reason, brackets, bounds = build_brackets_for_find(
2110
+ self._filter,
2111
+ schema.time_field,
2112
+ self._sort, # Pass sort spec for $natural detection
2113
+ )
2114
+
2115
+ result["is_chunkable"] = is_chunkable
2116
+ result["reason"] = reason
2117
+
2118
+ # Distinguish REJECT vs SINGLE modes
2119
+ if not is_chunkable:
2120
+ # REJECT mode
2121
+ result["mode"] = "reject"
2122
+ elif is_chunkable and not brackets:
2123
+ # SINGLE mode - valid but not parallelizable
2124
+ result["mode"] = "single"
2125
+ else:
2126
+ # PARALLEL mode
2127
+ result["mode"] = "parallel"
2128
+
2129
+ if is_chunkable and bounds and bounds[0] and bounds[1]:
2130
+ start_bound = bounds[0]
2131
+ end_bound = bounds[1]
2132
+
2133
+ result["time_bounds"] = {
2134
+ "start": start_bound.isoformat(),
2135
+ "end": end_bound.isoformat(),
2136
+ }
2137
+
2138
+ chunks = chunk_time_range(
2139
+ start_bound, end_bound, chunk_size=timedelta(days=1)
2140
+ )
2141
+ result["estimated_chunks"] = len(chunks)
2142
+
2143
+ result["accelerated"] = True
2144
+
2145
+ return result