xlr8 0.1.7b2__cp313-cp313-macosx_10_12_x86_64.whl

@@ -0,0 +1,2155 @@
1
+ """
2
+ XLR8 cursor with PyMongo compatibility.
3
+
4
+ ================================================================================
5
+ DATA FLOW - CURSOR (WHERE ACCELERATION HAPPENS)
6
+ ================================================================================
7
+
8
+ This module is where the magic happens. When user calls cursor.to_dataframe(),
9
+ we decide whether to:
10
+ A) Use regular PyMongo iteration (slow)
11
+ B) Use accelerated parallel fetch + Parquet caching (fast)
12
+
13
+ DECISION FLOW:
14
+ ────────────────────────────────────────────────────────────────────────────────
15
+
16
+ cursor.to_dataframe() called
17
+
18
+
19
+ ┌─────────────────────────────┐
20
+ │ Is schema provided?        │─── No ──▶ raise ValueError (schema required)
21
+ └─────────────────────────────┘
22
+ │ Yes
23
+
24
+ ┌─────────────────────────────┐
25
+ │ Is query chunkable? │─── No ──▶ REGULAR PATH
26
+ │ (has time range, no │ (e.g., has $where or nested $or)
27
+ │ forbidden operators) │
28
+ └─────────────────────────────┘
29
+ │ Yes
30
+
31
+ ┌─────────────────────────────┐
32
+ │ Is data in cache? │─── Yes ─▶ READ FROM CACHE
33
+ │ (.cache/{query_hash}/*.parquet) (instant, ~100ms for 1M rows)
34
+ └─────────────────────────────┘
35
+ │ No
36
+
37
+ ┌─────────────────────────────┐
38
+ │ ACCELERATED PATH: │
39
+ │ 1. Build brackets │ ← analysis/brackets.py
40
+ │ 2. Plan execution │ ← execution/planner.py
41
+ │ 3. Chunk time ranges │ ← analysis/chunker.py
42
+ │ 4. Parallel async fetch │ ← Rust backend (fetch_chunks_bson)
43
+ │ 5. Stream to Parquet │ ← Rust backend writes shards
44
+ │ 6. Read back DataFrame │ ← storage/reader.py
45
+ └─────────────────────────────┘
46
+
47
+ EXAMPLE DATA TRANSFORMATIONS:
48
+ ────────────────────────────────────────────────────────────────────────────────
49
+
50
+ 1. INPUT QUERY (from user):
51
+ {
52
+ "$or": [
53
+ {"metadata.sensor_id": ObjectId("64a...")},
54
+ {"metadata.sensor_id": ObjectId("64b...")},
55
+ ],
56
+ "timestamp": {"$gte": datetime(2024, 1, 1), "$lt": datetime(2024, 7, 1)}
57
+ }
58
+
59
+ 2. AFTER BRACKET ANALYSIS (brackets.py):
60
+ [
61
+ Bracket(static={"metadata.sensor_id": "64a..."}, time=Jan-Jul),
62
+ Bracket(static={"metadata.sensor_id": "64b..."}, time=Jan-Jul),
63
+ ]
64
+
65
+ 3. AFTER CHUNKING (for each bracket):
66
+ Bracket 1 -> 13 chunks (14 days each for 6 months)
67
+ Bracket 2 -> 13 chunks
68
+ Total: 26 work items in queue
69
+
70
+ 4. PARALLEL FETCH (10 workers):
71
+ Worker 0: Chunk 1 -> 45,000 docs, write to part_0000.parquet
72
+ Worker 1: Chunk 2 -> 52,000 docs, write to part_0001.parquet
73
+ ...
74
+ Worker 9: Chunk 10 -> 38,000 docs, write to part_0009.parquet
75
+ (Rust async workers pull chunks as they finish)
76
+
77
+ 5. OUTPUT (DataFrame):
78
+ pandas.DataFrame with columns: [timestamp, metadata.device_id, value, ...]
79
+ 500,000 rows loaded from Parquet in ~0.5s
80
+
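+ MINIMAL USAGE SKETCH (illustrative only; `mongo_collection` and `my_schema`
+ are placeholders, and the schema's time field is assumed to be "timestamp"):
+ 
+ import xlr8
+ from datetime import datetime, timedelta, timezone
+ 
+ coll = xlr8.wrap(mongo_collection, schema=my_schema)
+ cursor = coll.find({
+ "timestamp": {
+ "$gte": datetime(2024, 1, 1, tzinfo=timezone.utc),
+ "$lt": datetime(2024, 7, 1, tzinfo=timezone.utc),
+ }
+ })
+ df = cursor.to_dataframe(
+ chunking_granularity=timedelta(days=14),  # enables parallel fetch
+ max_workers=10,
+ )
+ 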
81
+ ================================================================================
82
+ """
83
+
84
+ from __future__ import annotations
85
+
86
+ from typing import (
87
+ Any,
88
+ Callable,
89
+ Dict,
90
+ List,
91
+ Optional,
92
+ Union,
93
+ Iterator,
94
+ Literal,
95
+ Generator,
96
+ cast,
97
+ )
98
+ from datetime import datetime, date, timezone, timedelta
99
+ import logging
100
+ import warnings
101
+ import pandas as pd
102
+ import time
103
+ import pyarrow as pa
104
+ import polars as pl
105
+
106
+ logger = logging.getLogger(__name__)
107
+
108
+ # Import after logger to avoid circular imports
109
+ from xlr8.constants import DEFAULT_BATCH_SIZE
110
+ from xlr8.execution.callback import execute_partitioned_callback
111
+ from xlr8.analysis import (
112
+ build_brackets_for_find,
113
+ chunk_time_range,
114
+ get_sort_field_info,
115
+ validate_sort_field,
116
+ )
117
+ from xlr8.schema.types import Any as AnyType, List as ListType
118
+ from xlr8.storage import CacheManager, ParquetReader
119
+ from xlr8.execution import execute_parallel_stream_to_cache
120
+
121
+
122
+ def parse_datetime_tz_aware(
123
+ value: Union[datetime, date, str, None],
124
+ param_name: str = "date",
125
+ ) -> Optional[datetime]:
126
+ """
127
+ Parse a date/datetime value to a timezone-aware datetime.
128
+
129
+ Accepts:
130
+ - datetime (must be tz-aware or will assume UTC)
131
+ - date (converted to midnight UTC)
132
+ - ISO format string with timezone (e.g., "2024-01-15T10:30:00Z", "2024-01-15T10:30:00+00:00")
133
+
134
+ Args:
135
+ value: The date value to parse
136
+ param_name: Name of parameter for error messages
137
+
138
+ Returns:
139
+ Timezone-aware datetime or None if value is None
140
+
141
+ Raises:
142
+ ValueError: If string is not a valid ISO format or missing timezone
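+ 
+ Example (doctest-style sketch):
+ >>> parse_datetime_tz_aware("2024-01-15T10:30:00Z", "start_date")
+ datetime.datetime(2024, 1, 15, 10, 30, tzinfo=datetime.timezone.utc)
+ >>> parse_datetime_tz_aware(date(2024, 1, 15), "start_date")
+ datetime.datetime(2024, 1, 15, 0, 0, tzinfo=datetime.timezone.utc)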
143
+ """
144
+ if value is None:
145
+ return None
146
+
147
+ if isinstance(value, datetime):
148
+ if value.tzinfo is None:
149
+ # Assume UTC for naive datetimes
150
+ return value.replace(tzinfo=timezone.utc)
151
+ return value
152
+
153
+ if isinstance(value, date):
154
+ # Convert date to midnight UTC
155
+ return datetime(value.year, value.month, value.day, tzinfo=timezone.utc)
156
+
157
+ if isinstance(value, str):
158
+ # Try parsing ISO format
159
+ try:
160
+ # Python 3.11+ has datetime.fromisoformat with better Z support
161
+ # For compatibility, handle Z suffix manually
162
+ if value.endswith("Z"):
163
+ value = value[:-1] + "+00:00"
164
+
165
+ dt = datetime.fromisoformat(value)
166
+
167
+ if dt.tzinfo is None:
168
+ raise ValueError(
169
+ f"{param_name}: Timezone-aware datetime required. "
170
+ f"Got '{value}' without timezone. "
171
+ f"Use ISO format with timezone like '2024-01-15T10:30:00Z' or '2024-01-15T10:30:00+00:00'"
172
+ )
173
+ return dt
174
+ except ValueError as e:
175
+ if "Timezone-aware" in str(e):
176
+ raise
177
+ raise ValueError(
178
+ f"{param_name}: Invalid datetime string '{value}'. "
179
+ f"Use ISO format with timezone like '2024-01-15T10:30:00Z' or '2024-01-15T10:30:00+00:00'"
180
+ ) from e
181
+
182
+ raise TypeError(
183
+ f"{param_name}: Expected datetime, date, or ISO string, got {type(value).__name__}"
184
+ )
185
+
186
+
187
+ class XLR8Cursor:
188
+ """
189
+ PyMongo-compatible cursor with acceleration support.
190
+
191
+ Acts as drop-in replacement for pymongo.cursor.Cursor but can
192
+ accelerate queries through parallel execution and Parquet caching.
193
+
194
+ Key differences from PyMongo:
195
+ - to_dataframe() / to_polars() for efficient DataFrame conversion
196
+ - Transparent acceleration when query is chunkable
197
+ - Maintains full PyMongo API compatibility for iteration
198
+
199
+ Example:
200
+ >>> cursor = collection.find({"timestamp": {"$gte": start, "$lt": end}})
201
+ >>> df = cursor.to_dataframe() # Accelerated execution
202
+ >>>
203
+ >>> # Or use like regular PyMongo cursor:
204
+ >>> for doc in cursor:
205
+ ... logging.debug(doc)
206
+ """
207
+
208
+ def __init__(
209
+ self,
210
+ collection: Any, # XLR8Collection
211
+ query_filter: Dict[str, Any],
212
+ projection: Optional[Dict[str, Any]] = None,
213
+ skip: int = 0,
214
+ limit: int = 0,
215
+ sort: Optional[List[tuple]] = None,
216
+ batch_size: int = 1000,
217
+ ):
218
+ """
219
+ Initialize cursor.
220
+
221
+ Args:
222
+ collection: Parent XLR8Collection
223
+ query_filter: Query filter dict
224
+ projection: Field projection dict
225
+ skip: Number of documents to skip
226
+ limit: Maximum documents to return (0 = unlimited)
227
+ sort: List of (field, direction) tuples
228
+ batch_size: Batch size for iteration
229
+ """
230
+ self._collection = collection
231
+ self._filter = query_filter
232
+ self._projection = projection
233
+ self._skip = skip
234
+ self._limit = limit
235
+ self._sort = sort
236
+ self._batch_size = batch_size
237
+
238
+ # Iteration state
239
+ self._started = False
240
+ self._pymongo_cursor: Optional[Any] = None
241
+ self._exhausted = False
242
+
243
+ def __iter__(self) -> Iterator[Dict[str, Any]]:
244
+ """Iterate over documents."""
245
+ if not self._started:
246
+ self._started = True
247
+ # Create actual PyMongo cursor for iteration
248
+ self._ensure_pymongo_cursor()
249
+
250
+ if self._pymongo_cursor is None:
251
+ return iter([])
252
+
253
+ return iter(self._pymongo_cursor)
254
+
255
+ def __next__(self) -> Dict[str, Any]:
256
+ """Get next document."""
257
+ if not self._started:
258
+ self.__iter__()
259
+
260
+ if self._pymongo_cursor is None:
261
+ raise StopIteration
262
+
263
+ return next(self._pymongo_cursor)
264
+
265
+ def _ensure_pymongo_cursor(self) -> None:
266
+ """Lazily create PyMongo cursor only when needed for iteration/delegation."""
267
+ if self._pymongo_cursor is None:
268
+ self._pymongo_cursor = self._collection.pymongo_collection.find(
269
+ filter=self._filter,
270
+ projection=self._projection,
271
+ skip=self._skip,
272
+ limit=self._limit,
273
+ sort=self._sort,
274
+ batch_size=self._batch_size,
275
+ )
276
+
277
+ def raw_cursor(self):
278
+ """
279
+ Get direct access to underlying PyMongo cursor.
280
+
281
+ This is an escape hatch for power users who need access to PyMongo cursor
282
+ methods not explicitly implemented in XLR8Cursor.
283
+
284
+ Returns:
285
+ pymongo.cursor.Cursor: The underlying PyMongo cursor
286
+
287
+ Example:
288
+ >>> cursor = collection.find(...)
289
+ >>> cursor.raw_cursor().comment("my query").max_time_ms(5000)
290
+ """
291
+ self._ensure_pymongo_cursor()
292
+ return self._pymongo_cursor
293
+
294
+ def __getattr__(self, name: str) -> Any:
295
+ """
296
+ Delegate unknown attributes to underlying PyMongo cursor.
297
+
298
+ This provides transparent access to all PyMongo cursor methods while
299
+ preserving XLR8's accelerated methods.
300
+
301
+ Note: PyMongo cursor is created lazily only when delegation is needed.
302
+ For explicit access, use .raw_cursor()
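+ 
+ Example (illustrative; comment() and max_time_ms() are standard PyMongo
+ cursor methods reached through this delegation):
+ >>> c = collection.find({}).comment("audit").max_time_ms(5000)
+ >>> isinstance(c, XLR8Cursor)  # cursor-returning methods keep the wrapper
+ True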
303
+ """
304
+ # Avoid infinite recursion
305
+ if name.startswith("_"):
306
+ raise AttributeError(
307
+ f"'{type(self).__name__}' object has no attribute '{name}'"
308
+ )
309
+
310
+ # Create PyMongo cursor if needed
311
+ self._ensure_pymongo_cursor()
312
+
313
+ # Get attribute from PyMongo cursor
314
+ attr = getattr(self._pymongo_cursor, name)
315
+
316
+ # If it's a method that returns cursor, wrap the result
317
+ if callable(attr):
318
+
319
+ def wrapper(*args, **kwargs):
320
+ result = attr(*args, **kwargs)
321
+ # If PyMongo method returns cursor, it returns self (the PyMongo cursor)
322
+ # We want to return our wrapper instead
323
+ if result is self._pymongo_cursor:
324
+ return self
325
+ return result
326
+
327
+ return wrapper
328
+
329
+ return attr
330
+
331
+ def __enter__(self):
332
+ """Context manager entry."""
333
+ return self
334
+
335
+ def __exit__(self, exc_type, exc_val, exc_tb):
336
+ """Context manager exit."""
337
+ self.close()
338
+
339
+ # PyMongo compatibility methods
340
+
341
+ def skip(self, count: int) -> "XLR8Cursor":
342
+ """
343
+ Skip documents.
344
+
345
+ Args:
346
+ count: Number of documents to skip
347
+
348
+ Returns:
349
+ Self for chaining
350
+ """
351
+ if self._started:
352
+ raise RuntimeError("Cannot modify cursor after iteration started")
353
+
354
+ self._skip = count
355
+ return self
356
+
357
+ def limit(self, count: int) -> "XLR8Cursor":
358
+ """
359
+ Limit result count.
360
+
361
+ Args:
362
+ count: Maximum documents to return
363
+
364
+ Returns:
365
+ Self for chaining
366
+ """
367
+ if self._started:
368
+ raise RuntimeError("Cannot modify cursor after iteration started")
369
+
370
+ self._limit = count
371
+ return self
372
+
373
+ def sort(
374
+ self, key_or_list: Union[str, List[tuple]], direction: int = 1
375
+ ) -> "XLR8Cursor":
376
+ """
377
+ Sort results.
378
+
379
+ Automatically adds _id as final tie-breaker for deterministic ordering
380
+ (matching MongoDB's behavior).
381
+
382
+ Args:
383
+ key_or_list: Field name or list of (field, direction) tuples
384
+ direction: Sort direction (1=ascending, -1=descending)
385
+
386
+ Returns:
387
+ Self for chaining
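+ 
+ Example (sketch; field names are illustrative):
+ >>> cursor.sort("timestamp", -1)  # single field, descending
+ >>> cursor.sort([("timestamp", 1), ("metadata.device_id", 1)])  # compound sort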
388
+ """
389
+ if self._started:
390
+ raise RuntimeError("Cannot modify cursor after iteration started")
391
+
392
+ if isinstance(key_or_list, str):
393
+ self._sort = [(key_or_list, direction)]
394
+ else:
395
+ self._sort = key_or_list
396
+
397
+ return self
398
+
399
+ def batch_size(self, size: int) -> "XLR8Cursor":
400
+ """
401
+ Set batch size for iteration.
402
+
403
+ Args:
404
+ size: Batch size
405
+
406
+ Returns:
407
+ Self for chaining
408
+ """
409
+ if self._started:
410
+ raise RuntimeError("Cannot modify cursor after iteration started")
411
+
412
+ self._batch_size = size
413
+ return self
414
+
415
+ def close(self) -> None:
416
+ """Close cursor and free resources."""
417
+ if self._pymongo_cursor is not None:
418
+ self._pymongo_cursor.close()
419
+ self._pymongo_cursor = None
420
+ self._exhausted = True
421
+
422
+ # count() and distinct() removed - use __getattr__ delegation to PyMongo
423
+ # These are available via: cursor.count(), cursor.distinct()
424
+ # __getattr__ automatically forwards them to the underlying PyMongo cursor
425
+
426
+ # XLR8-specific acceleration methods
427
+
428
+ def to_dataframe(
429
+ self,
430
+ accelerate: bool = True,
431
+ cache_read: bool = True,
432
+ cache_write: bool = True,
433
+ start_date: Optional[Union[datetime, date, str]] = None,
434
+ end_date: Optional[Union[datetime, date, str]] = None,
435
+ coerce: Literal["raise", "error"] = "raise",
436
+ max_workers: int = 4,
437
+ chunking_granularity: Optional[timedelta] = None,
438
+ row_group_size: Optional[int] = None,
439
+ flush_ram_limit_mb: int = 512,
440
+ ) -> pd.DataFrame:
441
+ """
442
+ Convert results to Pandas DataFrame with optional acceleration.
443
+
444
+ This is the main acceleration entry point. If the query is chunkable
445
+ and acceleration is enabled, uses parallel execution and Parquet caching
446
+ for up to a 4x speedup on large result sets.
447
+
448
+
449
+ DATA FLOW - ACCELERATION DECISION:
450
+
451
+ INPUT: self._filter (the MongoDB query)
452
+ Example: {
453
+ "timestamp": {"$gte": datetime(2024,1,1), "$lt": datetime(...)},
454
+ "$or": [{"metadata.sensor_id": ObjectId("64a...")}]
455
+ }
456
+
457
+ DECISION STEPS:
458
+ 1. Check if schema exists -> No: raise error (schema required)
459
+ 2. Check if query is chunkable -> No: single-worker, still Parquet
460
+ (is_chunkable_query checks for time bounds, forbidden ops)
461
+ 3. If chunkable: use parallel workers based on time span
462
+
463
+ OUTPUT: pandas.DataFrame with columns from schema
464
+ Example columns: [timestamp, metadata.device_id, value]
465
+
466
+ PERFORMANCE (illustrative only; actual numbers depend on data size,
467
+ schema, cache state, etc.):
468
+ - Regular path: ~30s for 500K docs (sequential cursor iteration)
469
+ - Accelerated path: ~10s for 500K docs (parallel + caching)
470
+ - Cache hit: ~0.5s for 500K docs (read from Parquet)
471
+
472
+ Args:
473
+ accelerate: Enable acceleration if query is chunkable
474
+ cache_read: Read from Parquet cache if available
475
+ cache_write: Write results to Parquet cache
476
+ start_date: Filter cached data from this date (inclusive).
477
+ Accepts datetime, date, or ISO string with timezone.
478
+ Example: "2024-01-15T00:00:00Z" or datetime with tzinfo
479
+ end_date: Filter cached data until this date (exclusive).
480
+ Accepts datetime, date, or ISO string with timezone.
481
+ coerce: Error handling mode:
482
+ - "raise": Raise exceptions on schema validation errors (default)
483
+ - "error": Log errors and store None for invalid values
484
+ max_workers: Maximum parallel workers (default: 4). More workers use
485
+ more RAM but process faster. Set to 1 for single-threaded.
486
+ Only used when chunking_granularity is provided.
487
+ chunking_granularity: Time granularity for chunking the query.
488
+ Example: timedelta(days=1) chunks by day, timedelta(hours=1) by hour.
489
+ REQUIRED for parallel execution - determines chunk boundaries.
490
+ If None, single-worker mode is used (no parallelization).
491
+ row_group_size: Rows per Parquet row group. If None, Rust default is used.
492
+ flush_ram_limit_mb: RAM limit in MB for buffered data before flushing to
493
+ Parquet. Higher values mean fewer files but more memory usage.
494
+ (default: 512)
495
+
496
+ Returns:
497
+ Pandas DataFrame with results
498
+
499
+ Raises:
500
+ ValueError: If no schema is provided (schema is required for acceleration)
501
+ ValueError: If date strings are not timezone-aware
502
+
503
+ Example:
504
+ >>> cursor = collection.find({
505
+ ... "timestamp": {"$gte": start, "$lt": end},
506
+ ... "status": "active"
507
+ ... })
508
+ >>> df = cursor.to_dataframe()  # Parquet-cached; single-worker unless chunking_granularity is set
509
+ >>>
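+ >>> # Parallel fetch needs an explicit chunking granularity (sketch):
+ >>> from datetime import timedelta
+ >>> df = cursor.to_dataframe(
+ ...     chunking_granularity=timedelta(days=7),
+ ...     max_workers=8,
+ ... )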
510
+ """
511
+ # Schema is required for acceleration
512
+ schema = self._collection.schema
513
+ if schema is None:
514
+ raise ValueError(
515
+ "Schema is required for to_dataframe(). "
516
+ "Provide a schema when creating the collection: "
517
+ "xlr8_collection = xlr8.wrap(collection, schema=my_schema)"
518
+ )
519
+
520
+ # CRITICAL: Validate projection doesn't exclude required fields
521
+ if self._projection:
522
+ # Check if projection is inclusion (has 1 values) or exclusion (has 0 values)
523
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
524
+ is_inclusion = any(v == 1 for v in projection_values)
525
+
526
+ # Time field must be included (required for all operations)
527
+ if is_inclusion:
528
+ time_in_projection = (
529
+ schema.time_field in self._projection
530
+ and self._projection[schema.time_field] == 1
531
+ )
532
+ if not time_in_projection:
533
+ raise ValueError(
534
+ f"Projection must include time field '{schema.time_field}'. "
535
+ f"Projection: {self._projection}"
536
+ )
537
+
538
+ # Sort fields must be included
539
+ if self._sort:
540
+ for sort_field, _ in self._sort:
541
+ if is_inclusion:
542
+ if (
543
+ sort_field not in self._projection
544
+ or self._projection[sort_field] != 1
545
+ ):
546
+ raise ValueError(
547
+ f"Projection must include sort field '{sort_field}'. "
548
+ f"Cannot sort by a field that is projected out. "
549
+ f"Projection: {self._projection}"
550
+ )
551
+
552
+ # CRITICAL: If limit() or skip() are used, fall back to PyMongo
553
+ # Reason: Downloading all data just to return a subset is impractical
554
+ # MongoDB can efficiently handle limit/skip operations
555
+ if self._limit > 0 or self._skip > 0:
556
+ logger.info(
557
+ "limit() or skip() detected - falling back to PyMongo iteration "
558
+ "(acceleration would be impractical for subset queries)"
559
+ )
560
+ # Use fresh PyMongo cursor (not self which may be exhausted)
561
+ pymongo_cursor = self._collection.pymongo_collection.find(
562
+ self._filter, self._projection
563
+ )
564
+ if self._sort:
565
+ pymongo_cursor = pymongo_cursor.sort(self._sort)
566
+ if self._skip:
567
+ pymongo_cursor = pymongo_cursor.skip(self._skip)
568
+ if self._limit:
569
+ pymongo_cursor = pymongo_cursor.limit(self._limit)
570
+ if self._batch_size:
571
+ pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
572
+ return pd.json_normalize(list(pymongo_cursor))
573
+
574
+ # Validate sort field if specified
575
+ if self._sort:
576
+ sort_validation = validate_sort_field(self._sort, schema)
577
+ if not sort_validation.is_valid:
578
+ raise ValueError(f"Sort validation failed: {sort_validation.reason}")
579
+
580
+ # Parse and validate date filters
581
+ parsed_start = parse_datetime_tz_aware(start_date, "start_date")
582
+ parsed_end = parse_datetime_tz_aware(end_date, "end_date")
583
+
584
+ if not accelerate:
585
+ # Fallback to regular iteration (ignores date filters)
586
+ if parsed_start or parsed_end:
587
+ logger.warning(
588
+ "start_date/end_date filters are ignored when accelerate=False"
589
+ )
590
+ return self._to_dataframe_regular()
591
+
592
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
593
+ self._filter,
594
+ schema.time_field,
595
+ self._sort, # Pass sort spec for $natural detection
596
+ )
597
+
598
+ # Validate chunking_granularity if provided
599
+ # CRITICAL: If chunking_granularity is None, we CANNOT chunk the query
600
+ # because we don't know the data's time precision (could be ms, us, ns)
601
+ if chunking_granularity is not None:
602
+ if chunking_granularity.total_seconds() <= 0:
603
+ raise ValueError(
604
+ f"chunking_granularity must be positive, got {chunking_granularity}"
605
+ )
606
+
607
+ if not is_chunkable:
608
+ # REJECT mode - invalid query syntax or contradictory constraints
609
+ # This is different from SINGLE mode (where is_chunkable=True, brackets empty)
610
+ if parsed_start or parsed_end:
611
+ logger.warning(
612
+ "start_date/end_date filters are ignored for non-chunkable queries"
613
+ )
614
+ logger.info("Query has invalid syntax (%s) - cannot execute", reason)
615
+ return self._to_dataframe_accelerated(
616
+ cache_read=cache_read,
617
+ cache_write=cache_write,
618
+ start_date=parsed_start,
619
+ end_date=parsed_end,
620
+ coerce=coerce,
621
+ max_workers=1, # Single worker for invalid queries
622
+ chunking_granularity=None, # No chunking
623
+ is_chunkable=False,
624
+ )
625
+
626
+ # Check for SINGLE mode - valid query but single-worker fallback
627
+ # Indicated by: is_chunkable=True AND empty brackets
628
+ if is_chunkable and not brackets:
629
+ # SINGLE mode examples: $natural sort, unbounded $or branches
630
+ logger.info(
631
+ "Query valid but not parallelizable (%s) - using single-worker mode",
632
+ reason,
633
+ )
634
+ return self._to_dataframe_accelerated(
635
+ cache_read=cache_read,
636
+ cache_write=cache_write,
637
+ start_date=parsed_start,
638
+ end_date=parsed_end,
639
+ coerce=coerce,
640
+ max_workers=1, # Single worker for SINGLE mode
641
+ chunking_granularity=None, # No chunking
642
+ is_chunkable=False,
643
+ )
644
+
645
+ # Query IS chunkable, but do we have granularity info?
646
+ if chunking_granularity is None:
647
+ # No chunking_granularity provided - cannot parallelize safely
648
+ # because we don't know how to split the time range
649
+ logger.info(
650
+ "Query is chunkable but chunking_granularity not provided - "
651
+ "using single-worker mode. Provide chunking_granularity=timedelta(...) "
652
+ "to enable parallel execution."
653
+ )
654
+ return self._to_dataframe_accelerated(
655
+ cache_read=cache_read,
656
+ cache_write=cache_write,
657
+ start_date=parsed_start,
658
+ end_date=parsed_end,
659
+ coerce=coerce,
660
+ max_workers=1, # Single worker - no chunking info
661
+ chunking_granularity=None,
662
+ is_chunkable=False, # Treat as non-chunkable since we can't chunk
663
+ flush_ram_limit_mb=flush_ram_limit_mb, # Pass through for cache reading
664
+ row_group_size=row_group_size, # Pass through for DuckDB batch
665
+ )
666
+
667
+ # Use accelerated parallel execution - we have chunking info!
668
+ return self._to_dataframe_accelerated(
669
+ cache_read=cache_read,
670
+ cache_write=cache_write,
671
+ start_date=parsed_start,
672
+ end_date=parsed_end,
673
+ coerce=coerce,
674
+ max_workers=max_workers,
675
+ chunking_granularity=chunking_granularity,
676
+ is_chunkable=True,
677
+ flush_ram_limit_mb=flush_ram_limit_mb,
678
+ row_group_size=row_group_size,
679
+ )
680
+
681
+ def to_dataframe_batches(
682
+ self,
683
+ batch_size: int = DEFAULT_BATCH_SIZE,
684
+ cache_read: bool = True,
685
+ cache_write: bool = True,
686
+ start_date: Optional[Union[datetime, date, str]] = None,
687
+ end_date: Optional[Union[datetime, date, str]] = None,
688
+ coerce: Literal["raise", "error"] = "raise",
689
+ max_workers: int = 4,
690
+ chunking_granularity: Optional[timedelta] = None,
691
+ row_group_size: Optional[int] = None,
692
+ flush_ram_limit_mb: int = 512,
693
+ ) -> Generator[pd.DataFrame, None, None]:
694
+ """
695
+ Yield DataFrames in batches from cache without loading all data into memory.
696
+
697
+ This is a memory-efficient alternative to to_dataframe() for very large
698
+ result sets. Instead of loading the entire result into memory, it yields
699
+ smaller DataFrames that can be processed incrementally.
700
+
701
+
702
+ MEMORY-EFFICIENT BATCH PROCESSING:
703
+
704
+ Instead of:
705
+ df = cursor.to_dataframe() # Loads ALL 10M rows into RAM
706
+
707
+ Use:
708
+ for batch_df in cursor.to_dataframe_batches(batch_size=50000):
709
+ process(batch_df) # Only 50K rows in RAM at a time
710
+
711
+ Memory usage: O(batch_size) instead of O(total_rows)
712
+
713
+
714
+ Args:
715
+ batch_size: Number of rows per DataFrame batch (default: DEFAULT_BATCH_SIZE)
716
+ cache_read: Read from Parquet cache if available
717
+ cache_write: Write results to Parquet cache on cache miss
718
+ start_date: Filter cached data from this date (inclusive).
719
+ Accepts datetime, date, or ISO string with timezone.
720
+ end_date: Filter cached data until this date (exclusive).
721
+ coerce: Error handling mode ("raise" or "error")
722
+ max_workers: Maximum parallel workers for cache population (default: 4)
723
+ chunking_granularity: Time granularity for chunking (required for parallel fetch)
+ row_group_size: Rows per Parquet row group. If None, the Rust default is used.
+ flush_ram_limit_mb: RAM limit in MB for buffered data before flushing to
+ Parquet (default: 512)
724
+
725
+ Yields:
726
+ pd.DataFrame: Batches of rows as DataFrames
727
+
728
+ Raises:
729
+ ValueError: If no schema is provided
730
+ ValueError: If date strings are not timezone-aware
731
+ ValueError: If cache doesn't exist and cache_write=False
732
+
733
+ Example:
734
+ >>> # Process 10M rows without loading all into RAM
735
+ >>> total = 0
736
+ >>> for batch_df in cursor.to_dataframe_batches(batch_size=50000):
737
+ ... total += len(batch_df)
738
+ ... # Process batch_df...
739
+ >>> logging.debug(f"Processed {total} rows")
740
+ >>>
741
+ >>> # With date filtering:
742
+ >>> for batch_df in cursor.to_dataframe_batches(
743
+ ... batch_size=10000,
744
+ ... start_date="2024-06-01T00:00:00Z",
745
+ ... end_date="2024-06-15T00:00:00Z"
746
+ ... ):
747
+ ... analyze(batch_df)
748
+ """
749
+ # Schema is required
750
+ schema = self._collection.schema
751
+ if schema is None:
752
+ raise ValueError(
753
+ "Schema is required for to_dataframe_batches(). "
754
+ "Provide a schema when creating the collection."
755
+ )
756
+
757
+ # CRITICAL: If limit() or skip() are used, fall back to PyMongo
758
+ # Reason: Downloading all data just to return a subset is impractical
759
+ if self._limit > 0 or self._skip > 0:
760
+ logger.info(
761
+ "limit() or skip() detected - falling back to PyMongo iteration "
762
+ "(acceleration would be impractical for subset queries)"
763
+ )
764
+ # Use fresh PyMongo cursor in batches (not self which may be exhausted)
765
+ pymongo_cursor = self._collection.pymongo_collection.find(
766
+ self._filter, self._projection
767
+ )
768
+ if self._sort:
769
+ pymongo_cursor = pymongo_cursor.sort(self._sort)
770
+ if self._skip:
771
+ pymongo_cursor = pymongo_cursor.skip(self._skip)
772
+ if self._limit:
773
+ pymongo_cursor = pymongo_cursor.limit(self._limit)
774
+ if self._batch_size:
775
+ pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
776
+
777
+ batch = []
778
+ for doc in pymongo_cursor:
779
+ batch.append(doc)
780
+ if len(batch) >= batch_size:
781
+ yield pd.DataFrame(batch)
782
+ batch = []
783
+ if batch:
784
+ yield pd.DataFrame(batch)
785
+ return
786
+
787
+ # CRITICAL: Validate projection doesn't exclude required fields
788
+ if self._projection:
789
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
790
+ is_inclusion = any(v == 1 for v in projection_values)
791
+
792
+ # Time field must be included
793
+ if is_inclusion:
794
+ time_in_projection = (
795
+ schema.time_field in self._projection
796
+ and self._projection[schema.time_field] == 1
797
+ )
798
+ if not time_in_projection:
799
+ raise ValueError(
800
+ f"Projection must include time field '{schema.time_field}'. "
801
+ f"Projection: {self._projection}"
802
+ )
803
+
804
+ # Sort fields must be included
805
+ if self._sort:
806
+ for sort_field, _ in self._sort:
807
+ if is_inclusion:
808
+ if (
809
+ sort_field not in self._projection
810
+ or self._projection[sort_field] != 1
811
+ ):
812
+ raise ValueError(
813
+ f"Projection must include sort field '{sort_field}'. "
814
+ f"Cannot sort by a field that is projected out. "
815
+ f"Projection: {self._projection}"
816
+ )
817
+
818
+ time_field = schema.time_field
819
+
820
+ # Validate sort field if specified
821
+ if self._sort:
822
+ sort_validation = validate_sort_field(self._sort, schema)
823
+ if not sort_validation.is_valid:
824
+ raise ValueError(f"Sort validation failed: {sort_validation.reason}")
825
+ logger.info(
826
+ "Sorted streaming enabled - using DuckDB K-way merge for global sort order"
827
+ )
828
+
829
+ # Parse and validate date filters
830
+ parsed_start = parse_datetime_tz_aware(start_date, "start_date")
831
+ parsed_end = parse_datetime_tz_aware(end_date, "end_date")
832
+
833
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
834
+ self._filter,
835
+ time_field,
836
+ self._sort, # Pass sort spec for $natural detection
837
+ )
838
+
839
+ # Handle REJECT mode (is_chunkable=False)
840
+ if not is_chunkable:
841
+ warnings.warn(
842
+ f"Invalid query syntax ({reason}). Cannot execute this query.",
843
+ UserWarning,
844
+ stacklevel=2,
845
+ )
846
+ # Override max_workers to 1 for invalid queries
847
+ max_workers = 1
848
+ chunking_granularity = None
849
+
850
+ # Handle SINGLE mode (is_chunkable=True but empty brackets)
851
+ elif is_chunkable and not brackets:
852
+ warnings.warn(
853
+ f"Query valid but not parallelizable ({reason}). Using single-worker mode.",
854
+ UserWarning,
855
+ stacklevel=2,
856
+ )
857
+ # Override max_workers to 1 for SINGLE mode
858
+ max_workers = 1
859
+ chunking_granularity = None
860
+
861
+ # Mark as started
862
+ if not self._started:
863
+ self._started = True
864
+
865
+ # Create cache manager
866
+ cache = CacheManager(
867
+ filter_dict=self._filter,
868
+ projection=self._projection,
869
+ sort=self._sort,
870
+ )
871
+
872
+ # Ensure cache exists
873
+ if not cache.exists():
874
+ if not cache_write:
875
+ raise ValueError(
876
+ "Cache does not exist and cache_write=False. "
877
+ "Either call to_dataframe() first to populate cache, "
878
+ "or set cache_write=True."
879
+ )
880
+
881
+ # Populate cache first
882
+ logging.debug("[Query] Cache miss - fetching from MongoDB...")
883
+
884
+ # Populate cache via accelerated executor
885
+ result = execute_parallel_stream_to_cache(
886
+ pymongo_collection=self._collection.pymongo_collection,
887
+ filter_dict=self._filter,
888
+ schema=schema,
889
+ cache_manager=cache,
890
+ projection=self._projection,
891
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
892
+ max_workers=max_workers,
893
+ peak_ram_limit_mb=flush_ram_limit_mb,
894
+ chunking_granularity=chunking_granularity,
895
+ mongo_uri=self._collection.mongo_uri,
896
+ sort_spec=self._sort, # Pass sort for pre-sorting during Parquet write
897
+ row_group_size=row_group_size,
898
+ )
899
+
900
+ logging.debug(
901
+ f"\n[Cache] Cache written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
902
+ )
903
+
904
+ elif not cache_read and cache_write:
905
+ # CRITICAL: cache_read=False but cache_write=True and cache exists
906
+ # Clear old cache and re-populate to avoid duplicate data
907
+ logging.debug(
908
+ "[Clean] Clearing existing cache (cache_read=False, starting fresh)..."
909
+ )
910
+ cache.clean()
911
+
912
+ logging.debug("[Query] Re-fetching from MongoDB...")
913
+
914
+ # Re-populate cache via accelerated executor
915
+ result = execute_parallel_stream_to_cache(
916
+ pymongo_collection=self._collection.pymongo_collection,
917
+ filter_dict=self._filter,
918
+ schema=schema,
919
+ cache_manager=cache,
920
+ projection=self._projection,
921
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
922
+ max_workers=max_workers,
923
+ peak_ram_limit_mb=flush_ram_limit_mb,
924
+ chunking_granularity=chunking_granularity,
925
+ mongo_uri=self._collection.mongo_uri,
926
+ sort_spec=self._sort, # Pass sort for pre-sorting during Parquet write
927
+ row_group_size=row_group_size,
928
+ )
929
+
930
+ logging.debug(
931
+ f"\n[Cache] Cache re-written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
932
+ )
933
+
934
+ # Now yield batches from cache
935
+ logging.debug(f"[Cache] Streaming batches from cache: {cache.cache_dir}")
936
+ reader = ParquetReader(cache.cache_dir)
937
+
938
+ # Use globally sorted streaming if sort is specified
939
+ if self._sort:
940
+ logging.debug("[Sort] Using DuckDB K-way merge for globally sorted batches")
941
+ yield from reader.iter_globally_sorted_batches(
942
+ sort_spec=self._sort, # Pass full sort spec for multi-field sorting
943
+ batch_size=batch_size,
944
+ schema=schema,
945
+ time_field=time_field,
946
+ start_date=parsed_start,
947
+ end_date=parsed_end,
948
+ coerce=coerce,
949
+ memory_limit_mb=flush_ram_limit_mb, # Pass RAM limit to DuckDB
950
+ threads=max_workers, # Pass thread count to DuckDB
951
+ )
952
+ else:
953
+ yield from reader.iter_dataframe_batches(
954
+ batch_size=batch_size,
955
+ schema=schema,
956
+ time_field=time_field,
957
+ start_date=parsed_start,
958
+ end_date=parsed_end,
959
+ coerce=coerce,
960
+ )
961
+
962
+ def stream_to_callback(
963
+ self,
964
+ callback: Callable[["pa.Table", Dict[str, Any]], None],
965
+ *,
966
+ partition_time_delta: timedelta,
967
+ partition_by: Optional[Union[str, List[str]]] = None,
968
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
969
+ max_workers: int = 4,
970
+ chunking_granularity: Optional[timedelta] = None,
971
+ row_group_size: Optional[int] = None,
972
+ flush_ram_limit_mb: int = 512,
973
+ cache_read: bool = True,
974
+ cache_write: bool = True,
975
+ ) -> Dict[str, Any]:
976
+ """
977
+ Stream partitioned PyArrow tables to a callback function.
978
+
979
+ This is a two-phase operation:
980
+ 1. Download data from MongoDB to local Parquet cache (reuses Rust backend)
981
+ 2. Partition data and call callback in parallel for each partition
982
+
983
+ Perfect for populating data lakes with partitioned data structures.
984
+
985
+ ┌─────────────────────────────────────────────────────────────────────┐
986
+ │ PARTITION MODES: │
987
+ │ │
988
+ │ TIME ONLY (partition_by=None): │
989
+ │ partition_time_delta=timedelta(weeks=1) │
990
+ │ -> 1 callback per week of data │
991
+ │ │
992
+ │ TIME + FIELD (partition_by="metadata.instrument"): │
993
+ │ partition_time_delta=timedelta(weeks=1) │
994
+ │ -> 1 callback per (week, instrument) combination │
995
+ │ │
996
+ │ Example: 1 year of data, 10 instruments, weekly partitions │
997
+ │ -> 52 weeks × 10 instruments = 520 callbacks │
998
+ └─────────────────────────────────────────────────────────────────────┘
999
+
1000
+ The callback receives:
1001
+ - table: PyArrow Table with data for this partition
1002
+ - metadata: Dict with partition info:
1003
+ {
1004
+ "time_start": datetime, # Start of time bucket
1005
+ "time_end": datetime, # End of time bucket
1006
+ "partition_values": {...}, # Values for partition_by fields
1007
+ "row_count": int, # Rows in this table
1008
+ "partition_index": int, # 0-based partition index
1009
+ "total_partitions": int, # Total partition count
1010
+ }
1011
+
1012
+ Args:
1013
+ callback: Function(table: pa.Table, metadata: dict) -> None
1014
+ Called for each partition. Runs in ThreadPoolExecutor.
1015
+ partition_time_delta: Time bucket size for partitioning.
1016
+ Example: timedelta(weeks=1) creates weekly partitions.
1017
+ REQUIRED - determines how data is grouped.
1018
+ partition_by: Field(s) to partition by, in addition to time.
1019
+ Example: "metadata.instrument" or ["region", "device_id"]
1020
+ Can be any field in schema except time field.
1021
+ None = partition by time only.
1022
+ any_type_strategy: How to decode Types.Any() struct columns:
1023
+ - "float": Coalesce to Float64, prioritize numeric (default)
1024
+ - "string": Convert everything to string (lossless)
1025
+ - "keep_struct": Keep raw struct, don't decode
1026
+ max_workers: Number of parallel callback threads (default: 4).
1027
+ DuckDB releases GIL, so threads get true parallelism.
1028
+ chunking_granularity: Time granularity for MongoDB fetch chunks.
1029
+ Used during Phase 1 (download). Example: timedelta(hours=16).
1030
+ If None, defaults to partition_time_delta.
+ row_group_size: Rows per Parquet row group. If None, the Rust default is used.
1031
+ flush_ram_limit_mb: RAM limit for buffered data (default: 512).
1032
+ Used during both download and partition phases.
1033
+ cache_read: Read from existing cache if available (default: True).
1034
+ cache_write: Write to cache during download (default: True).
1035
+
1036
+ Returns:
1037
+ Dict with:
1038
+ - total_partitions: Number of partitions processed
1039
+ - total_rows: Total rows across all partitions
1040
+ - skipped_partitions: Empty partitions skipped
1041
+ - duration_s: Total execution time
1042
+ - cache_duration_s: Time spent on cache population
1043
+ - partition_duration_s: Time spent on partition callbacks
1044
+
1045
+ Raises:
1046
+ ValueError: If no schema provided
1047
+ ValueError: If query not chunkable (no time bounds)
1048
+ ValueError: If sort specified on non-time field
1049
+ RuntimeError: If callback fails for any partition
1050
+
1051
+ Example:
1052
+ >>> # Upload weekly data per instrument to S3 data lake
1053
+ >>> import pyarrow.parquet as pq
1054
+ >>> import s3fs
1055
+ >>>
1056
+ >>> fs = s3fs.S3FileSystem()
1057
+ >>>
1058
+ >>> def upload_partition(table, metadata):
1059
+ ... instrument = metadata['partition_values'].get('metadata.instrument', 'unknown')
1060
+ ... week = metadata['time_start'].strftime('%Y-%m-%d')
1061
+ ... path = f"s3://bucket/data/instrument={instrument}/week={week}.parquet"
1062
+ ... pq.write_table(table, path, filesystem=fs)
1063
+ >>>
1064
+ >>> cursor.stream_to_callback(
1065
+ ... callback=upload_partition,
1066
+ ... partition_time_delta=timedelta(weeks=1),
1067
+ ... partition_by="metadata.instrument",
1068
+ ... max_workers=8,
1069
+ ... chunking_granularity=timedelta(hours=16),
1070
+ ... )
1071
+ """
1072
+ total_start = time.time()
1073
+
1074
+ schema = self._collection.schema
1075
+ if schema is None:
1076
+ raise ValueError(
1077
+ "Schema is required for stream_to_callback(). "
1078
+ "Provide a schema when creating the collection."
1079
+ )
1080
+
1081
+ # CRITICAL: limit() and skip() don't make sense for streaming callbacks
1082
+ # These operations require knowing the full result set, which defeats
1083
+ # the purpose of streaming
1084
+ if self._limit > 0 or self._skip > 0:
1085
+ raise ValueError(
1086
+ "stream_to_callback() does not support limit() or skip(). "
1087
+ "These operations require knowing the total result set size upfront, "
1088
+ "which defeats the purpose of streaming. "
1089
+ "Use to_dataframe() or iterate with PyMongo cursor instead."
1090
+ )
1091
+
1092
+ time_field = schema.time_field
1093
+
1094
+ # CRITICAL: Validate projection doesn't exclude partition_by fields
1095
+ if self._projection and partition_by:
1096
+ # Check if projection is inclusion (has 1 values) or exclusion (has 0 values)
1097
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
1098
+ is_inclusion = any(v == 1 for v in projection_values)
1099
+
1100
+ # Time field must be included
1101
+ if is_inclusion:
1102
+ time_in_projection = (
1103
+ time_field in self._projection and self._projection[time_field] == 1
1104
+ )
1105
+ if not time_in_projection:
1106
+ raise ValueError(
1107
+ f"Projection must include time field '{time_field}'. "
1108
+ f"Projection: {self._projection}"
1109
+ )
1110
+
1111
+ # Partition fields must be included
1112
+ partition_by_list = (
1113
+ [partition_by] if isinstance(partition_by, str) else partition_by
1114
+ )
1115
+ for field in partition_by_list:
1116
+ if is_inclusion:
1117
+ # For parent fields like "metadata", check if any child is included
1118
+ field_or_children_included = (
1119
+ field in self._projection and self._projection[field] == 1
1120
+ ) or any(
1121
+ k.startswith(f"{field}.") and self._projection[k] == 1
1122
+ for k in self._projection.keys()
1123
+ )
1124
+ if not field_or_children_included:
1125
+ raise ValueError(
1126
+ f"Projection must include partition field '{field}'. "
1127
+ f"Cannot partition by a field that is projected out. "
1128
+ f"Projection: {self._projection}"
1129
+ )
1130
+
1131
+ # Validate sort fields in projection
1132
+ if self._projection and self._sort:
1133
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
1134
+ is_inclusion = any(v == 1 for v in projection_values)
1135
+ for sort_field, _ in self._sort:
1136
+ if is_inclusion:
1137
+ if (
1138
+ sort_field not in self._projection
1139
+ or self._projection[sort_field] != 1
1140
+ ):
1141
+ raise ValueError(
1142
+ f"Projection must include sort field '{sort_field}'. "
1143
+ f"Projection: {self._projection}"
1144
+ )
1145
+
1146
+ # Validate sort - only allow time field sorting
1147
+ if self._sort:
1148
+ for field, _direction in self._sort:
1149
+ if field != time_field:
1150
+ raise ValueError(
1151
+ f"stream_to_callback() only supports sorting by time field '{time_field}'. "
1152
+ f"Got sort field: '{field}'. "
1153
+ "Remove .sort() or sort only by time field."
1154
+ )
1155
+ # Store sort direction
1156
+ sort_ascending = self._sort[0][1] == 1
1157
+ else:
1158
+ sort_ascending = True # Default to ascending
1159
+
1160
+ # Normalize partition_by to list
1161
+ partition_by_list: Optional[List[str]] = None
1162
+ if partition_by is not None:
1163
+ if isinstance(partition_by, str):
1164
+ partition_by_list = [partition_by]
1165
+ else:
1166
+ partition_by_list = list(partition_by)
1167
+
1168
+ # Validate partition_by fields exist in schema (or are parent fields with children)
1169
+ all_schema_fields = list(schema.fields.keys())
1170
+ for field in partition_by_list:
1171
+ if field == time_field:
1172
+ raise ValueError(
1173
+ f"Cannot partition by time field '{time_field}'. "
1174
+ "Time partitioning is automatic via partition_time_delta."
1175
+ )
1176
+ # Check if field exists directly OR has children
1177
+ has_direct = schema.has_field(field)
1178
+ has_children = any(f.startswith(f"{field}.") for f in all_schema_fields)
1179
+ if not has_direct and not has_children:
1180
+ raise ValueError(
1181
+ f"Partition field '{field}' not found in schema. "
1182
+ f"Available fields: {all_schema_fields}"
1183
+ )
1184
+
1185
+ # Default chunking_granularity to partition_time_delta
1186
+ if chunking_granularity is None:
1187
+ chunking_granularity = partition_time_delta
1188
+
1189
+ # NEW: build_brackets_for_find internally validates via is_chunkable_query
1190
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
1191
+ self._filter,
1192
+ time_field,
1193
+ self._sort, # Pass sort spec for $natural detection
1194
+ )
1195
+
1196
+ # Handle REJECT mode (is_chunkable=False)
1197
+ if not is_chunkable:
1198
+ warnings.warn(
1199
+ f"Invalid query syntax ({reason}). Cannot execute this query.",
1200
+ UserWarning,
1201
+ stacklevel=2,
1202
+ )
1203
+ # Override max_workers to 1 for invalid queries
1204
+ max_workers = 1
1205
+ chunking_granularity = None
1206
+
1207
+ # Handle SINGLE mode (is_chunkable=True but empty brackets)
1208
+ elif is_chunkable and not brackets:
1209
+ warnings.warn(
1210
+ f"Query valid but not parallelizable ({reason}). Using single-worker mode.",
1211
+ UserWarning,
1212
+ stacklevel=2,
1213
+ )
1214
+ # Override max_workers to 1 for SINGLE mode
1215
+ max_workers = 1
1216
+ chunking_granularity = None
1217
+
1218
+ # Mark as started
1219
+ if not self._started:
1220
+ self._started = True
1221
+
1222
+ # ─────────────────────────────────────────────────────────────────────
1223
+ # PHASE 1: Download to cache (reuses existing Rust backend)
1224
+ # ─────────────────────────────────────────────────────────────────────
1225
+ cache = CacheManager(
1226
+ filter_dict=self._filter,
1227
+ projection=self._projection,
1228
+ sort=self._sort,
1229
+ )
1230
+
1231
+ cache_start = time.time()
1232
+
1233
+ if cache_read and cache.exists():
1234
+ logging.debug(f"[Cache] Using existing cache: {cache.cache_dir}")
1235
+ else:
1236
+ if not cache_write:
1237
+ raise ValueError(
1238
+ "Cache does not exist and cache_write=False. "
1239
+ "Set cache_write=True to download data first."
1240
+ )
1241
+
1242
+ if cache.exists() and not cache_read:
1243
+ logging.debug("[Clean] Clearing existing cache (cache_read=False)...")
1244
+ cache.clean()
1245
+
1246
+ logging.debug("[Query] Downloading from MongoDB to cache...")
1247
+ result = execute_parallel_stream_to_cache(
1248
+ pymongo_collection=self._collection.pymongo_collection,
1249
+ filter_dict=self._filter,
1250
+ schema=schema,
1251
+ cache_manager=cache,
1252
+ projection=self._projection,
1253
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
1254
+ max_workers=max_workers,
1255
+ peak_ram_limit_mb=flush_ram_limit_mb,
1256
+ chunking_granularity=chunking_granularity,
1257
+ mongo_uri=self._collection.mongo_uri,
1258
+ row_group_size=row_group_size,
1259
+ )
1260
+ logging.debug(
1261
+ f"[Cache] Downloaded: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
1262
+ )
1263
+
1264
+ cache_duration = time.time() - cache_start
1265
+
1266
+ # ─────────────────────────────────────────────────────────────────────
1267
+ # PHASE 2: Partition and stream to callbacks
1268
+ # ─────────────────────────────────────────────────────────────────────
1269
+
1270
+ partition_result = execute_partitioned_callback(
1271
+ cache_dir=str(cache.cache_dir),
1272
+ schema=schema,
1273
+ callback=callback,
1274
+ partition_time_delta=partition_time_delta,
1275
+ partition_by=partition_by_list,
1276
+ any_type_strategy=any_type_strategy,
1277
+ max_workers=max_workers,
1278
+ sort_ascending=sort_ascending,
1279
+ memory_limit_mb=flush_ram_limit_mb,
1280
+ )
1281
+
1282
+ total_duration = time.time() - total_start
1283
+
1284
+ return {
1285
+ "total_partitions": partition_result["total_partitions"],
1286
+ "total_rows": partition_result["total_rows"],
1287
+ "skipped_partitions": partition_result["skipped_partitions"],
1288
+ "duration_s": total_duration,
1289
+ "cache_duration_s": cache_duration,
1290
+ "partition_duration_s": partition_result["duration_s"],
1291
+ }
1292
+
1293
+ def to_polars(
1294
+ self,
1295
+ accelerate: bool = True,
1296
+ cache_read: bool = True,
1297
+ cache_write: bool = True,
1298
+ start_date: Optional[Union[datetime, date, str]] = None,
1299
+ end_date: Optional[Union[datetime, date, str]] = None,
1300
+ coerce: Literal["raise", "error"] = "raise",
1301
+ max_workers: int = 4,
1302
+ chunking_granularity: Optional[timedelta] = None,
1303
+ row_group_size: Optional[int] = None,
1304
+ any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
1305
+ flush_ram_limit_mb: int = 512,
1306
+ ) -> pl.DataFrame:
1307
+ """
1308
+ Convert results to Polars DataFrame with optional acceleration.
1309
+
1310
+ This mirrors to_dataframe() but returns a Polars DataFrame.
1311
+ Uses ParquetReader with engine="polars" for efficient native reading.
1312
+
1313
+ Args:
1314
+ accelerate: Enable acceleration if query is chunkable
1315
+ cache_read: Read from Parquet cache if available
1316
+ cache_write: Write results to Parquet cache
1317
+ start_date: Filter cached data from this date (inclusive).
1318
+ Accepts datetime, date, or ISO string with timezone.
1319
+ end_date: Filter cached data until this date (exclusive).
1320
+ coerce: Error handling mode ("raise" or "error")
1321
+ max_workers: Maximum parallel workers (default: 4)
1322
+ chunking_granularity: Time granularity for chunking (e.g., timedelta(days=1))
1323
+ row_group_size: Rows per parquet row group. If None, Rust default is used.
1324
+ any_type_strategy: How to decode Types.Any() struct columns:
1325
+ - "float": Coalesce to Float64, prioritize numeric (default)
1326
+ - "string": Convert everything to string (lossless)
1327
+ - "keep_struct": Keep raw struct, don't decode
1328
+ flush_ram_limit_mb: RAM limit in MB for buffered data before flushing.
1329
+ (default: 512)
1330
+
1331
+ Returns:
1332
+ Polars DataFrame with results
1333
+
1334
+ Raises:
1335
+ ValueError: If no schema is provided
1336
+
1337
+ Example:
1338
+ >>> cursor = collection.find({...}).sort("timestamp", 1)
1339
+ >>> df = cursor.to_polars(
1340
+ ... max_workers=8,
1341
+ ... chunking_granularity=timedelta(days=7),
1342
+ ... flush_ram_limit_mb=2000,
1343
+ ... )
1344
+ """
1345
+ schema = self._collection.schema
1346
+ if schema is None:
1347
+ raise ValueError(
1348
+ "Schema is required for to_polars(). "
1349
+ "Provide a schema when creating the collection."
1350
+ )
1351
+
1352
+ # CRITICAL: If limit() or skip() are used, fall back to PyMongo
1353
+ # Reason: Downloading all data just to return a subset is impractical
1354
+ if self._limit > 0 or self._skip > 0:
1355
+ logger.info(
1356
+ "limit() or skip() detected - falling back to PyMongo iteration "
1357
+ "(acceleration would be impractical for subset queries)"
1358
+ )
1359
+ # Use fresh PyMongo cursor (not self which may be exhausted)
1360
+ pymongo_cursor = self._collection.pymongo_collection.find(
1361
+ self._filter, self._projection
1362
+ )
1363
+ if self._sort:
1364
+ pymongo_cursor = pymongo_cursor.sort(self._sort)
1365
+ if self._skip:
1366
+ pymongo_cursor = pymongo_cursor.skip(self._skip)
1367
+ if self._limit:
1368
+ pymongo_cursor = pymongo_cursor.limit(self._limit)
1369
+ if self._batch_size:
1370
+ pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
1371
+ docs = list(pymongo_cursor)
1372
+ if not docs:
1373
+ return pl.DataFrame()
1374
+ return pl.DataFrame(docs)
1375
+
1376
+ # CRITICAL: Validate projection doesn't exclude required fields
1377
+ if self._projection:
1378
+ projection_values = [v for k, v in self._projection.items() if k != "_id"]
1379
+ is_inclusion = any(v == 1 for v in projection_values)
1380
+
1381
+ # Time field must be included
1382
+ if is_inclusion:
1383
+ time_in_projection = (
1384
+ schema.time_field in self._projection
1385
+ and self._projection[schema.time_field] == 1
1386
+ )
1387
+ if not time_in_projection:
1388
+ raise ValueError(
1389
+ f"Projection must include time field '{schema.time_field}'. "
1390
+ f"Projection: {self._projection}"
1391
+ )
1392
+
1393
+ # Sort fields must be included
1394
+ if self._sort:
1395
+ for sort_field, _ in self._sort:
1396
+ if is_inclusion:
1397
+ if (
1398
+ sort_field not in self._projection
1399
+ or self._projection[sort_field] != 1
1400
+ ):
1401
+ raise ValueError(
1402
+ f"Projection must include sort field '{sort_field}'. "
1403
+ f"Cannot sort by a field that is projected out. "
1404
+ f"Projection: {self._projection}"
1405
+ )
1406
+
1407
+ time_field = schema.time_field
1408
+
1409
+ # Validate sort field if specified
1410
+ if self._sort:
1411
+ sort_validation = validate_sort_field(self._sort, schema)
1412
+ if not sort_validation.is_valid:
1413
+ raise ValueError(f"Sort validation failed: {sort_validation.reason}")
1414
+
1415
+ # Parse and validate date filters
1416
+ parsed_start = parse_datetime_tz_aware(start_date, "start_date")
1417
+ parsed_end = parse_datetime_tz_aware(end_date, "end_date")
1418
+
1419
+ if not accelerate:
1420
+ if parsed_start or parsed_end:
1421
+ logger.warning(
1422
+ "start_date/end_date filters are ignored when accelerate=False"
1423
+ )
1424
+ # Fallback to regular iteration (native Polars from dicts)
1425
+ return self._to_polars_regular()
1426
+
1427
+ is_chunkable, reason, brackets, _ = build_brackets_for_find(
1428
+ self._filter,
1429
+ schema.time_field,
1430
+ self._sort, # Pass sort spec for $natural detection
1431
+ )
1432
+
1433
+ # Handle REJECT mode (is_chunkable=False)
1434
+ if not is_chunkable:
1435
+ if parsed_start or parsed_end:
1436
+ logger.warning(
1437
+ "start_date/end_date filters are ignored for non-chunkable queries"
1438
+ )
1439
+ logger.info("Invalid query syntax (%s) - cannot execute", reason)
1440
+ # Fall back to single-worker mode
1441
+ max_workers = 1
1442
+ chunking_granularity = None
1443
+
1444
+ # Handle SINGLE mode (is_chunkable=True but empty brackets)
1445
+ elif is_chunkable and not brackets:
1446
+ logger.info(
1447
+ "Query valid but not parallelizable (%s) - using single-worker mode",
1448
+ reason,
1449
+ )
1450
+ # Fall back to single-worker mode
1451
+ max_workers = 1
1452
+ chunking_granularity = None
1453
+
1454
+ # Create cache manager
1455
+ cache = CacheManager(
1456
+ filter_dict=self._filter,
1457
+ projection=self._projection,
1458
+ sort=self._sort,
1459
+ )
1460
+
1461
+ # Check if cache exists
1462
+ if cache_read and cache.exists():
1463
+ logging.debug(f"[Cache] Reading from cache (polars): {cache.cache_dir}")
1464
+ reader = ParquetReader(cache.cache_dir)
1465
+ df = cast(
1466
+ pl.DataFrame,
1467
+ reader.to_dataframe(
1468
+ engine="polars",
1469
+ schema=schema,
1470
+ time_field=time_field,
1471
+ start_date=parsed_start,
1472
+ end_date=parsed_end,
1473
+ coerce=coerce,
1474
+ any_type_strategy=any_type_strategy,
1475
+ ),
1476
+ )
1477
+
1478
+ # Check if we need DuckDB sorting (Any types or List types)
1479
+ need_duckdb_sort = False
1480
+ sort_infos: List[Dict[str, Any]] = []
1481
+ if self._sort:
1482
+ sort_infos = get_sort_field_info(self._sort, schema)
1483
+
1484
+ # Expand parent fields to children and collect all fields to check
1485
+ fields_to_check = []
1486
+ for info in sort_infos:
1487
+ if info["is_parent"]:
1488
+ # Parent field - check all children
1489
+ fields_to_check.extend(info["child_fields"])
1490
+ else:
1491
+ # Direct field
1492
+ fields_to_check.append(info["field"])
1493
+
1494
+ # Check if any of the actual sort fields (after expansion) are Any/List types
1495
+ for field in fields_to_check:
1496
+ if field in schema.fields:
1497
+ field_type = schema.fields[field]
1498
+ if isinstance(field_type, (AnyType, ListType)):
1499
+ need_duckdb_sort = True
1500
+ break
1501
+
1502
+ if self._sort and need_duckdb_sort:
1503
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
1504
+ logging.debug(
1505
+ "[Sort] Using DuckDB for Types.Any()/Types.List() sorting..."
1506
+ )
1507
+
1508
+ warnings.warn(
1509
+ "Sorting by Types.Any() field in to_polars returns raw struct columns "
1510
+ "(e.g., 'value.float_value', 'value.int64_value'). "
1511
+ "Use to_dataframe() for decoded Any() values.",
1512
+ UserWarning,
1513
+ )
1514
+
1515
+ # Use get_globally_sorted_dataframe() - more efficient than batching
1516
+ combined_df = reader.get_globally_sorted_dataframe(
1517
+ sort_spec=self._sort,
1518
+ schema=schema,
1519
+ time_field=time_field,
1520
+ start_date=parsed_start,
1521
+ end_date=parsed_end,
1522
+ coerce=coerce,
1523
+ )
1524
+
1525
+ if not combined_df.empty:
1526
+ for col in combined_df.columns:
1527
+ if combined_df[col].dtype == object:
1528
+ first_val = (
1529
+ combined_df[col].dropna().iloc[0]
1530
+ if not combined_df[col].dropna().empty
1531
+ else None
1532
+ )
1533
+ if (
1534
+ first_val is not None
1535
+ and type(first_val).__name__ == "ObjectId"
1536
+ ):
1537
+ combined_df[col] = combined_df[col].astype(str)
1538
+ df = pl.from_pandas(combined_df)
1539
+ else:
1540
+ df = pl.DataFrame()
1541
+
1542
+ elif self._sort:
1543
+ # Native Polars sort - expand parent fields to children
1544
+ expanded_sort = []
1545
+ for info in sort_infos:
1546
+ if info["is_parent"]:
1547
+ # Expand parent field to all children
1548
+ for child in info["child_fields"]:
1549
+ expanded_sort.append((child, info["direction"]))
1550
+ else:
1551
+ expanded_sort.append((info["field"], info["direction"]))
1552
+
1553
+ sort_fields = [
1554
+ field for field, _ in expanded_sort if field in df.columns
1555
+ ]
1556
+ descending = [
1557
+ direction == -1
1558
+ for field, direction in expanded_sort
1559
+ if field in df.columns
1560
+ ]
1561
+ if sort_fields:
1562
+ df = df.sort(sort_fields, descending=descending)
1563
+
1564
+ # Apply skip/limit
1565
+ if self._skip:
1566
+ df = df.slice(self._skip)
1567
+ if self._limit:
1568
+ df = df.head(self._limit)
1569
+
1570
+ logging.debug(
1571
+ f"[OK] Loaded {len(df):,} documents from cache ({reader.get_statistics()['total_size_mb']:.1f} MB)"
1572
+ )
1573
+ return df
1574
+
1575
+ # Cache miss - need to fetch and write
1576
+ if not cache_write:
1577
+ raise ValueError(
1578
+ "Cache does not exist and cache_write=False. "
1579
+ "Either enable cache_write or call to_dataframe() first."
1580
+ )
1581
+
1582
+ # Fetch data (uses same logic as to_dataframe)
1583
+ mode_str = (
1584
+ "parallel" if is_chunkable and chunking_granularity else "single-worker"
1585
+ )
1586
+ logging.debug(
1587
+ f"[Query] Cache miss - fetching from MongoDB ({mode_str} mode)..."
1588
+ )
1589
+
1590
+ result = execute_parallel_stream_to_cache(
1591
+ pymongo_collection=self._collection.pymongo_collection,
1592
+ filter_dict=self._filter,
1593
+ schema=schema,
1594
+ cache_manager=cache,
1595
+ projection=self._projection,
1596
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
1597
+ max_workers=max_workers if is_chunkable else 1,
1598
+ peak_ram_limit_mb=flush_ram_limit_mb,
1599
+ chunking_granularity=chunking_granularity if is_chunkable else None,
1600
+ mongo_uri=self._collection.mongo_uri,
1601
+ row_group_size=row_group_size,
1602
+ )
1603
+
1604
+ logging.debug(
1605
+ f"\n[Cache] Cache written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
1606
+ )
1607
+
1608
+ # Read from cache as Polars
1609
+ logging.debug("[Cache] Reading from cache to build Polars DataFrame...")
1610
+ reader = ParquetReader(cache.cache_dir)
1611
+
1612
+ # Check if we need DuckDB sorting (Any types or List types)
1613
+ need_duckdb_sort = False
1614
+ sort_infos: List[Dict[str, Any]] = []
1615
+ if self._sort:
1616
+ sort_infos = get_sort_field_info(self._sort, schema)
1617
+
1618
+ # Expand parent fields to children and collect all fields to check
1619
+ fields_to_check = []
1620
+ for info in sort_infos:
1621
+ if info["is_parent"]:
1622
+ # Parent field - check all children
1623
+ fields_to_check.extend(info["child_fields"])
1624
+ else:
1625
+ # Direct field
1626
+ fields_to_check.append(info["field"])
1627
+
1628
+ # Check if any of the actual sort fields (after expansion) are Any/List types
1629
+ for field in fields_to_check:
1630
+ if field in schema.fields:
1631
+ field_type = schema.fields[field]
1632
+ if isinstance(field_type, (AnyType, ListType)):
1633
+ need_duckdb_sort = True
1634
+ break
1635
+
1636
+ if self._sort and need_duckdb_sort:
1637
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
1638
+ logging.debug("[Sort] Using DuckDB for Types.Any()/Types.List() sorting...")
1639
+
1640
+ warnings.warn(
1641
+ "Sorting by Types.Any() field in to_polars returns raw struct columns "
1642
+ "(e.g., 'value.float_value', 'value.int64_value'). "
1643
+ "Use to_dataframe() for decoded Any() values.",
1644
+ UserWarning,
1645
+ )
1646
+
1647
+ # Use get_globally_sorted_dataframe() - more efficient than batching
1648
+ combined_df = reader.get_globally_sorted_dataframe(
1649
+ sort_spec=self._sort,
1650
+ schema=schema,
1651
+ time_field=time_field,
1652
+ start_date=parsed_start,
1653
+ end_date=parsed_end,
1654
+ coerce=coerce,
1655
+ )
1656
+
1657
+ if not combined_df.empty:
1658
+ for col in combined_df.columns:
1659
+ if combined_df[col].dtype == object:
1660
+ first_val = (
1661
+ combined_df[col].dropna().iloc[0]
1662
+ if not combined_df[col].dropna().empty
1663
+ else None
1664
+ )
1665
+ if (
1666
+ first_val is not None
1667
+ and type(first_val).__name__ == "ObjectId"
1668
+ ):
1669
+ combined_df[col] = combined_df[col].astype(str)
1670
+ df = pl.from_pandas(combined_df)
1671
+ else:
1672
+ df = pl.DataFrame()
1673
+ else:
1674
+ df = cast(
1675
+ pl.DataFrame,
1676
+ reader.to_dataframe(
1677
+ engine="polars",
1678
+ schema=schema,
1679
+ time_field=time_field,
1680
+ start_date=parsed_start,
1681
+ end_date=parsed_end,
1682
+ coerce=coerce,
1683
+ any_type_strategy=any_type_strategy,
1684
+ ),
1685
+ )
1686
+
1687
+ # Native Polars sort - expand parent fields to children
1688
+ if self._sort:
1689
+ expanded_sort = []
1690
+ for info in sort_infos:
1691
+ if info["is_parent"]:
1692
+ for child in info["child_fields"]:
1693
+ expanded_sort.append((child, info["direction"]))
1694
+ else:
1695
+ expanded_sort.append((info["field"], info["direction"]))
1696
+
1697
+ sort_fields = [
1698
+ field for field, _ in expanded_sort if field in df.columns
1699
+ ]
1700
+ descending = [
1701
+ direction == -1
1702
+ for field, direction in expanded_sort
1703
+ if field in df.columns
1704
+ ]
1705
+ if sort_fields:
1706
+ # Polars uses `reverse` (not `descending`) in older versions.
1707
+ df = df.sort(sort_fields, descending=descending)
1708
+
1709
+ # Apply skip/limit
1710
+ if self._skip:
1711
+ df = df.slice(self._skip)
1712
+ if self._limit:
1713
+ df = df.head(self._limit)
1714
+
1715
+ return df
1716
+
1717
+ def _to_dataframe_regular(self) -> pd.DataFrame:
1718
+ """
1719
+ Convert to DataFrame without acceleration.
1720
+
1721
+ Uses regular PyMongo iteration. Fallback for:
1722
+ - Non-chunkable queries
1723
+ - No schema provided
1724
+ - Acceleration disabled
1725
+
1726
+ Returns:
1727
+ Pandas DataFrame
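+
+ Example (illustrative document shape, not real data):
+ {"timestamp": ..., "metadata": {"sensor_id": "abc"}, "value": 1.5}
+ becomes one row with flattened columns
+ ["timestamp", "metadata.sensor_id", "value"] via pd.json_normalize.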
1728
+ """
1729
+ # Collect all documents - __iter__ will set _started
1730
+ # Convert to DataFrame
1731
+ return pd.json_normalize(list(self))
1732
+
1733
+ def _to_polars_regular(self) -> "pl.DataFrame":
1734
+ """
1735
+ Convert to Polars DataFrame without acceleration.
1736
+
1737
+ Uses regular PyMongo iteration with native Polars conversion.
1738
+ Fallback for:
1739
+ - Non-chunkable queries
1740
+ - No schema provided
1741
+ - Acceleration disabled
1742
+
1743
+ Returns:
1744
+ Polars DataFrame
1745
+
1746
+ Note:
1747
+ Uses pl.from_dicts() which handles nested documents by creating
1748
+ struct columns. For flattened column names like pandas json_normalize,
1749
+ you would need to unnest() afterwards.
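+
+ Illustrative sketch (document shape assumed):
+ >>> df = pl.from_dicts([{"value": 1.5, "metadata": {"sensor_id": "abc"}}])
+ >>> df.columns                      # "metadata" is a struct column
+ ['value', 'metadata']
+ >>> df.unnest("metadata").columns   # struct fields become top-level columns
+ ['value', 'sensor_id']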
1750
+ """
1751
+ # Collect all documents - __iter__ will set _started
1752
+ docs = list(self)
1753
+
1754
+ if not docs:
1755
+ return pl.DataFrame()
1756
+
1757
+ return pl.from_dicts(docs)
1758
+
1759
+ def _to_dataframe_accelerated(
1760
+ self,
1761
+ cache_read: bool,
1762
+ cache_write: bool,
1763
+ start_date: Optional[datetime] = None,
1764
+ end_date: Optional[datetime] = None,
1765
+ coerce: Literal["raise", "error"] = "raise",
1766
+ max_workers: int = 4,
1767
+ chunking_granularity: Optional[timedelta] = None,
1768
+ is_chunkable: bool = True,
1769
+ flush_ram_limit_mb: int = 512,
1770
+ row_group_size: Optional[int] = None,
1771
+ ) -> pd.DataFrame:
1772
+ """
1773
+ Convert to DataFrame using parallel execution with Parquet caching.
1774
+
1775
+ ┌─────────────────────────────────────────────────────────────────────┐
1776
+ │ DATA FLOW - ACCELERATED EXECUTION: │
1777
+ │ │
1778
+ │ This is where the XLR8 magic happens. The flow is: │
1779
+ │ │
1780
+ │ 1. CACHE CHECK │
1781
+ │ Input: self._filter hashed to "abc123def" │
1782
+ │ Check: Does .cache/abc123def/*.parquet exist? │
1783
+ │ If yes -> Read directly from Parquet (instant!) │
1784
+ │ │
1785
+ │ 2. CACHE MISS -> PARALLEL FETCH (if chunkable) │
1786
+ │ Calls: execute_parallel_stream_to_cache() │
1787
+ │ Which does: │
1788
+ │ a) Build brackets from query (analysis/brackets.py) │
1789
+ │ Query -> [Bracket(static_filter, time_range), ...] │
1790
+ │ b) Plan execution (execution/planner.py) │
1791
+ │ Time range + RAM -> workers=N, batch_size=M │
1792
+ │ c) Chunk time ranges (analysis/chunker.py) │
1793
+ │ 6 months -> X chunks based on granularity │
1794
+ │ d) Parallel fetch (Rust backend fetch_chunks_bson) │
1795
+ │ N async workers pull chunks from queue │
1796
+ │ e) Stream to Parquet (Rust backend) │
1797
+ │ Each worker writes part files: part_0000.parquet, etc. │
1798
+ │ │
1799
+ │ 2b. CACHE MISS -> SINGLE-WORKER FETCH (if not chunkable) │
1800
+ │ - Single worker fetches all data │
1801
+ │ - No async, no chunking │
1802
+ │ - Still writes to Parquet for caching │
1803
+ │ │
1804
+ │ 3. READ FROM CACHE │
1805
+ │ After fetch, read the Parquet files we just wrote │
1806
+ │ Optionally filter by start_date/end_date │
1807
+ │ Returns: pandas DataFrame with original values │
1808
+ │ │
1809
+ │ EXAMPLE TIMING (500K docs): │
1810
+ │ - Cache hit: 0.5s (read Parquet) │
1811
+ │ - Cache miss: 10-15s (parallel fetch + write + read) │
1812
+ │ - Without XLR8: 30-40s (sequential cursor iteration) │
1813
+ └─────────────────────────────────────────────────────────────────────┘
1814
+
1815
+ Args:
1816
+ cache_read: Read from cache if available
1817
+ cache_write: Write to cache after fetching
1818
+ start_date: Filter cached data from this date (inclusive, tz-aware)
1819
+ end_date: Filter cached data until this date (exclusive, tz-aware)
1820
+ coerce: Error handling mode ("raise" or "error")
1821
+ max_workers: Maximum parallel workers (passed from to_dataframe)
1822
+ chunking_granularity: Time granularity for chunking (passed from to_dataframe)
1823
+ is_chunkable: Whether query is chunkable (determines parallel vs single-worker)
+ flush_ram_limit_mb: Approximate peak RAM budget in MB for streaming fetch / DuckDB sorting (passed from to_dataframe)
+ row_group_size: Parquet row group size for the written cache shards (passed from to_dataframe)
1824
+
1825
+ Returns:
1826
+ Pandas DataFrame with accelerated query results
1827
+ """
1828
+ schema = self._collection.schema
1829
+ time_field = schema.time_field
1830
+
1831
+ # Mark as started
1832
+ if not self._started:
1833
+ self._started = True
1834
+
1835
+ # ─────────────────────────────────────────────────────────────────────
1836
+ # STEP 1: Create cache manager (hashes query to unique directory)
1837
+ # Example: filter_dict hashes to "abc123def" -> .cache/abc123def/
1838
+ # ─────────────────────────────────────────────────────────────────────
1839
+ cache = CacheManager(
1840
+ filter_dict=self._filter,
1841
+ projection=self._projection,
1842
+ sort=self._sort,
1843
+ )
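+ # Conceptually (the real key derivation is internal to CacheManager; this is
+ # only an assumed sketch): something like
+ #   key = hash(canonical_json(self._filter, self._projection, self._sort))
+ #   cache_dir = .cache/<key>/        e.g. .cache/abc123def/
+ # so identical find() arguments always resolve to the same cache directory.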
1844
+
1845
+ # ─────────────────────────────────────────────────────────────────────
1846
+ # STEP 2: Check cache - if hit, read directly from Parquet
1847
+ # Example: .cache/abc123def/ts_1704067200_1704070800_part_0000.parquet
1848
+ # ─────────────────────────────────────────────────────────────────────
1849
+ if cache_read and cache.exists():
1850
+ logging.debug(f"[Cache] Reading from cache: {cache.cache_dir}")
1851
+ reader = ParquetReader(cache.cache_dir)
1852
+
1853
+ # Check if we need DuckDB sorting (Any types or List types)
1854
+ need_duckdb_sort = False
1855
+ sort_infos: List[Dict[str, Any]] = []
1856
+ if self._sort:
1857
+ sort_infos = get_sort_field_info(self._sort, schema)
1858
+
1859
+ # Expand parent fields to children and collect all fields to check
1860
+ fields_to_check = []
1861
+ for info in sort_infos:
1862
+ if info["is_parent"]:
1863
+ # Parent field - check all children
1864
+ fields_to_check.extend(info["child_fields"])
1865
+ else:
1866
+ # Direct field
1867
+ fields_to_check.append(info["field"])
1868
+
1869
+ # Check if any of the actual sort fields (after expansion) are Any/List types
1870
+ for field in fields_to_check:
1871
+ if field in schema.fields:
1872
+ field_type = schema.fields[field]
1873
+ if isinstance(field_type, (AnyType, ListType)):
1874
+ need_duckdb_sort = True
1875
+ break
1876
+
1877
+ if self._sort and need_duckdb_sort:
1878
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
1879
+ logging.debug(
1880
+ "[Sort] Using DuckDB for Types.Any()/Types.List() sorting..."
1881
+ )
1882
+ df = cast(
1883
+ pd.DataFrame,
1884
+ reader.get_globally_sorted_dataframe(
1885
+ sort_spec=self._sort,
1886
+ schema=schema,
1887
+ time_field=time_field,
1888
+ start_date=start_date,
1889
+ end_date=end_date,
1890
+ coerce=coerce,
1891
+ memory_limit_mb=flush_ram_limit_mb,
1892
+ threads=max_workers,
1893
+ ),
1894
+ )
1895
+ else:
1896
+ # Normal read + native pandas sort
1897
+ df = cast(
1898
+ pd.DataFrame,
1899
+ reader.to_dataframe(
1900
+ engine="pandas",
1901
+ schema=schema,
1902
+ time_field=time_field,
1903
+ start_date=start_date,
1904
+ end_date=end_date,
1905
+ coerce=coerce,
1906
+ ),
1907
+ )
1908
+
1909
+ # Native pandas sort - expand parent fields to children
1910
+ if self._sort:
1911
+ expanded_sort = []
1912
+ for info in sort_infos:
1913
+ if info["is_parent"]:
1914
+ for child in info["child_fields"]:
1915
+ expanded_sort.append((child, info["direction"]))
1916
+ else:
1917
+ expanded_sort.append((info["field"], info["direction"]))
1918
+
1919
+ sort_fields = [
1920
+ field for field, _ in expanded_sort if field in df.columns
1921
+ ]
1922
+ ascending = [
1923
+ direction == 1
1924
+ for field, direction in expanded_sort
1925
+ if field in df.columns
1926
+ ]
1927
+ if sort_fields:
1928
+ df = df.sort_values(
1929
+ by=sort_fields, ascending=ascending, na_position="last"
1930
+ )
1931
+ logger.debug("Sorted DataFrame by %s", sort_fields)
1932
+
1933
+ # Apply skip/limit if set
1934
+ if self._skip:
1935
+ df = df.iloc[self._skip :]
1936
+ if self._limit:
1937
+ df = df.iloc[: self._limit]
1938
+
1939
+ filter_info = ""
1940
+ if start_date or end_date:
1941
+ filter_info = f" (filtered: {start_date} to {end_date})"
1942
+ logging.debug(
1943
+ f"[OK] Loaded {len(df):,} documents from cache{filter_info} ({reader.get_statistics()['total_size_mb']:.1f} MB)"
1944
+ )
1945
+ return cast(pd.DataFrame, df)
1946
+
1947
+ # ─────────────────────────────────────────────────────────────────────
1948
+ # STEP 3: Cache miss - execute fetch and stream to Parquet
1949
+ # This is where the heavy lifting happens
1950
+ # ─────────────────────────────────────────────────────────────────────
1951
+ mode_str = "parallel" if is_chunkable else "single-worker"
1952
+ logging.debug(
1953
+ f"[Query] Cache miss - fetching from MongoDB ({mode_str} mode)..."
1954
+ )
1955
+
1956
+ if cache_write:
1957
+ # CRITICAL: If cache_read=False but cache_write=True and cache exists,
1958
+ # we need to clear the old cache first to avoid duplicate data
1959
+ if not cache_read and cache.exists():
1960
+ logging.debug(
1961
+ "Clearing existing cache (cache_read=False, starting fresh)..."
1962
+ )
1963
+ cache.clean()
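+ # (Otherwise a rerun with cache_read=False would write a second set of
+ # part_*.parquet files next to the old ones and rows would be duplicated.)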
1964
+ # chunking_granularity is passed from to_dataframe()
1965
+ # If None, execute_parallel_stream_to_cache will use single-worker mode
1966
+
1967
+ # Streaming path: fetch -> encode -> write Parquet (memory efficient)
1968
+ result = execute_parallel_stream_to_cache(
1969
+ pymongo_collection=self._collection.pymongo_collection,
1970
+ filter_dict=self._filter,
1971
+ schema=schema,
1972
+ cache_manager=cache,
1973
+ projection=self._projection,
1974
+ approx_document_size_bytes=self._collection.approx_document_size_bytes,
1975
+ max_workers=max_workers, # From to_dataframe() parameter
1976
+ peak_ram_limit_mb=flush_ram_limit_mb,
1977
+ chunking_granularity=chunking_granularity, # None = single-worker mode
1978
+ mongo_uri=self._collection.mongo_uri,
1979
+ row_group_size=row_group_size,
1980
+ )
1981
+
1982
+ logging.debug("\n[Cache] Cache written:")
1983
+ logging.debug(f" - Total docs: {result['total_docs']:,}")
1984
+ logging.debug(f" - Total files: {result['total_files']}")
1985
+ logging.debug(f" - Workers: {result['workers']}")
1986
+ logging.debug(f" - Duration: {result['duration_s']:.2f}s")
1987
+ logging.debug(f" - Cache dir: {cache.cache_dir}")
1988
+
1989
+ # Now read from cache to build DataFrame (with optional date filter)
1990
+ logging.debug("\n[Cache] Reading from cache to build DataFrame...")
1991
+ reader = ParquetReader(cache.cache_dir)
1992
+
1993
+ # Check if we need DuckDB sorting (Any types or List types)
1994
+ need_duckdb_sort = False
1995
+ sort_infos: List[Dict[str, Any]] = []
1996
+ if self._sort:
1997
+ sort_infos = get_sort_field_info(self._sort, schema)
1998
+
1999
+ # Expand parent fields to children and collect all fields to check
2000
+ fields_to_check = []
2001
+ for info in sort_infos:
2002
+ if info["is_parent"]:
2003
+ # Parent field - check all children
2004
+ fields_to_check.extend(info["child_fields"])
2005
+ else:
2006
+ # Direct field
2007
+ fields_to_check.append(info["field"])
2008
+
2009
+ # Check if any of the actual sort fields (after expansion) are Any/List types
2010
+ for field in fields_to_check:
2011
+ if field in schema.fields:
2012
+ field_type = schema.fields[field]
2013
+ if isinstance(field_type, (AnyType, ListType)):
2014
+ need_duckdb_sort = True
2015
+ break
2016
+
2017
+ if self._sort and need_duckdb_sort:
2018
+ # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
2019
+ logging.debug(
2020
+ "[Sort] Using DuckDB for Types.Any()/Types.List() sorting..."
2021
+ )
2022
+ df = cast(
2023
+ pd.DataFrame,
2024
+ reader.get_globally_sorted_dataframe(
2025
+ sort_spec=self._sort,
2026
+ schema=schema,
2027
+ time_field=time_field,
2028
+ start_date=start_date,
2029
+ end_date=end_date,
2030
+ coerce=coerce,
2031
+ memory_limit_mb=flush_ram_limit_mb,
2032
+ threads=max_workers,
2033
+ ),
2034
+ )
2035
+ else:
2036
+ # Normal read + native pandas sort
2037
+ df = cast(
2038
+ pd.DataFrame,
2039
+ reader.to_dataframe(
2040
+ engine="pandas",
2041
+ schema=schema,
2042
+ time_field=time_field,
2043
+ start_date=start_date,
2044
+ end_date=end_date,
2045
+ coerce=coerce,
2046
+ ),
2047
+ )
2048
+
2049
+ # Native pandas sort - expand parent fields to children
2050
+ if self._sort:
2051
+ expanded_sort = []
2052
+ for info in sort_infos:
2053
+ if info["is_parent"]:
2054
+ for child in info["child_fields"]:
2055
+ expanded_sort.append((child, info["direction"]))
2056
+ else:
2057
+ expanded_sort.append((info["field"], info["direction"]))
2058
+
2059
+ sort_fields = [
2060
+ field for field, _ in expanded_sort if field in df.columns
2061
+ ]
2062
+ ascending = [
2063
+ direction == 1
2064
+ for field, direction in expanded_sort
2065
+ if field in df.columns
2066
+ ]
2067
+ if sort_fields:
2068
+ df = df.sort_values(
2069
+ by=sort_fields, ascending=ascending, na_position="last"
2070
+ )
2071
+ logger.debug("Sorted DataFrame by %s", sort_fields)
2072
+
2073
+ else:
2074
+ # cache_write=False is not supported on the accelerated path
2075
+ # Always write to cache for consistency and performance
2076
+ raise ValueError(
2077
+ "cache_write=False is not supported. "
2078
+ "XLR8 always writes to Parquet cache for memory efficiency. "
2079
+ "Set cache_read=False if you don't want to read from existing cache."
2080
+ )
2081
+
2082
+ # Apply skip/limit if set
2083
+ if self._skip:
2084
+ df = df.iloc[self._skip :]
2085
+ if self._limit:
2086
+ df = df.iloc[: self._limit]
2087
+
2088
+ return cast(pd.DataFrame, df)
2089
+
2090
+ def explain_acceleration(self) -> Dict[str, Any]:
2091
+ """
2092
+ Get query execution plan.
2093
+
2094
+ Returns explanation of how query will be executed:
2095
+ - Whether acceleration is possible
2096
+ - Time bounds extracted
2097
+ - Estimated chunk count
2098
+ - Execution mode (reject / single / parallel)
2099
+
2100
+ Returns:
2101
+ Dict with execution plan details
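+
+ Example (illustrative values, not real output):
+ {
+ "accelerated": True, "is_chunkable": True, "mode": "parallel",
+ "time_bounds": {"start": "2024-01-01T00:00:00+00:00",
+ "end": "2024-07-01T00:00:00+00:00"},
+ "estimated_chunks": 182, ...
+ }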
2102
+ """
2103
+ schema = self._collection.schema
2104
+
2105
+ result: Dict[str, Any] = {
2106
+ "filter": self._filter,
2107
+ "projection": self._projection,
2108
+ "skip": self._skip,
2109
+ "limit": self._limit,
2110
+ "sort": self._sort,
2111
+ "accelerated": False,
2112
+ }
2113
+
2114
+ if schema is None:
2115
+ result["reason"] = "No schema provided"
2116
+ return result
2117
+
2118
+ # build_brackets_for_find internally validates via is_chunkable_query
2119
+ is_chunkable, reason, brackets, bounds = build_brackets_for_find(
2120
+ self._filter,
2121
+ schema.time_field,
2122
+ self._sort, # Pass sort spec for $natural detection
2123
+ )
2124
+
2125
+ result["is_chunkable"] = is_chunkable
2126
+ result["reason"] = reason
2127
+
2128
+ # Distinguish REJECT vs SINGLE modes
2129
+ if not is_chunkable:
2130
+ # REJECT mode
2131
+ result["mode"] = "reject"
2132
+ elif is_chunkable and not brackets:
2133
+ # SINGLE mode - valid but not parallelizable
2134
+ result["mode"] = "single"
2135
+ else:
2136
+ # PARALLEL mode
2137
+ result["mode"] = "parallel"
2138
+
2139
+ if is_chunkable and bounds and bounds[0] and bounds[1]:
2140
+ start_bound = bounds[0]
2141
+ end_bound = bounds[1]
2142
+
2143
+ result["time_bounds"] = {
2144
+ "start": start_bound.isoformat(),
2145
+ "end": end_bound.isoformat(),
2146
+ }
2147
+
2148
+ chunks = chunk_time_range(
2149
+ start_bound, end_bound, chunk_size=timedelta(days=1)
2150
+ )
2151
+ result["estimated_chunks"] = len(chunks)
2152
+
2153
+ result["accelerated"] = True
2154
+
2155
+ return result