xlr8-0.1.7b3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlr8/__init__.py +113 -0
- xlr8/_xlr8_rust.cpython-311-aarch64-linux-gnu.so +0 -0
- xlr8/_xlr8_rust.pyi +71 -0
- xlr8/analysis/__init__.py +58 -0
- xlr8/analysis/brackets.py +1201 -0
- xlr8/analysis/chunker.py +118 -0
- xlr8/analysis/inspector.py +1889 -0
- xlr8/collection/__init__.py +6 -0
- xlr8/collection/cursor.py +2161 -0
- xlr8/collection/cursor.pyi +179 -0
- xlr8/collection/wrapper.py +400 -0
- xlr8/collection/wrapper.pyi +420 -0
- xlr8/constants.py +24 -0
- xlr8/execution/__init__.py +43 -0
- xlr8/execution/callback.py +792 -0
- xlr8/execution/executor.py +500 -0
- xlr8/execution/planner.py +377 -0
- xlr8/py.typed +1 -0
- xlr8/rust_backend.py +40 -0
- xlr8/rust_backend.pyi +71 -0
- xlr8/schema/__init__.py +42 -0
- xlr8/schema/encoder.py +235 -0
- xlr8/schema/schema.py +265 -0
- xlr8/schema/types.py +239 -0
- xlr8/storage/__init__.py +17 -0
- xlr8/storage/cache.py +228 -0
- xlr8/storage/reader.py +1369 -0
- xlr8-0.1.7b3.dist-info/METADATA +176 -0
- xlr8-0.1.7b3.dist-info/RECORD +31 -0
- xlr8-0.1.7b3.dist-info/WHEEL +5 -0
- xlr8-0.1.7b3.dist-info/licenses/LICENSE +201 -0
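
For orientation before the cursor.py hunk below, here is a short usage sketch assembled from the docstrings in that file (xlr8.wrap() is named in its error messages; find(), to_dataframe(), and to_dataframe_batches() appear in its examples). It is illustrative only and not part of the package: the connection string, database, and collection names are placeholders, and `my_schema` stands for a schema object built with the package's schema API, which is not shown in this section.

from datetime import datetime, timedelta, timezone

import pymongo
import xlr8  # the wheel described by this diff

# Assumed placeholders for illustration: a reachable MongoDB instance and a
# schema object (`my_schema`) built with xlr8's schema API (see xlr8/schema/).
client = pymongo.MongoClient("mongodb://localhost:27017")
collection = client["mydb"]["readings"]
xlr8_collection = xlr8.wrap(collection, schema=my_schema)  # `my_schema` assumed

# A bounded time range is what makes the query "chunkable" (parallelizable),
# per the decision flow documented in cursor.py below.
query = {
    "timestamp": {
        "$gte": datetime(2024, 1, 1, tzinfo=timezone.utc),
        "$lt": datetime(2024, 7, 1, tzinfo=timezone.utc),
    },
}
cursor = xlr8_collection.find(query)

# Parallel fetch into the Parquet cache, then a pandas DataFrame.
df = cursor.to_dataframe(
    chunking_granularity=timedelta(days=1),  # required for parallel execution
    max_workers=4,
)

# Memory-bounded alternative documented in to_dataframe_batches().
for batch_df in xlr8_collection.find(query).to_dataframe_batches(batch_size=50_000):
    print(len(batch_df))

Exact signatures and behavior should be checked against the module source in the hunk that follows.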
xlr8/collection/cursor.py
@@ -0,0 +1,2161 @@
"""
XLR8 cursor with PyMongo compatibility.

================================================================================
DATA FLOW - CURSOR (WHERE ACCELERATION HAPPENS)
================================================================================

This module is where the magic happens. When user calls cursor.to_dataframe(),
we decide whether to:
  A) Use regular PyMongo iteration (slow)
  B) Use accelerated parallel fetch + Parquet caching (fast)

DECISION FLOW:
────────────────────────────────────────────────────────────────────────────────

    cursor.to_dataframe() called
            │
            ▼
    ┌─────────────────────────────┐
    │ Is schema provided?         │─── No ──▶ REGULAR PATH (PyMongo iteration)
    └─────────────────────────────┘
            │ Yes
            ▼
    ┌─────────────────────────────┐
    │ Is query chunkable?         │─── No ──▶ REGULAR PATH
    │ (has time range, no         │           (e.g., has $where or nested $or)
    │  forbidden operators)       │
    └─────────────────────────────┘
            │ Yes
            ▼
    ┌─────────────────────────────┐
    │ Is data in cache?           │─── Yes ─▶ READ FROM CACHE
    │ (.cache/{query_hash}/*.parquet)         (instant, ~100ms for 1M rows)
    └─────────────────────────────┘
            │ No
            ▼
    ┌─────────────────────────────┐
    │ ACCELERATED PATH:           │
    │ 1. Build brackets           │ ← analysis/brackets.py
    │ 2. Plan execution           │ ← execution/planner.py
    │ 3. Chunk time ranges        │ ← analysis/chunker.py
    │ 4. Parallel async fetch     │ ← Rust backend (fetch_chunks_bson)
    │ 5. Stream to Parquet        │ ← Rust backend writes shards
    │ 6. Read back DataFrame      │ ← storage/reader.py
    └─────────────────────────────┘

EXAMPLE DATA TRANSFORMATIONS:
────────────────────────────────────────────────────────────────────────────────

1. INPUT QUERY (from user):
    {
        "$or": [
            {"metadata.sensor_id": ObjectId("64a...")},
            {"metadata.sensor_id": ObjectId("64b...")},
        ],
        "timestamp": {"$gte": datetime(2024, 1, 1), "$lt": datetime(2024, 7, 1)}
    }

2. AFTER BRACKET ANALYSIS (brackets.py):
    [
        Bracket(static={"metadata.sensor_id": "64a..."}, time=Jan-Jul),
        Bracket(static={"metadata.sensor_id": "64b..."}, time=Jan-Jul),
    ]

3. AFTER CHUNKING (for each bracket):
    Bracket 1 -> 13 chunks (14 days each for 6 months)
    Bracket 2 -> 13 chunks
    Total: 26 work items in queue

4. PARALLEL FETCH (10 workers):
    Worker 0: Chunk 1 -> 45,000 docs, write to part_0000.parquet
    Worker 1: Chunk 2 -> 52,000 docs, write to part_0001.parquet
    ...
    Worker 9: Chunk 10 -> 38,000 docs, write to part_0009.parquet
    (Rust async workers pull chunks as they finish)

5. OUTPUT (DataFrame):
    pandas.DataFrame with columns: [timestamp, metadata.device_id, value, ...]
    500,000 rows loaded from Parquet in ~0.5s

================================================================================
"""

from __future__ import annotations

from typing import (
    Any,
    Callable,
    Dict,
    List,
    Optional,
    Union,
    Iterator,
    Literal,
    Generator,
    cast,
)
from datetime import datetime, date, timezone, timedelta
import logging
import warnings
import pandas as pd
import time
import pyarrow as pa
import polars as pl

logger = logging.getLogger(__name__)

# Import after logger to avoid circular imports
from xlr8.constants import DEFAULT_BATCH_SIZE
from xlr8.execution.callback import execute_partitioned_callback
from xlr8.analysis import (
    build_brackets_for_find,
    chunk_time_range,
    get_sort_field_info,
    validate_sort_field,
)
from xlr8.schema.types import Any as AnyType, List as ListType
from xlr8.storage import CacheManager, ParquetReader
from xlr8.execution import execute_parallel_stream_to_cache


|
+
def parse_datetime_tz_aware(
|
|
123
|
+
value: Union[datetime, date, str, None],
|
|
124
|
+
param_name: str = "date",
|
|
125
|
+
) -> Optional[datetime]:
|
|
126
|
+
"""
|
|
127
|
+
Parse a date/datetime value to a timezone-aware datetime.
|
|
128
|
+
|
|
129
|
+
Accepts:
|
|
130
|
+
- datetime (must be tz-aware or will assume UTC)
|
|
131
|
+
- date (converted to midnight UTC)
|
|
132
|
+
- ISO format string with timezone (e.g., "2024-01-15T10:30:00Z", "2024-01-15T10:30:00+00:00")
|
|
133
|
+
|
|
134
|
+
Args:
|
|
135
|
+
value: The date value to parse
|
|
136
|
+
param_name: Name of parameter for error messages
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
Timezone-aware datetime or None if value is None
|
|
140
|
+
|
|
141
|
+
Raises:
|
|
142
|
+
ValueError: If string is not a valid ISO format or missing timezone
|
|
143
|
+
"""
|
|
144
|
+
if value is None:
|
|
145
|
+
return None
|
|
146
|
+
|
|
147
|
+
if isinstance(value, datetime):
|
|
148
|
+
if value.tzinfo is None:
|
|
149
|
+
# Assume UTC for naive datetimes
|
|
150
|
+
return value.replace(tzinfo=timezone.utc)
|
|
151
|
+
return value
|
|
152
|
+
|
|
153
|
+
if isinstance(value, date):
|
|
154
|
+
# Convert date to midnight UTC
|
|
155
|
+
return datetime(value.year, value.month, value.day, tzinfo=timezone.utc)
|
|
156
|
+
|
|
157
|
+
if isinstance(value, str):
|
|
158
|
+
# Try parsing ISO format
|
|
159
|
+
try:
|
|
160
|
+
# Python 3.11+ has datetime.fromisoformat with better Z support
|
|
161
|
+
# For compatibility, handle Z suffix manually
|
|
162
|
+
if value.endswith("Z"):
|
|
163
|
+
value = value[:-1] + "+00:00"
|
|
164
|
+
|
|
165
|
+
dt = datetime.fromisoformat(value)
|
|
166
|
+
|
|
167
|
+
if dt.tzinfo is None:
|
|
168
|
+
raise ValueError(
|
|
169
|
+
f"{param_name}: Timezone-aware datetime required. "
|
|
170
|
+
f"Got '{value}' without timezone. "
|
|
171
|
+
f"Use ISO format with timezone like '2024-01-15T10:30:00Z' or '2024-01-15T10:30:00+00:00'"
|
|
172
|
+
)
|
|
173
|
+
return dt
|
|
174
|
+
except ValueError as e:
|
|
175
|
+
if "Timezone-aware" in str(e):
|
|
176
|
+
raise
|
|
177
|
+
raise ValueError(
|
|
178
|
+
f"{param_name}: Invalid datetime string '{value}'. "
|
|
179
|
+
f"Use ISO format with timezone like '2024-01-15T10:30:00Z' or '2024-01-15T10:30:00+00:00'"
|
|
180
|
+
) from e
|
|
181
|
+
|
|
182
|
+
raise TypeError(
|
|
183
|
+
f"{param_name}: Expected datetime, date, or ISO string, got {type(value).__name__}"
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
|
|
class XLR8Cursor:
    """
    PyMongo-compatible cursor with acceleration support.

    Acts as drop-in replacement for pymongo.cursor.Cursor but can
    accelerate queries through parallel execution and Parquet caching.

    Key differences from PyMongo:
    - to_dataframe() / to_polars() for efficient DataFrame conversion
    - Transparent acceleration when query is chunkable
    - Maintains full PyMongo API compatibility for iteration

    Example:
        >>> cursor = collection.find({"timestamp": {"$gte": start, "$lt": end}})
        >>> df = cursor.to_dataframe()  # Accelerated execution
        >>>
        >>> # Or use like regular PyMongo cursor:
        >>> for doc in cursor:
        ...     logging.debug(doc)
    """

    def __init__(
        self,
        collection: Any,  # XLR8Collection
        query_filter: Dict[str, Any],
        projection: Optional[Dict[str, Any]] = None,
        skip: int = 0,
        limit: int = 0,
        sort: Optional[List[tuple]] = None,
        batch_size: int = 1000,
        **kwargs: Any,
    ):
        """
        Initialize cursor.

        Args:
            collection: Parent XLR8Collection
            query_filter: Query filter dict
            projection: Field projection dict
            skip: Number of documents to skip
            limit: Maximum documents to return (0 = unlimited)
            sort: List of (field, direction) tuples
            batch_size: Batch size for iteration
            **kwargs: Additional PyMongo cursor options (no_cursor_timeout,
                cursor_type, collation, hint, max_time_ms, etc.)
                These are passed through to PyMongo when iterating.
        """
        self._collection = collection
        self._filter = query_filter
        self._projection = projection
        self._skip = skip
        self._limit = limit
        self._sort = sort
        self._batch_size = batch_size
        self._cursor_kwargs = kwargs  # Store all additional PyMongo options

        # Iteration state
        self._started = False
        self._pymongo_cursor: Optional[Any] = None
        self._exhausted = False

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Iterate over documents."""
        if not self._started:
            self._started = True
            # Create actual PyMongo cursor for iteration
            self._ensure_pymongo_cursor()

        if self._pymongo_cursor is None:
            return iter([])

        return iter(self._pymongo_cursor)

    def __next__(self) -> Dict[str, Any]:
        """Get next document."""
        if not self._started:
            self.__iter__()

        if self._pymongo_cursor is None:
            raise StopIteration

        return next(self._pymongo_cursor)

    def _ensure_pymongo_cursor(self) -> None:
        """Lazily create PyMongo cursor only when needed for iteration/delegation."""
        if self._pymongo_cursor is None:
            self._pymongo_cursor = self._collection.pymongo_collection.find(
                filter=self._filter,
                projection=self._projection,
                skip=self._skip,
                limit=self._limit,
                sort=self._sort,
                batch_size=self._batch_size,
                **self._cursor_kwargs,  # Pass through all PyMongo cursor options
            )

    def raw_cursor(self):
        """
        Get direct access to underlying PyMongo cursor.

        This is an escape hatch for power users who need access to PyMongo cursor
        methods not explicitly implemented in XLR8Cursor.

        Returns:
            pymongo.cursor.Cursor: The underlying PyMongo cursor

        Example:
            >>> cursor = collection.find(...)
            >>> cursor.raw_cursor().comment("my query").max_time_ms(5000)
        """
        self._ensure_pymongo_cursor()
        return self._pymongo_cursor

    def __getattr__(self, name: str) -> Any:
        """
        Delegate unknown attributes to underlying PyMongo cursor.

        This provides transparent access to all PyMongo cursor methods while
        preserving XLR8's accelerated methods.

        Note: PyMongo cursor is created lazily only when delegation is needed.
        For explicit access, use .raw_cursor()
        """
        # Avoid infinite recursion
        if name.startswith("_"):
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}'"
            )

        # Create PyMongo cursor if needed
        self._ensure_pymongo_cursor()

        # Get attribute from PyMongo cursor
        attr = getattr(self._pymongo_cursor, name)

        # If it's a method that returns cursor, wrap the result
        if callable(attr):

            def wrapper(*args, **kwargs):
                result = attr(*args, **kwargs)
                # If PyMongo method returns cursor, it returns self (the PyMongo cursor)
                # We want to return our wrapper instead
                if result is self._pymongo_cursor:
                    return self
                return result

            return wrapper

        return attr

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.close()

    # PyMongo compatibility methods

    def skip(self, count: int) -> "XLR8Cursor":
        """
        Skip documents.

        Args:
            count: Number of documents to skip

        Returns:
            Self for chaining
        """
        if self._started:
            raise RuntimeError("Cannot modify cursor after iteration started")

        self._skip = count
        return self

    def limit(self, count: int) -> "XLR8Cursor":
        """
        Limit result count.

        Args:
            count: Maximum documents to return

        Returns:
            Self for chaining
        """
        if self._started:
            raise RuntimeError("Cannot modify cursor after iteration started")

        self._limit = count
        return self

    def sort(
        self, key_or_list: Union[str, List[tuple]], direction: int = 1
    ) -> "XLR8Cursor":
        """
        Sort results.

        Automatically adds _id as final tie-breaker for deterministic ordering
        (matching MongoDB's behavior).

        Args:
            key_or_list: Field name or list of (field, direction) tuples
            direction: Sort direction (1=ascending, -1=descending)

        Returns:
            Self for chaining
        """
        if self._started:
            raise RuntimeError("Cannot modify cursor after iteration started")

        if isinstance(key_or_list, str):
            self._sort = [(key_or_list, direction)]
        else:
            self._sort = key_or_list

        return self

    def batch_size(self, size: int) -> "XLR8Cursor":
        """
        Set batch size for iteration.

        Args:
            size: Batch size

        Returns:
            Self for chaining
        """
        if self._started:
            raise RuntimeError("Cannot modify cursor after iteration started")

        self._batch_size = size
        return self

    def close(self) -> None:
        """Close cursor and free resources."""
        if self._pymongo_cursor is not None:
            self._pymongo_cursor.close()
            self._pymongo_cursor = None
        self._exhausted = True

    # count() and distinct() removed - use __getattr__ delegation to PyMongo
    # These are available via: cursor.count(), cursor.distinct()
    # __getattr__ automatically forwards them to the underlying PyMongo cursor

    # XLR8-specific acceleration methods

    def to_dataframe(
        self,
        accelerate: bool = True,
        cache_read: bool = True,
        cache_write: bool = True,
        start_date: Optional[Union[datetime, date, str]] = None,
        end_date: Optional[Union[datetime, date, str]] = None,
        coerce: Literal["raise", "error"] = "raise",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        flush_ram_limit_mb: int = 512,
    ) -> pd.DataFrame:
        """
        Convert results to Pandas DataFrame with optional acceleration.

        This is the main acceleration entry point. If the query is chunkable
        and acceleration is enabled, uses parallel execution and Parquet caching
        for up to 4x speedup on large result sets.


        DATA FLOW - ACCELERATION DECISION:

        INPUT: self._filter (the MongoDB query)
            Example: {
                "timestamp": {"$gte": datetime(2024,1,1), "$lt": datetime(...)},
                "$or": [{"metadata.sensor_id": ObjectId("64a...")}]
            }

        DECISION STEPS:
        1. Check if schema exists -> No: raise error (schema required)
        2. Check if query is chunkable -> No: single-worker, still Parquet
           (is_chunkable_query checks for time bounds, forbidden ops)
        3. If chunkable: use parallel workers based on time span

        OUTPUT: pandas.DataFrame with columns from schema
            Example columns: [timestamp, metadata.device_id, value]

        PERFORMANCE (illustrative only; actual numbers depend on data size,
        schema, cache state, etc.):
        - Regular path: ~30s for 500K docs (sequential cursor iteration)
        - Accelerated path: ~10s for 500K docs (parallel + caching)
        - Cache hit: ~0.5s for 500K docs (read from Parquet)

        Args:
            accelerate: Enable acceleration if query is chunkable
            cache_read: Read from Parquet cache if available
            cache_write: Write results to Parquet cache
            start_date: Filter cached data from this date (inclusive).
                Accepts datetime, date, or ISO string with timezone.
                Example: "2024-01-15T00:00:00Z" or datetime with tzinfo
            end_date: Filter cached data until this date (exclusive).
                Accepts datetime, date, or ISO string with timezone.
            coerce: Error handling mode:
                - "raise": Raise exceptions on schema validation errors (default)
                - "error": Log errors and store None for invalid values
            max_workers: Maximum parallel workers (default: 4). More workers use
                more RAM but process faster. Set to 1 for single-threaded.
                Only used when chunking_granularity is provided.
            chunking_granularity: Time granularity for chunking the query.
                Example: timedelta(days=1) chunks by day, timedelta(hours=1) by hour.
                REQUIRED for parallel execution - determines chunk boundaries.
                If None, single-worker mode is used (no parallelization).
            row_group_size: Rows per Parquet row group. If None, Rust default is used.
            flush_ram_limit_mb: RAM limit in MB for buffered data before flushing to
                Parquet. Higher values mean fewer files but more memory usage.
                (default: 512)

        Returns:
            Pandas DataFrame with results

        Raises:
            ValueError: If no schema is provided (schema is required for acceleration)
            ValueError: If date strings are not timezone-aware

        Example:
            >>> cursor = collection.find({
            ...     "timestamp": {"$gte": start, "$lt": end},
            ...     "status": "active"
            ... })
            >>> df = cursor.to_dataframe()  # Accelerated automatically
            >>>
        """
        # Schema is required for acceleration
        schema = self._collection.schema
        if schema is None:
            raise ValueError(
                "Schema is required for to_dataframe(). "
                "Provide a schema when creating the collection: "
                "xlr8_collection = xlr8.wrap(collection, schema=my_schema)"
            )

        # CRITICAL: Validate projection doesn't exclude required fields
        if self._projection:
            # Check if projection is inclusion (has 1 values) or exclusion (has 0 values)
            projection_values = [v for k, v in self._projection.items() if k != "_id"]
            is_inclusion = any(v == 1 for v in projection_values)

            # Time field must be included (required for all operations)
            if is_inclusion:
                time_in_projection = (
                    schema.time_field in self._projection
                    and self._projection[schema.time_field] == 1
                )
                if not time_in_projection:
                    raise ValueError(
                        f"Projection must include time field '{schema.time_field}'. "
                        f"Projection: {self._projection}"
                    )

            # Sort fields must be included
            if self._sort:
                for sort_field, _ in self._sort:
                    if is_inclusion:
                        if (
                            sort_field not in self._projection
                            or self._projection[sort_field] != 1
                        ):
                            raise ValueError(
                                f"Projection must include sort field '{sort_field}'. "
                                f"Cannot sort by a field that is projected out. "
                                f"Projection: {self._projection}"
                            )

        # CRITICAL: If limit() or skip() are used, fall back to PyMongo
        # Reason: Downloading all data just to return a subset is impractical
        # MongoDB can efficiently handle limit/skip operations
        if self._limit > 0 or self._skip > 0:
            logger.info(
                "limit() or skip() detected - falling back to PyMongo iteration "
                "(acceleration would be impractical for subset queries)"
            )
            # Use fresh PyMongo cursor (not self which may be exhausted)
            pymongo_cursor = self._collection.pymongo_collection.find(
                self._filter, self._projection
            )
            if self._sort:
                pymongo_cursor = pymongo_cursor.sort(self._sort)
            if self._skip:
                pymongo_cursor = pymongo_cursor.skip(self._skip)
            if self._limit:
                pymongo_cursor = pymongo_cursor.limit(self._limit)
            if self._batch_size:
                pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
            return pd.json_normalize(list(pymongo_cursor))

        # Validate sort field if specified
        if self._sort:
            sort_validation = validate_sort_field(self._sort, schema)
            if not sort_validation.is_valid:
                raise ValueError(f"Sort validation failed: {sort_validation.reason}")

        # Parse and validate date filters
        parsed_start = parse_datetime_tz_aware(start_date, "start_date")
        parsed_end = parse_datetime_tz_aware(end_date, "end_date")

        if not accelerate:
            # Fallback to regular iteration (ignores date filters)
            if parsed_start or parsed_end:
                logger.warning(
                    "start_date/end_date filters are ignored when accelerate=False"
                )
            return self._to_dataframe_regular()

        is_chunkable, reason, brackets, _ = build_brackets_for_find(
            self._filter,
            schema.time_field,
            self._sort,  # Pass sort spec for $natural detection
        )

        # Validate chunking_granularity if provided
        # CRITICAL: If chunking_granularity is None, we CANNOT chunk the query
        # because we don't know the data's time precision (could be ms, us, ns)
        if chunking_granularity is not None:
            if chunking_granularity.total_seconds() <= 0:
                raise ValueError(
                    f"chunking_granularity must be positive, got {chunking_granularity}"
                )

        if not is_chunkable:
            # REJECT mode - invalid query syntax or contradictory constraints
            # This is different from SINGLE mode (where is_chunkable=True, brackets empty)
            if parsed_start or parsed_end:
                logger.warning(
                    "start_date/end_date filters are ignored for non-chunkable queries"
                )
            logger.info("Query has invalid syntax (%s) - cannot execute", reason)
            return self._to_dataframe_accelerated(
                cache_read=cache_read,
                cache_write=cache_write,
                start_date=parsed_start,
                end_date=parsed_end,
                coerce=coerce,
                max_workers=1,  # Single worker for invalid queries
                chunking_granularity=None,  # No chunking
                is_chunkable=False,
            )

        # Check for SINGLE mode - valid query but single-worker fallback
        # Indicated by: is_chunkable=True AND empty brackets
        if is_chunkable and not brackets:
            # SINGLE mode examples: $natural sort, unbounded $or branches
            logger.info(
                "Query valid but not parallelizable (%s) - using single-worker mode",
                reason,
            )
            return self._to_dataframe_accelerated(
                cache_read=cache_read,
                cache_write=cache_write,
                start_date=parsed_start,
                end_date=parsed_end,
                coerce=coerce,
                max_workers=1,  # Single worker for SINGLE mode
                chunking_granularity=None,  # No chunking
                is_chunkable=False,
            )

        # Query IS chunkable, but do we have granularity info?
        if chunking_granularity is None:
            # No chunking_granularity provided - cannot parallelize safely
            # because we don't know how to split the time range
            logger.info(
                "Query is chunkable but chunking_granularity not provided - "
                "using single-worker mode. Provide chunking_granularity=timedelta(...) "
                "to enable parallel execution."
            )
            return self._to_dataframe_accelerated(
                cache_read=cache_read,
                cache_write=cache_write,
                start_date=parsed_start,
                end_date=parsed_end,
                coerce=coerce,
                max_workers=1,  # Single worker - no chunking info
                chunking_granularity=None,
                is_chunkable=False,  # Treat as non-chunkable since we can't chunk
                flush_ram_limit_mb=flush_ram_limit_mb,  # Pass through for cache reading
                row_group_size=row_group_size,  # Pass through for DuckDB batch
            )

        # Use accelerated parallel execution - we have chunking info!
        return self._to_dataframe_accelerated(
            cache_read=cache_read,
            cache_write=cache_write,
            start_date=parsed_start,
            end_date=parsed_end,
            coerce=coerce,
            max_workers=max_workers,
            chunking_granularity=chunking_granularity,
            is_chunkable=True,
            flush_ram_limit_mb=flush_ram_limit_mb,
            row_group_size=row_group_size,
        )

    def to_dataframe_batches(
        self,
        batch_size: int = DEFAULT_BATCH_SIZE,
        cache_read: bool = True,
        cache_write: bool = True,
        start_date: Optional[Union[datetime, date, str]] = None,
        end_date: Optional[Union[datetime, date, str]] = None,
        coerce: Literal["raise", "error"] = "raise",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        flush_ram_limit_mb: int = 512,
    ) -> Generator[pd.DataFrame, None, None]:
        """
        Yield DataFrames in batches from cache without loading all data into memory.

        This is a memory-efficient alternative to to_dataframe() for very large
        result sets. Instead of loading the entire result into memory, it yields
        smaller DataFrames that can be processed incrementally.


        MEMORY-EFFICIENT BATCH PROCESSING:

        Instead of:
            df = cursor.to_dataframe()  # Loads ALL 10M rows into RAM

        Use:
            for batch_df in cursor.to_dataframe_batches(batch_size=50000):
                process(batch_df)  # Only 50K rows in RAM at a time

        Memory usage: O(batch_size) instead of O(total_rows)


        Args:
            batch_size: Number of rows per DataFrame batch (default: 10,000)
            cache_read: Read from Parquet cache if available
            cache_write: Write results to Parquet cache on cache miss
            start_date: Filter cached data from this date (inclusive).
                Accepts datetime, date, or ISO string with timezone.
            end_date: Filter cached data until this date (exclusive).
            coerce: Error handling mode ("raise" or "error")
            max_workers: Maximum parallel workers for cache population (default: 4)
            chunking_granularity: Time granularity for chunking (required for parallel fetch)

        Yields:
            pd.DataFrame: Batches of rows as DataFrames

        Raises:
            ValueError: If no schema is provided
            ValueError: If date strings are not timezone-aware
            ValueError: If cache doesn't exist and cache_write=False

        Example:
            >>> # Process 10M rows without loading all into RAM
            >>> total = 0
            >>> for batch_df in cursor.to_dataframe_batches(batch_size=50000):
            ...     total += len(batch_df)
            ...     # Process batch_df...
            >>> logging.debug(f"Processed {total} rows")
            >>>
            >>> # With date filtering:
            >>> for batch_df in cursor.to_dataframe_batches(
            ...     batch_size=10000,
            ...     start_date="2024-06-01T00:00:00Z",
            ...     end_date="2024-06-15T00:00:00Z"
            ... ):
            ...     analyze(batch_df)
        """
        # Schema is required
        schema = self._collection.schema
        if schema is None:
            raise ValueError(
                "Schema is required for to_dataframe_batches(). "
                "Provide a schema when creating the collection."
            )

        # CRITICAL: If limit() or skip() are used, fall back to PyMongo
        # Reason: Downloading all data just to return a subset is impractical
        if self._limit > 0 or self._skip > 0:
            logger.info(
                "limit() or skip() detected - falling back to PyMongo iteration "
                "(acceleration would be impractical for subset queries)"
            )
            # Use fresh PyMongo cursor in batches (not self which may be exhausted)
            pymongo_cursor = self._collection.pymongo_collection.find(
                self._filter, self._projection
            )
            if self._sort:
                pymongo_cursor = pymongo_cursor.sort(self._sort)
            if self._skip:
                pymongo_cursor = pymongo_cursor.skip(self._skip)
            if self._limit:
                pymongo_cursor = pymongo_cursor.limit(self._limit)
            if self._batch_size:
                pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)

            batch = []
            for doc in pymongo_cursor:
                batch.append(doc)
                if len(batch) >= batch_size:
                    yield pd.DataFrame(batch)
                    batch = []
            if batch:
                yield pd.DataFrame(batch)
            return

        # CRITICAL: Validate projection doesn't exclude required fields
        if self._projection:
            projection_values = [v for k, v in self._projection.items() if k != "_id"]
            is_inclusion = any(v == 1 for v in projection_values)

            # Time field must be included
            if is_inclusion:
                time_in_projection = (
                    schema.time_field in self._projection
                    and self._projection[schema.time_field] == 1
                )
                if not time_in_projection:
                    raise ValueError(
                        f"Projection must include time field '{schema.time_field}'. "
                        f"Projection: {self._projection}"
                    )

            # Sort fields must be included
            if self._sort:
                for sort_field, _ in self._sort:
                    if is_inclusion:
                        if (
                            sort_field not in self._projection
                            or self._projection[sort_field] != 1
                        ):
                            raise ValueError(
                                f"Projection must include sort field '{sort_field}'. "
                                f"Cannot sort by a field that is projected out. "
                                f"Projection: {self._projection}"
                            )

        time_field = schema.time_field

        # Validate sort field if specified
        if self._sort:
            sort_validation = validate_sort_field(self._sort, schema)
            if not sort_validation.is_valid:
                raise ValueError(f"Sort validation failed: {sort_validation.reason}")
            logger.info(
                "Sorted streaming enabled - using DuckDB K-way merge for global sort order"
            )

        # Parse and validate date filters
        parsed_start = parse_datetime_tz_aware(start_date, "start_date")
        parsed_end = parse_datetime_tz_aware(end_date, "end_date")

        is_chunkable, reason, brackets, _ = build_brackets_for_find(
            self._filter,
            time_field,
            self._sort,  # Pass sort spec for $natural detection
        )

        # Handle REJECT mode (is_chunkable=False)
        if not is_chunkable:
            warnings.warn(
                f"Invalid query syntax ({reason}). Cannot execute this query.",
                UserWarning,
                stacklevel=2,
            )
            # Override max_workers to 1 for invalid queries
            max_workers = 1
            chunking_granularity = None

        # Handle SINGLE mode (is_chunkable=True but empty brackets)
        elif is_chunkable and not brackets:
            warnings.warn(
                f"Query valid but not parallelizable ({reason}). Using single-worker mode.",
                UserWarning,
                stacklevel=2,
            )
            # Override max_workers to 1 for SINGLE mode
            max_workers = 1
            chunking_granularity = None

        # Mark as started
        if not self._started:
            self._started = True

        # Create cache manager
        cache = CacheManager(
            filter_dict=self._filter,
            projection=self._projection,
            sort=self._sort,
        )

        # Ensure cache exists
        if not cache.exists():
            if not cache_write:
                raise ValueError(
                    "Cache does not exist and cache_write=False. "
                    "Either call to_dataframe() first to populate cache, "
                    "or set cache_write=True."
                )

            # Populate cache first
            logging.debug("[Query] Cache miss - fetching from MongoDB...")

            # Populate cache via accelerated executor
            result = execute_parallel_stream_to_cache(
                pymongo_collection=self._collection.pymongo_collection,
                filter_dict=self._filter,
                schema=schema,
                cache_manager=cache,
                projection=self._projection,
                approx_document_size_bytes=self._collection.approx_document_size_bytes,
                max_workers=max_workers,
                peak_ram_limit_mb=flush_ram_limit_mb,
                chunking_granularity=chunking_granularity,
                mongo_uri=self._collection.mongo_uri,
                sort_spec=self._sort,  # Pass sort for pre-sorting during Parquet write
                row_group_size=row_group_size,
            )

            logging.debug(
                f"\n[Cache] Cache written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
            )

        elif not cache_read and cache_write:
            # CRITICAL: cache_read=False but cache_write=True and cache exists
            # Clear old cache and re-populate to avoid duplicate data
            logging.debug(
                "[Clean] Clearing existing cache (cache_read=False, starting fresh)..."
            )
            cache.clean()

            logging.debug("[Query] Re-fetching from MongoDB...")

            # Re-populate cache via accelerated executor
            result = execute_parallel_stream_to_cache(
                pymongo_collection=self._collection.pymongo_collection,
                filter_dict=self._filter,
                schema=schema,
                cache_manager=cache,
                projection=self._projection,
                approx_document_size_bytes=self._collection.approx_document_size_bytes,
                max_workers=max_workers,
                peak_ram_limit_mb=flush_ram_limit_mb,
                chunking_granularity=chunking_granularity,
                mongo_uri=self._collection.mongo_uri,
                sort_spec=self._sort,  # Pass sort for pre-sorting during Parquet write
                row_group_size=row_group_size,
            )

            logging.debug(
                f"\n[Cache] Cache re-written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
            )

        # Now yield batches from cache
        logging.debug(f"[Cache] Streaming batches from cache: {cache.cache_dir}")
        reader = ParquetReader(cache.cache_dir)

        # Use globally sorted streaming if sort is specified
        if self._sort:
            logging.debug("[Sort] Using DuckDB K-way merge for globally sorted batches")
            yield from reader.iter_globally_sorted_batches(
                sort_spec=self._sort,  # Pass full sort spec for multi-field sorting
                batch_size=batch_size,
                schema=schema,
                time_field=time_field,
                start_date=parsed_start,
                end_date=parsed_end,
                coerce=coerce,
                memory_limit_mb=flush_ram_limit_mb,  # Pass RAM limit to DuckDB
                threads=max_workers,  # Pass thread count to DuckDB
            )
        else:
            yield from reader.iter_dataframe_batches(
                batch_size=batch_size,
                schema=schema,
                time_field=time_field,
                start_date=parsed_start,
                end_date=parsed_end,
                coerce=coerce,
            )

|
+
def stream_to_callback(
|
|
969
|
+
self,
|
|
970
|
+
callback: Callable[["pa.Table", Dict[str, Any]], None],
|
|
971
|
+
*,
|
|
972
|
+
partition_time_delta: timedelta,
|
|
973
|
+
partition_by: Optional[Union[str, List[str]]] = None,
|
|
974
|
+
any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
|
|
975
|
+
max_workers: int = 4,
|
|
976
|
+
chunking_granularity: Optional[timedelta] = None,
|
|
977
|
+
row_group_size: Optional[int] = None,
|
|
978
|
+
flush_ram_limit_mb: int = 512,
|
|
979
|
+
cache_read: bool = True,
|
|
980
|
+
cache_write: bool = True,
|
|
981
|
+
) -> Dict[str, Any]:
|
|
982
|
+
"""
|
|
983
|
+
Stream partitioned PyArrow tables to a callback function.
|
|
984
|
+
|
|
985
|
+
This is a two-phase operation:
|
|
986
|
+
1. Download data from MongoDB to local Parquet cache (reuses Rust backend)
|
|
987
|
+
2. Partition data and call callback in parallel for each partition
|
|
988
|
+
|
|
989
|
+
Perfect for populating data lakes with partitioned data structures.
|
|
990
|
+
|
|
991
|
+
┌─────────────────────────────────────────────────────────────────────┐
|
|
992
|
+
│ PARTITION MODES: │
|
|
993
|
+
│ │
|
|
994
|
+
│ TIME ONLY (partition_by=None): │
|
|
995
|
+
│ partition_time_delta=timedelta(weeks=1) │
|
|
996
|
+
│ -> 1 callback per week of data │
|
|
997
|
+
│ │
|
|
998
|
+
│ TIME + FIELD (partition_by="metadata.instrument"): │
|
|
999
|
+
│ partition_time_delta=timedelta(weeks=1) │
|
|
1000
|
+
│ -> 1 callback per (week, instrument) combination │
|
|
1001
|
+
│ │
|
|
1002
|
+
│ Example: 1 year of data, 10 instruments, weekly partitions │
|
|
1003
|
+
│ -> 52 weeks × 10 instruments = 520 callbacks │
|
|
1004
|
+
└─────────────────────────────────────────────────────────────────────┘
|
|
1005
|
+
|
|
1006
|
+
The callback receives:
|
|
1007
|
+
- table: PyArrow Table with data for this partition
|
|
1008
|
+
- metadata: Dict with partition info:
|
|
1009
|
+
{
|
|
1010
|
+
"time_start": datetime, # Start of time bucket
|
|
1011
|
+
"time_end": datetime, # End of time bucket
|
|
1012
|
+
"partition_values": {...}, # Values for partition_by fields
|
|
1013
|
+
"row_count": int, # Rows in this table
|
|
1014
|
+
"partition_index": int, # 0-based partition index
|
|
1015
|
+
"total_partitions": int, # Total partition count
|
|
1016
|
+
}
|
|
1017
|
+
|
|
1018
|
+
Args:
|
|
1019
|
+
callback: Function(table: pa.Table, metadata: dict) -> None
|
|
1020
|
+
Called for each partition. Runs in ThreadPoolExecutor.
|
|
1021
|
+
partition_time_delta: Time bucket size for partitioning.
|
|
1022
|
+
Example: timedelta(weeks=1) creates weekly partitions.
|
|
1023
|
+
REQUIRED - determines how data is grouped.
|
|
1024
|
+
partition_by: Field(s) to partition by, in addition to time.
|
|
1025
|
+
Example: "metadata.instrument" or ["region", "device_id"]
|
|
1026
|
+
Can be any field in schema except time field.
|
|
1027
|
+
None = partition by time only.
|
|
1028
|
+
any_type_strategy: How to decode Types.Any() struct columns:
|
|
1029
|
+
- "float": Coalesce to Float64, prioritize numeric (default)
|
|
1030
|
+
- "string": Convert everything to string (lossless)
|
|
1031
|
+
- "keep_struct": Keep raw struct, don't decode
|
|
1032
|
+
max_workers: Number of parallel callback threads (default: 4).
|
|
1033
|
+
DuckDB releases GIL, so threads get true parallelism.
|
|
1034
|
+
chunking_granularity: Time granularity for MongoDB fetch chunks.
|
|
1035
|
+
Used during Phase 1 (download). Example: timedelta(hours=16).
|
|
1036
|
+
If None, defaults to partition_time_delta.
|
|
1037
|
+
flush_ram_limit_mb: RAM limit for buffered data (default: 512).
|
|
1038
|
+
Used during both download and partition phases.
|
|
1039
|
+
cache_read: Read from existing cache if available (default: True).
|
|
1040
|
+
cache_write: Write to cache during download (default: True).
|
|
1041
|
+
|
|
1042
|
+
Returns:
|
|
1043
|
+
Dict with:
|
|
1044
|
+
- total_partitions: Number of partitions processed
|
|
1045
|
+
- total_rows: Total rows across all partitions
|
|
1046
|
+
- skipped_partitions: Empty partitions skipped
|
|
1047
|
+
- duration_s: Total execution time
|
|
1048
|
+
- cache_duration_s: Time spent on cache population
|
|
1049
|
+
- partition_duration_s: Time spent on partition callbacks
|
|
1050
|
+
|
|
1051
|
+
Raises:
|
|
1052
|
+
ValueError: If no schema provided
|
|
1053
|
+
ValueError: If query not chunkable (no time bounds)
|
|
1054
|
+
ValueError: If sort specified on non-time field
|
|
1055
|
+
RuntimeError: If callback fails for any partition
|
|
1056
|
+
|
|
1057
|
+
Example:
|
|
1058
|
+
>>> # Upload weekly data per instrument to S3 data lake
|
|
1059
|
+
>>> import pyarrow.parquet as pq
|
|
1060
|
+
>>> import s3fs
|
|
1061
|
+
>>>
|
|
1062
|
+
>>> fs = s3fs.S3FileSystem()
|
|
1063
|
+
>>>
|
|
1064
|
+
>>> def upload_partition(table, metadata):
|
|
1065
|
+
... instrument = metadata['partition_values'].get('metadata.instrument', 'unknown')
|
|
1066
|
+
... week = metadata['time_start'].strftime('%Y-%m-%d')
|
|
1067
|
+
... path = f"s3://bucket/data/instrument={instrument}/week={week}.parquet"
|
|
1068
|
+
... pq.write_table(table, path, filesystem=fs)
|
|
1069
|
+
>>>
|
|
1070
|
+
>>> cursor.stream_to_callback(
|
|
1071
|
+
... callback=upload_partition,
|
|
1072
|
+
... partition_time_delta=timedelta(weeks=1),
|
|
1073
|
+
... partition_by="metadata.instrument",
|
|
1074
|
+
... max_workers=8,
|
|
1075
|
+
... chunking_granularity=timedelta(hours=16),
|
|
1076
|
+
... )
|
|
1077
|
+
"""
|
|
1078
|
+
total_start = time.time()
|
|
1079
|
+
|
|
1080
|
+
schema = self._collection.schema
|
|
1081
|
+
if schema is None:
|
|
1082
|
+
raise ValueError(
|
|
1083
|
+
"Schema is required for stream_to_callback(). "
|
|
1084
|
+
"Provide a schema when creating the collection."
|
|
1085
|
+
)
|
|
1086
|
+
|
|
1087
|
+
# CRITICAL: limit() and skip() don't make sense for streaming callbacks
|
|
1088
|
+
# These operations require knowing the full result set, which defeats
|
|
1089
|
+
# the purpose of streaming
|
|
1090
|
+
if self._limit > 0 or self._skip > 0:
|
|
1091
|
+
raise ValueError(
|
|
1092
|
+
"stream_to_callback() does not support limit() or skip(). "
|
|
1093
|
+
"These operations require knowing the total result set size upfront, "
|
|
1094
|
+
"which defeats the purpose of streaming. "
|
|
1095
|
+
"Use to_dataframe() or iterate with PyMongo cursor instead."
|
|
1096
|
+
)
|
|
1097
|
+
|
|
1098
|
+
time_field = schema.time_field
|
|
1099
|
+
|
|
1100
|
+
# CRITICAL: Validate projection doesn't exclude partition_by fields
|
|
1101
|
+
if self._projection and partition_by:
|
|
1102
|
+
# Check if projection is inclusion (has 1 values) or exclusion (has 0 values)
|
|
1103
|
+
projection_values = [v for k, v in self._projection.items() if k != "_id"]
|
|
1104
|
+
is_inclusion = any(v == 1 for v in projection_values)
|
|
1105
|
+
|
|
1106
|
+
# Time field must be included
|
|
1107
|
+
if is_inclusion:
|
|
1108
|
+
time_in_projection = (
|
|
1109
|
+
time_field in self._projection and self._projection[time_field] == 1
|
|
1110
|
+
)
|
|
1111
|
+
if not time_in_projection:
|
|
1112
|
+
raise ValueError(
|
|
1113
|
+
f"Projection must include time field '{time_field}'. "
|
|
1114
|
+
f"Projection: {self._projection}"
|
|
1115
|
+
)
|
|
1116
|
+
|
|
1117
|
+
# Partition fields must be included
|
|
1118
|
+
partition_by_list = (
|
|
1119
|
+
[partition_by] if isinstance(partition_by, str) else partition_by
|
|
1120
|
+
)
|
|
1121
|
+
for field in partition_by_list:
|
|
1122
|
+
if is_inclusion:
|
|
1123
|
+
# For parent fields like "metadata", check if any child is included
|
|
1124
|
+
field_or_children_included = (
|
|
1125
|
+
field in self._projection and self._projection[field] == 1
|
|
1126
|
+
) or any(
|
|
1127
|
+
k.startswith(f"{field}.") and self._projection[k] == 1
|
|
1128
|
+
for k in self._projection.keys()
|
|
1129
|
+
)
|
|
1130
|
+
if not field_or_children_included:
|
|
1131
|
+
raise ValueError(
|
|
1132
|
+
f"Projection must include partition field '{field}'. "
|
|
1133
|
+
f"Cannot partition by a field that is projected out. "
|
|
1134
|
+
f"Projection: {self._projection}"
|
|
1135
|
+
)
|
|
1136
|
+
|
|
1137
|
+
# Validate sort fields in projection
|
|
1138
|
+
if self._projection and self._sort:
|
|
1139
|
+
projection_values = [v for k, v in self._projection.items() if k != "_id"]
|
|
1140
|
+
is_inclusion = any(v == 1 for v in projection_values)
|
|
1141
|
+
for sort_field, _ in self._sort:
|
|
1142
|
+
if is_inclusion:
|
|
1143
|
+
if (
|
|
1144
|
+
sort_field not in self._projection
|
|
1145
|
+
or self._projection[sort_field] != 1
|
|
1146
|
+
):
|
|
1147
|
+
raise ValueError(
|
|
1148
|
+
f"Projection must include sort field '{sort_field}'. "
|
|
1149
|
+
f"Projection: {self._projection}"
|
|
1150
|
+
)
|
|
1151
|
+
|
|
1152
|
+
# Validate sort - only allow time field sorting
|
|
1153
|
+
if self._sort:
|
|
1154
|
+
for field, _direction in self._sort:
|
|
1155
|
+
if field != time_field:
|
|
1156
|
+
raise ValueError(
|
|
1157
|
+
f"stream_to_callback() only supports sorting by time field '{time_field}'. "
|
|
1158
|
+
f"Got sort field: '{field}'. "
|
|
1159
|
+
"Remove .sort() or sort only by time field."
|
|
1160
|
+
)
|
|
1161
|
+
# Store sort direction
|
|
1162
|
+
sort_ascending = self._sort[0][1] == 1
|
|
1163
|
+
else:
|
|
1164
|
+
sort_ascending = True # Default to ascending
|
|
1165
|
+
|
|
1166
|
+
# Normalize partition_by to list
|
|
1167
|
+
partition_by_list: Optional[List[str]] = None
|
|
1168
|
+
if partition_by is not None:
|
|
1169
|
+
if isinstance(partition_by, str):
|
|
1170
|
+
partition_by_list = [partition_by]
|
|
1171
|
+
else:
|
|
1172
|
+
partition_by_list = list(partition_by)
|
|
1173
|
+
|
|
1174
|
+
# Validate partition_by fields exist in schema (or are parent fields with children)
|
|
1175
|
+
all_schema_fields = list(schema.fields.keys())
|
|
1176
|
+
for field in partition_by_list:
|
|
1177
|
+
if field == time_field:
|
|
1178
|
+
raise ValueError(
|
|
1179
|
+
f"Cannot partition by time field '{time_field}'. "
|
|
1180
|
+
"Time partitioning is automatic via partition_time_delta."
|
|
1181
|
+
)
|
|
1182
|
+
# Check if field exists directly OR has children
|
|
1183
|
+
has_direct = schema.has_field(field)
|
|
1184
|
+
has_children = any(f.startswith(f"{field}.") for f in all_schema_fields)
|
|
1185
|
+
if not has_direct and not has_children:
|
|
1186
|
+
raise ValueError(
|
|
1187
|
+
f"Partition field '{field}' not found in schema. "
|
|
1188
|
+
f"Available fields: {all_schema_fields}"
|
|
1189
|
+
)
|
|
1190
|
+
|
|
1191
|
+
# Default chunking_granularity to partition_time_delta
|
|
1192
|
+
if chunking_granularity is None:
|
|
1193
|
+
chunking_granularity = partition_time_delta
|
|
1194
|
+
|
|
1195
|
+
# NEW: build_brackets_for_find internally validates via is_chunkable_query
|
|
1196
|
+
is_chunkable, reason, brackets, _ = build_brackets_for_find(
|
|
1197
|
+
self._filter,
|
|
1198
|
+
time_field,
|
|
1199
|
+
self._sort, # Pass sort spec for $natural detection
|
|
1200
|
+
)
|
|
1201
|
+
|
|
1202
|
+
# Handle REJECT mode (is_chunkable=False)
|
|
1203
|
+
if not is_chunkable:
|
|
1204
|
+
warnings.warn(
|
|
1205
|
+
f"Invalid query syntax ({reason}). Cannot execute this query.",
|
|
1206
|
+
UserWarning,
|
|
1207
|
+
stacklevel=2,
|
|
1208
|
+
)
|
|
1209
|
+
# Override max_workers to 1 for invalid queries
|
|
1210
|
+
max_workers = 1
|
|
1211
|
+
chunking_granularity = None
|
|
1212
|
+
|
|
1213
|
+
# Handle SINGLE mode (is_chunkable=True but empty brackets)
|
|
1214
|
+
elif is_chunkable and not brackets:
|
|
1215
|
+
warnings.warn(
|
|
1216
|
+
f"Query valid but not parallelizable ({reason}). Using single-worker mode.",
|
|
1217
|
+
UserWarning,
|
|
1218
|
+
stacklevel=2,
|
|
1219
|
+
)
|
|
1220
|
+
# Override max_workers to 1 for SINGLE mode
|
|
1221
|
+
max_workers = 1
|
|
1222
|
+
chunking_granularity = None
|
|
1223
|
+
|
|
1224
|
+
# Mark as started
|
|
1225
|
+
if not self._started:
|
|
1226
|
+
self._started = True
|
|
1227
|
+
|
|
1228
|
+
# ─────────────────────────────────────────────────────────────────────
|
|
1229
|
+
# PHASE 1: Download to cache (reuses existing Rust backend)
|
|
1230
|
+
# ─────────────────────────────────────────────────────────────────────
|
|
1231
|
+
cache = CacheManager(
|
|
1232
|
+
filter_dict=self._filter,
|
|
1233
|
+
projection=self._projection,
|
|
1234
|
+
sort=self._sort,
|
|
1235
|
+
)
|
|
1236
|
+
|
|
1237
|
+
cache_start = time.time()
|
|
1238
|
+
|
|
1239
|
+
if cache_read and cache.exists():
|
|
1240
|
+
logging.debug(f"[Cache] Using existing cache: {cache.cache_dir}")
|
|
1241
|
+
else:
|
|
1242
|
+
if not cache_write:
|
|
1243
|
+
raise ValueError(
|
|
1244
|
+
"Cache does not exist and cache_write=False. "
|
|
1245
|
+
"Set cache_write=True to download data first."
|
|
1246
|
+
)
|
|
1247
|
+
|
|
1248
|
+
if cache.exists() and not cache_read:
|
|
1249
|
+
logging.debug("[Clean] Clearing existing cache (cache_read=False)...")
|
|
1250
|
+
cache.clean()
|
|
1251
|
+
|
|
1252
|
+
logging.debug("[Query] Downloading from MongoDB to cache...")
|
|
1253
|
+
result = execute_parallel_stream_to_cache(
|
|
1254
|
+
pymongo_collection=self._collection.pymongo_collection,
|
|
1255
|
+
filter_dict=self._filter,
|
|
1256
|
+
schema=schema,
|
|
1257
|
+
cache_manager=cache,
|
|
1258
|
+
projection=self._projection,
|
|
1259
|
+
approx_document_size_bytes=self._collection.approx_document_size_bytes,
|
|
1260
|
+
max_workers=max_workers,
|
|
1261
|
+
peak_ram_limit_mb=flush_ram_limit_mb,
|
|
1262
|
+
chunking_granularity=chunking_granularity,
|
|
1263
|
+
mongo_uri=self._collection.mongo_uri,
|
|
1264
|
+
row_group_size=row_group_size,
|
|
1265
|
+
)
|
|
1266
|
+
logging.debug(
|
|
1267
|
+
f"[Cache] Downloaded: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
|
|
1268
|
+
)
|
|
1269
|
+
|
|
1270
|
+
cache_duration = time.time() - cache_start
|
|
1271
|
+
|
|
1272
|
+
# ─────────────────────────────────────────────────────────────────────
|
|
1273
|
+
# PHASE 2: Partition and stream to callbacks
|
|
1274
|
+
# ─────────────────────────────────────────────────────────────────────
|
|
1275
|
+
|
|
1276
|
+
partition_result = execute_partitioned_callback(
|
|
1277
|
+
cache_dir=str(cache.cache_dir),
|
|
1278
|
+
schema=schema,
|
|
1279
|
+
callback=callback,
|
|
1280
|
+
partition_time_delta=partition_time_delta,
|
|
1281
|
+
partition_by=partition_by_list,
|
|
1282
|
+
any_type_strategy=any_type_strategy,
|
|
1283
|
+
max_workers=max_workers,
|
|
1284
|
+
sort_ascending=sort_ascending,
|
|
1285
|
+
memory_limit_mb=flush_ram_limit_mb,
|
|
1286
|
+
)
|
|
1287
|
+
|
|
1288
|
+
total_duration = time.time() - total_start
|
|
1289
|
+
|
|
1290
|
+
return {
|
|
1291
|
+
"total_partitions": partition_result["total_partitions"],
|
|
1292
|
+
"total_rows": partition_result["total_rows"],
|
|
1293
|
+
"skipped_partitions": partition_result["skipped_partitions"],
|
|
1294
|
+
"duration_s": total_duration,
|
|
1295
|
+
"cache_duration_s": cache_duration,
|
|
1296
|
+
"partition_duration_s": partition_result["duration_s"],
|
|
1297
|
+
}
|
|
1298
|
+
|
|
1299
|
+
    def to_polars(
        self,
        accelerate: bool = True,
        cache_read: bool = True,
        cache_write: bool = True,
        start_date: Optional[Union[datetime, date, str]] = None,
        end_date: Optional[Union[datetime, date, str]] = None,
        coerce: Literal["raise", "error"] = "raise",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
        flush_ram_limit_mb: int = 512,
    ) -> pl.DataFrame:
        """
        Convert results to Polars DataFrame with optional acceleration.

        This mirrors to_dataframe() but returns a Polars DataFrame.
        Uses ParquetReader with engine="polars" for efficient native reading.

        Args:
            accelerate: Enable acceleration if query is chunkable
            cache_read: Read from Parquet cache if available
            cache_write: Write results to Parquet cache
            start_date: Filter cached data from this date (inclusive).
                Accepts datetime, date, or ISO string with timezone.
            end_date: Filter cached data until this date (exclusive).
            coerce: Error handling mode ("raise" or "error")
            max_workers: Maximum parallel workers (default: 4)
            chunking_granularity: Time granularity for chunking (e.g., timedelta(days=1))
            row_group_size: Rows per parquet row group. If None, Rust default is used.
            any_type_strategy: How to decode Types.Any() struct columns:
                - "float": Coalesce to Float64, prioritize numeric (default)
                - "string": Convert everything to string (lossless)
                - "keep_struct": Keep raw struct, don't decode
            flush_ram_limit_mb: RAM limit in MB for buffered data before flushing
                (default: 512)

        Returns:
            Polars DataFrame with results

        Raises:
            ValueError: If no schema is provided

        Example:
            >>> cursor = collection.find({...}).sort("timestamp", 1)
            >>> df = cursor.to_polars(
            ...     max_workers=8,
            ...     chunking_granularity=timedelta(days=7),
            ...     flush_ram_limit_mb=2000,
            ... )
        """
        schema = self._collection.schema
        if schema is None:
            raise ValueError(
                "Schema is required for to_polars(). "
                "Provide a schema when creating the collection."
            )

        # CRITICAL: If limit() or skip() are used, fall back to PyMongo.
        # Reason: Downloading all data just to return a subset is impractical.
        if self._limit > 0 or self._skip > 0:
            logger.info(
                "limit() or skip() detected - falling back to PyMongo iteration "
                "(acceleration would be impractical for subset queries)"
            )
            # Use a fresh PyMongo cursor (not self, which may be exhausted)
            pymongo_cursor = self._collection.pymongo_collection.find(
                self._filter, self._projection
            )
            if self._sort:
                pymongo_cursor = pymongo_cursor.sort(self._sort)
            if self._skip:
                pymongo_cursor = pymongo_cursor.skip(self._skip)
            if self._limit:
                pymongo_cursor = pymongo_cursor.limit(self._limit)
            if self._batch_size:
                pymongo_cursor = pymongo_cursor.batch_size(self._batch_size)
            docs = list(pymongo_cursor)
            if not docs:
                return pl.DataFrame()
            return pl.DataFrame(docs)

        # CRITICAL: Validate that the projection doesn't exclude required fields
        if self._projection:
            projection_values = [v for k, v in self._projection.items() if k != "_id"]
            is_inclusion = any(v == 1 for v in projection_values)

            # Time field must be included
            if is_inclusion:
                time_in_projection = (
                    schema.time_field in self._projection
                    and self._projection[schema.time_field] == 1
                )
                if not time_in_projection:
                    raise ValueError(
                        f"Projection must include time field '{schema.time_field}'. "
                        f"Projection: {self._projection}"
                    )

            # Sort fields must be included
            if self._sort:
                for sort_field, _ in self._sort:
                    if is_inclusion:
                        if (
                            sort_field not in self._projection
                            or self._projection[sort_field] != 1
                        ):
                            raise ValueError(
                                f"Projection must include sort field '{sort_field}'. "
                                f"Cannot sort by a field that is projected out. "
                                f"Projection: {self._projection}"
                            )

        time_field = schema.time_field

        # Validate sort field if specified
        if self._sort:
            sort_validation = validate_sort_field(self._sort, schema)
            if not sort_validation.is_valid:
                raise ValueError(f"Sort validation failed: {sort_validation.reason}")

        # Parse and validate date filters
        parsed_start = parse_datetime_tz_aware(start_date, "start_date")
        parsed_end = parse_datetime_tz_aware(end_date, "end_date")

        if not accelerate:
            if parsed_start or parsed_end:
                logger.warning(
                    "start_date/end_date filters are ignored when accelerate=False"
                )
            # Fall back to regular iteration (native Polars from dicts)
            return self._to_polars_regular()

        is_chunkable, reason, brackets, _ = build_brackets_for_find(
            self._filter,
            schema.time_field,
            self._sort,  # Pass sort spec for $natural detection
        )

        # Handle REJECT mode (is_chunkable=False)
        if not is_chunkable:
            if parsed_start or parsed_end:
                logger.warning(
                    "start_date/end_date filters are ignored for non-chunkable queries"
                )
            logger.info("Invalid query syntax (%s) - cannot execute", reason)
            # Fall back to single-worker mode
            max_workers = 1
            chunking_granularity = None

        # Handle SINGLE mode (is_chunkable=True but empty brackets)
        elif is_chunkable and not brackets:
            logger.info(
                "Query valid but not parallelizable (%s) - using single-worker mode",
                reason,
            )
            # Fall back to single-worker mode
            max_workers = 1
            chunking_granularity = None

        # Create cache manager
        cache = CacheManager(
            filter_dict=self._filter,
            projection=self._projection,
            sort=self._sort,
        )

        # Check if cache exists
        if cache_read and cache.exists():
            logging.debug(f"[Cache] Reading from cache (polars): {cache.cache_dir}")
            reader = ParquetReader(cache.cache_dir)
            df = cast(
                pl.DataFrame,
                reader.to_dataframe(
                    engine="polars",
                    schema=schema,
                    time_field=time_field,
                    start_date=parsed_start,
                    end_date=parsed_end,
                    coerce=coerce,
                    any_type_strategy=any_type_strategy,
                ),
            )

            # Check if we need DuckDB sorting (Any types or List types)
            need_duckdb_sort = False
            sort_infos: List[Dict[str, Any]] = []
            if self._sort:
                sort_infos = get_sort_field_info(self._sort, schema)

                # Expand parent fields to children and collect all fields to check
                fields_to_check = []
                for info in sort_infos:
                    if info["is_parent"]:
                        # Parent field - check all children
                        fields_to_check.extend(info["child_fields"])
                    else:
                        # Direct field
                        fields_to_check.append(info["field"])

                # Check if any of the actual sort fields (after expansion) are Any/List types
                for field in fields_to_check:
                    if field in schema.fields:
                        field_type = schema.fields[field]
                        if isinstance(field_type, (AnyType, ListType)):
                            need_duckdb_sort = True
                            break

            if self._sort and need_duckdb_sort:
                # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
                logging.debug(
                    "[Sort] Using DuckDB for Types.Any()/Types.List() sorting..."
                )

                warnings.warn(
                    "Sorting by Types.Any() field in to_polars returns raw struct columns "
                    "(e.g., 'value.float_value', 'value.int64_value'). "
                    "Use to_dataframe() for decoded Any() values.",
                    UserWarning,
                )

                # Use get_globally_sorted_dataframe() - more efficient than batching
                combined_df = reader.get_globally_sorted_dataframe(
                    sort_spec=self._sort,
                    schema=schema,
                    time_field=time_field,
                    start_date=parsed_start,
                    end_date=parsed_end,
                    coerce=coerce,
                )

                if not combined_df.empty:
                    for col in combined_df.columns:
                        if combined_df[col].dtype == object:
                            first_val = (
                                combined_df[col].dropna().iloc[0]
                                if not combined_df[col].dropna().empty
                                else None
                            )
                            if (
                                first_val is not None
                                and type(first_val).__name__ == "ObjectId"
                            ):
                                combined_df[col] = combined_df[col].astype(str)
                    df = pl.from_pandas(combined_df)
                else:
                    df = pl.DataFrame()

            elif self._sort:
                # Native Polars sort - expand parent fields to children
                expanded_sort = []
                for info in sort_infos:
                    if info["is_parent"]:
                        # Expand parent field to all children
                        for child in info["child_fields"]:
                            expanded_sort.append((child, info["direction"]))
                    else:
                        expanded_sort.append((info["field"], info["direction"]))

                sort_fields = [
                    field for field, _ in expanded_sort if field in df.columns
                ]
                descending = [
                    direction == -1
                    for field, direction in expanded_sort
                    if field in df.columns
                ]
                if sort_fields:
                    df = df.sort(sort_fields, descending=descending)

            # Apply skip/limit
            if self._skip:
                df = df.slice(self._skip)
            if self._limit:
                df = df.head(self._limit)

            logging.debug(
                f"[OK] Loaded {len(df):,} documents from cache ({reader.get_statistics()['total_size_mb']:.1f} MB)"
            )
            return df

        # Cache miss - need to fetch and write
        if not cache_write:
            raise ValueError(
                "Cache does not exist and cache_write=False. "
                "Either enable cache_write or call to_dataframe() first."
            )

        # Fetch data (uses the same logic as to_dataframe)
        mode_str = (
            "parallel" if is_chunkable and chunking_granularity else "single-worker"
        )
        logging.debug(
            f"[Query] Cache miss - fetching from MongoDB ({mode_str} mode)..."
        )

        result = execute_parallel_stream_to_cache(
            pymongo_collection=self._collection.pymongo_collection,
            filter_dict=self._filter,
            schema=schema,
            cache_manager=cache,
            projection=self._projection,
            approx_document_size_bytes=self._collection.approx_document_size_bytes,
            max_workers=max_workers if is_chunkable else 1,
            peak_ram_limit_mb=flush_ram_limit_mb,
            chunking_granularity=chunking_granularity if is_chunkable else None,
            mongo_uri=self._collection.mongo_uri,
            row_group_size=row_group_size,
        )

        logging.debug(
            f"\n[Cache] Cache written: {result['total_docs']:,} docs in {result['duration_s']:.2f}s"
        )

        # Read from cache as Polars
        logging.debug("[Cache] Reading from cache to build Polars DataFrame...")
        reader = ParquetReader(cache.cache_dir)

        # Check if we need DuckDB sorting (Any types or List types)
        need_duckdb_sort = False
        sort_infos: List[Dict[str, Any]] = []
        if self._sort:
            sort_infos = get_sort_field_info(self._sort, schema)

            # Expand parent fields to children and collect all fields to check
            fields_to_check = []
            for info in sort_infos:
                if info["is_parent"]:
                    # Parent field - check all children
                    fields_to_check.extend(info["child_fields"])
                else:
                    # Direct field
                    fields_to_check.append(info["field"])

            # Check if any of the actual sort fields (after expansion) are Any/List types
            for field in fields_to_check:
                if field in schema.fields:
                    field_type = schema.fields[field]
                    if isinstance(field_type, (AnyType, ListType)):
                        need_duckdb_sort = True
                        break

        if self._sort and need_duckdb_sort:
            # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
            logging.debug("[Sort] Using DuckDB for Types.Any()/Types.List() sorting...")

            warnings.warn(
                "Sorting by Types.Any() field in to_polars returns raw struct columns "
                "(e.g., 'value.float_value', 'value.int64_value'). "
                "Use to_dataframe() for decoded Any() values.",
                UserWarning,
            )

            # Use get_globally_sorted_dataframe() - more efficient than batching
            combined_df = reader.get_globally_sorted_dataframe(
                sort_spec=self._sort,
                schema=schema,
                time_field=time_field,
                start_date=parsed_start,
                end_date=parsed_end,
                coerce=coerce,
            )

            if not combined_df.empty:
                for col in combined_df.columns:
                    if combined_df[col].dtype == object:
                        first_val = (
                            combined_df[col].dropna().iloc[0]
                            if not combined_df[col].dropna().empty
                            else None
                        )
                        if (
                            first_val is not None
                            and type(first_val).__name__ == "ObjectId"
                        ):
                            combined_df[col] = combined_df[col].astype(str)
                df = pl.from_pandas(combined_df)
            else:
                df = pl.DataFrame()
        else:
            df = cast(
                pl.DataFrame,
                reader.to_dataframe(
                    engine="polars",
                    schema=schema,
                    time_field=time_field,
                    start_date=parsed_start,
                    end_date=parsed_end,
                    coerce=coerce,
                    any_type_strategy=any_type_strategy,
                ),
            )

            # Native Polars sort - expand parent fields to children
            if self._sort:
                expanded_sort = []
                for info in sort_infos:
                    if info["is_parent"]:
                        for child in info["child_fields"]:
                            expanded_sort.append((child, info["direction"]))
                    else:
                        expanded_sort.append((info["field"], info["direction"]))

                sort_fields = [
                    field for field, _ in expanded_sort if field in df.columns
                ]
                descending = [
                    direction == -1
                    for field, direction in expanded_sort
                    if field in df.columns
                ]
                if sort_fields:
                    # Polars uses `reverse` (not `descending`) in older versions.
                    df = df.sort(sort_fields, descending=descending)

        # Apply skip/limit
        if self._skip:
            df = df.slice(self._skip)
        if self._limit:
            df = df.head(self._limit)

        return df

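    # Illustrative sketch (not part of the original source): warming the Parquet
    # cache once, then slicing it by date on later calls. Collection and field
    # names are assumptions; the keyword arguments mirror to_polars() above.
    #
    #     df_full = cursor.to_polars(cache_write=True)           # first call: fetch + cache
    #     df_january = cursor.to_polars(                          # later call: cache hit
    #         start_date="2024-01-01T00:00:00+00:00",
    #         end_date="2024-02-01T00:00:00+00:00",
    #     )
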
    def _to_dataframe_regular(self) -> pd.DataFrame:
        """
        Convert to DataFrame without acceleration.

        Uses regular PyMongo iteration. Fallback for:
        - Non-chunkable queries
        - No schema provided
        - Acceleration disabled

        Returns:
            Pandas DataFrame
        """
        # Collect all documents - __iter__ will set _started
        # Convert to DataFrame
        return pd.json_normalize(list(self))

    def _to_polars_regular(self) -> "pl.DataFrame":
        """
        Convert to Polars DataFrame without acceleration.

        Uses regular PyMongo iteration with native Polars conversion.
        Fallback for:
        - Non-chunkable queries
        - No schema provided
        - Acceleration disabled

        Returns:
            Polars DataFrame

        Note:
            Uses pl.from_dicts(), which handles nested documents by creating
            struct columns. For flattened column names like pandas json_normalize,
            you would need to unnest() afterwards.
        """
        # Collect all documents - __iter__ will set _started
        docs = list(self)

        if not docs:
            return pl.DataFrame()

        return pl.from_dicts(docs)

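    # Illustrative sketch (assumption, not original source): flattening the struct
    # columns that pl.from_dicts() produces for nested documents, as mentioned in
    # the Note above. The loop below is a guess at one way to do it, not library code.
    #
    #     df = cursor._to_polars_regular()
    #     for name, dtype in zip(df.columns, df.dtypes):
    #         if isinstance(dtype, pl.Struct):
    #             df = df.unnest(name)   # child fields become top-level columns
    #
    # This roughly mirrors what pandas json_normalize does in _to_dataframe_regular().
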
    def _to_dataframe_accelerated(
        self,
        cache_read: bool,
        cache_write: bool,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        coerce: Literal["raise", "error"] = "raise",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        is_chunkable: bool = True,
        flush_ram_limit_mb: int = 512,
        row_group_size: Optional[int] = None,
    ) -> pd.DataFrame:
        """
        Convert to DataFrame using parallel execution with Parquet caching.

        ┌─────────────────────────────────────────────────────────────────────
        │ DATA FLOW - ACCELERATED EXECUTION:
        │
        │ This is where the XLR8 magic happens. The flow is:
        │
        │ 1. CACHE CHECK
        │    Input: self._filter hashed to "abc123def"
        │    Check: Does .cache/abc123def/*.parquet exist?
        │    If yes -> Read directly from Parquet (instant!)
        │
        │ 2. CACHE MISS -> PARALLEL FETCH (if chunkable)
        │    Calls: execute_parallel_stream_to_cache()
        │    Which does:
        │      a) Build brackets from query (analysis/brackets.py)
        │         Query -> [Bracket(static_filter, time_range), ...]
        │      b) Plan execution (execution/planner.py)
        │         Time range + RAM -> workers=N, batch_size=M
        │      c) Chunk time ranges (analysis/chunker.py)
        │         6 months -> X chunks based on granularity
        │      d) Parallel fetch (Rust backend fetch_chunks_bson)
        │         N async workers pull chunks from queue
        │      e) Stream to Parquet (Rust backend)
        │         Each worker writes part files: part_0000.parquet, etc.
        │
        │ 2b. CACHE MISS -> SINGLE-WORKER FETCH (if not chunkable)
        │     - Single worker fetches all data
        │     - No async, no chunking
        │     - Still writes to Parquet for caching
        │
        │ 3. READ FROM CACHE
        │    After fetch, read the Parquet files we just wrote
        │    Optionally filter by start_date/end_date
        │    Returns: pandas DataFrame with original values
        │
        │ EXAMPLE TIMING (500K docs):
        │    - Cache hit: 0.5s (read Parquet)
        │    - Cache miss: 10-15s (parallel fetch + write + read)
        │    - Without XLR8: 30-40s (sequential cursor iteration)
        └─────────────────────────────────────────────────────────────────────

        Args:
            cache_read: Read from cache if available
            cache_write: Write to cache after fetching
            start_date: Filter cached data from this date (inclusive, tz-aware)
            end_date: Filter cached data until this date (exclusive, tz-aware)
            coerce: Error handling mode ("raise" or "error")
            max_workers: Maximum parallel workers (passed from to_dataframe)
            chunking_granularity: Time granularity for chunking (passed from to_dataframe)
            is_chunkable: Whether query is chunkable (determines parallel vs single-worker)

        Returns:
            Pandas DataFrame with accelerated query results
        """
        schema = self._collection.schema
        time_field = schema.time_field

        # Mark as started
        if not self._started:
            self._started = True

        # ─────────────────────────────────────────────────────────────────────
        # STEP 1: Create cache manager (hashes query to a unique directory)
        # Example: filter_dict hashes to "abc123def" -> .cache/abc123def/
        # ─────────────────────────────────────────────────────────────────────
        cache = CacheManager(
            filter_dict=self._filter,
            projection=self._projection,
            sort=self._sort,
        )

        # ─────────────────────────────────────────────────────────────────────
        # STEP 2: Check cache - if hit, read directly from Parquet
        # Example: .cache/abc123def/ts_1704067200_1704070800_part_0000.parquet
        # ─────────────────────────────────────────────────────────────────────
        if cache_read and cache.exists():
            logging.debug(f"[Cache] Reading from cache: {cache.cache_dir}")
            reader = ParquetReader(cache.cache_dir)

            # Check if we need DuckDB sorting (Any types or List types)
            need_duckdb_sort = False
            sort_infos: List[Dict[str, Any]] = []
            if self._sort:
                sort_infos = get_sort_field_info(self._sort, schema)

                # Expand parent fields to children and collect all fields to check
                fields_to_check = []
                for info in sort_infos:
                    if info["is_parent"]:
                        # Parent field - check all children
                        fields_to_check.extend(info["child_fields"])
                    else:
                        # Direct field
                        fields_to_check.append(info["field"])

                # Check if any of the actual sort fields (after expansion) are Any/List types
                for field in fields_to_check:
                    if field in schema.fields:
                        field_type = schema.fields[field]
                        if isinstance(field_type, (AnyType, ListType)):
                            need_duckdb_sort = True
                            break

            if self._sort and need_duckdb_sort:
                # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
                logging.debug(
                    "[Sort] Using DuckDB for Types.Any()/Types.List() sorting..."
                )
                df = cast(
                    pd.DataFrame,
                    reader.get_globally_sorted_dataframe(
                        sort_spec=self._sort,
                        schema=schema,
                        time_field=time_field,
                        start_date=start_date,
                        end_date=end_date,
                        coerce=coerce,
                        memory_limit_mb=flush_ram_limit_mb,
                        threads=max_workers,
                    ),
                )
            else:
                # Normal read + native pandas sort
                df = cast(
                    pd.DataFrame,
                    reader.to_dataframe(
                        engine="pandas",
                        schema=schema,
                        time_field=time_field,
                        start_date=start_date,
                        end_date=end_date,
                        coerce=coerce,
                    ),
                )

                # Native pandas sort - expand parent fields to children
                if self._sort:
                    expanded_sort = []
                    for info in sort_infos:
                        if info["is_parent"]:
                            for child in info["child_fields"]:
                                expanded_sort.append((child, info["direction"]))
                        else:
                            expanded_sort.append((info["field"], info["direction"]))

                    sort_fields = [
                        field for field, _ in expanded_sort if field in df.columns
                    ]
                    ascending = [
                        direction == 1
                        for field, direction in expanded_sort
                        if field in df.columns
                    ]
                    if sort_fields:
                        df = df.sort_values(
                            by=sort_fields, ascending=ascending, na_position="last"
                        )
                        logger.debug("Sorted DataFrame by %s", sort_fields)

            # Apply skip/limit if set
            if self._skip:
                df = df.iloc[self._skip :]
            if self._limit:
                df = df.iloc[: self._limit]

            filter_info = ""
            if start_date or end_date:
                filter_info = f" (filtered: {start_date} to {end_date})"
            logging.debug(
                f"[OK] Loaded {len(df):,} documents from cache{filter_info} ({reader.get_statistics()['total_size_mb']:.1f} MB)"
            )
            return cast(pd.DataFrame, df)

        # ─────────────────────────────────────────────────────────────────────
        # STEP 3: Cache miss - execute fetch and stream to Parquet
        # This is where the heavy lifting happens
        # ─────────────────────────────────────────────────────────────────────
        mode_str = "parallel" if is_chunkable else "single-worker"
        logging.debug(
            f"[Query] Cache miss - fetching from MongoDB ({mode_str} mode)..."
        )

        if cache_write:
            # CRITICAL: If cache_read=False but cache_write=True and cache exists,
            # we need to clear the old cache first to avoid duplicate data
            if not cache_read and cache.exists():
                logging.debug(
                    "Clearing existing cache (cache_read=False, starting fresh)..."
                )
                cache.clean()
            # chunking_granularity is passed from to_dataframe();
            # if None, execute_parallel_stream_to_cache will use single-worker mode

            # Streaming path: fetch -> encode -> write Parquet (memory efficient)
            result = execute_parallel_stream_to_cache(
                pymongo_collection=self._collection.pymongo_collection,
                filter_dict=self._filter,
                schema=schema,
                cache_manager=cache,
                projection=self._projection,
                approx_document_size_bytes=self._collection.approx_document_size_bytes,
                max_workers=max_workers,  # From to_dataframe() parameter
                peak_ram_limit_mb=flush_ram_limit_mb,
                chunking_granularity=chunking_granularity,  # None = single-worker mode
                mongo_uri=self._collection.mongo_uri,
                row_group_size=row_group_size,
            )

            logging.debug("\n[Cache] Cache written:")
            logging.debug(f"  - Total docs: {result['total_docs']:,}")
            logging.debug(f"  - Total files: {result['total_files']}")
            logging.debug(f"  - Workers: {result['workers']}")
            logging.debug(f"  - Duration: {result['duration_s']:.2f}s")
            logging.debug(f"  - Cache dir: {cache.cache_dir}")

            # Now read from cache to build DataFrame (with optional date filter)
            logging.debug("\n[Cache] Reading from cache to build DataFrame...")
            reader = ParquetReader(cache.cache_dir)

            # Check if we need DuckDB sorting (Any types or List types)
            need_duckdb_sort = False
            sort_infos: List[Dict[str, Any]] = []
            if self._sort:
                sort_infos = get_sort_field_info(self._sort, schema)

                # Expand parent fields to children and collect all fields to check
                fields_to_check = []
                for info in sort_infos:
                    if info["is_parent"]:
                        # Parent field - check all children
                        fields_to_check.extend(info["child_fields"])
                    else:
                        # Direct field
                        fields_to_check.append(info["field"])

                # Check if any of the actual sort fields (after expansion) are Any/List types
                for field in fields_to_check:
                    if field in schema.fields:
                        field_type = schema.fields[field]
                        if isinstance(field_type, (AnyType, ListType)):
                            need_duckdb_sort = True
                            break

            if self._sort and need_duckdb_sort:
                # Use DuckDB for Any/List type sorting (requires BSON type ordering / array sorting)
                logging.debug(
                    "[Sort] Using DuckDB for Types.Any()/Types.List() sorting..."
                )
                df = cast(
                    pd.DataFrame,
                    reader.get_globally_sorted_dataframe(
                        sort_spec=self._sort,
                        schema=schema,
                        time_field=time_field,
                        start_date=start_date,
                        end_date=end_date,
                        coerce=coerce,
                        memory_limit_mb=flush_ram_limit_mb,
                        threads=max_workers,
                    ),
                )
            else:
                # Normal read + native pandas sort
                df = cast(
                    pd.DataFrame,
                    reader.to_dataframe(
                        engine="pandas",
                        schema=schema,
                        time_field=time_field,
                        start_date=start_date,
                        end_date=end_date,
                        coerce=coerce,
                    ),
                )

                # Native pandas sort - expand parent fields to children
                if self._sort:
                    expanded_sort = []
                    for info in sort_infos:
                        if info["is_parent"]:
                            for child in info["child_fields"]:
                                expanded_sort.append((child, info["direction"]))
                        else:
                            expanded_sort.append((info["field"], info["direction"]))

                    sort_fields = [
                        field for field, _ in expanded_sort if field in df.columns
                    ]
                    ascending = [
                        direction == 1
                        for field, direction in expanded_sort
                        if field in df.columns
                    ]
                    if sort_fields:
                        df = df.sort_values(
                            by=sort_fields, ascending=ascending, na_position="last"
                        )
                        logger.debug("Sorted DataFrame by %s", sort_fields)

        else:
            # cache_write=False is not supported in single-worker mode.
            # Always write to cache for consistency and performance.
            raise ValueError(
                "cache_write=False is not supported. "
                "XLR8 always writes to Parquet cache for memory efficiency. "
                "Set cache_read=False if you don't want to read from existing cache."
            )

        # Apply skip/limit if set
        if self._skip:
            df = df.iloc[self._skip :]
        if self._limit:
            df = df.iloc[: self._limit]

        return cast(pd.DataFrame, df)

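    # Illustrative sketch (assumption, not the actual CacheManager internals):
    # the docstring above only says the filter hashes to a directory name like
    # ".cache/abc123def/". One plausible way such a key could be derived is shown
    # below; the hashing scheme is a guess for illustration only.
    #
    #     import hashlib, json
    #     from pathlib import Path
    #
    #     key_material = json.dumps(
    #         {"filter": filter_dict, "projection": projection, "sort": sort},
    #         sort_keys=True,
    #         default=str,
    #     )
    #     query_hash = hashlib.sha256(key_material.encode()).hexdigest()[:9]
    #     cache_dir = Path(".cache") / query_hash   # same query shape -> same directory
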
    def explain_acceleration(self) -> Dict[str, Any]:
        """
        Get the query execution plan.

        Returns an explanation of how the query will be executed:
        - Whether acceleration is possible
        - Time bounds extracted
        - Estimated chunk count
        - Worker configuration

        Returns:
            Dict with execution plan details
        """
        schema = self._collection.schema

        result: Dict[str, Any] = {
            "filter": self._filter,
            "projection": self._projection,
            "skip": self._skip,
            "limit": self._limit,
            "sort": self._sort,
            "accelerated": False,
        }

        if schema is None:
            result["reason"] = "No schema provided"
            return result

        # build_brackets_for_find internally validates via is_chunkable_query
        is_chunkable, reason, brackets, bounds = build_brackets_for_find(
            self._filter,
            schema.time_field,
            self._sort,  # Pass sort spec for $natural detection
        )

        result["is_chunkable"] = is_chunkable
        result["reason"] = reason

        # Distinguish REJECT vs SINGLE modes
        if not is_chunkable:
            # REJECT mode
            result["mode"] = "reject"
        elif is_chunkable and not brackets:
            # SINGLE mode - valid but not parallelizable
            result["mode"] = "single"
        else:
            # PARALLEL mode
            result["mode"] = "parallel"

        if is_chunkable and bounds and bounds[0] and bounds[1]:
            start_bound = bounds[0]
            end_bound = bounds[1]

            result["time_bounds"] = {
                "start": start_bound.isoformat(),
                "end": end_bound.isoformat(),
            }

            chunks = chunk_time_range(
                start_bound, end_bound, chunk_size=timedelta(days=1)
            )
            result["estimated_chunks"] = len(chunks)

            result["accelerated"] = True

        return result
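    # Illustrative sketch (not part of the original source): inspecting the plan
    # before running an expensive query. Field names in the filter are assumptions;
    # the keys of the returned dict match the code directly above.
    #
    #     plan = collection.find(
    #         {"timestamp": {"$gte": start, "$lt": end}, "sensor_id": "A1"}
    #     ).explain_acceleration()
    #     # plan might look like:
    #     # {
    #     #     "accelerated": True,
    #     #     "mode": "parallel",
    #     #     "is_chunkable": True,
    #     #     "time_bounds": {"start": "...", "end": "..."},
    #     #     "estimated_chunks": 183,   # illustrative: ~6 months of 1-day chunks
    #     #     ...
    #     # }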