xlr8 0.1.7b3__cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,500 @@
+ """
+ Execution coordinator for parallel query execution.
+
+ ================================================================================
+ DATA FLOW - EXECUTION ORCHESTRATION
+ ================================================================================
+
+ This is the HEART of XLR8. It coordinates the entire parallel fetch pipeline.
+
+ EXECUTION FLOW:
+ ────────────────────────────────────────────────────────────────────────────────
+
+ execute_parallel_stream_to_cache() is called with:
+   - pymongo_collection: The MongoDB collection
+   - filter_dict: The user's query
+   - schema: Type definitions for encoding
+   - cache_manager: Where to write Parquet files
+
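+ Illustrative call (a sketch with made-up collection, filter values, and
+ bindings; the keyword names match the signature in this module):
+
+     stats = execute_parallel_stream_to_cache(
+         pymongo_collection=db["sensor_data"],
+         filter_dict={"timestamp": {"$gte": jan, "$lt": jul}},
+         schema=schema,
+         cache_manager=cache_manager,
+         max_workers=10,
+         peak_ram_limit_mb=2000,
+         mongo_uri="mongodb://localhost:27017",
+     )
+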
+ ┌─────────────────────────────────────────────────────────────────────────────┐
+ │ STEP 1: BUILD BRACKETS │
+ │ Query -> List[Bracket] │
+ │ │
+ │ Example: │
+ │ {"$or": [...], "timestamp": {...}} │
+ │ v │
+ │ [Bracket(static={"sensor_id": "64a..."}, time=Jan-Jul), │
+ │ Bracket(static={"sensor_id": "64b..."}, time=Jan-Jul)] │
+ └─────────────────────────────────────────────────────────────────────────────┘
+ v
+ ┌─────────────────────────────────────────────────────────────────────────────┐
+ │ STEP 2: BUILD EXECUTION PLAN │
+ │ Time range + RAM budget -> workers, batch_size, chunk_size │
+ │ │
+ │ Example (6-month range, 2000MB RAM, max 10 workers): │
+ │ ExecutionPlan( │
+ │ worker_count=10, │
+ │ batch_size_docs=50000, │
+ │ chunk_size=timedelta(days=1), │
+ │ estimated_ram_mb=1800 │
+ │ ) │
+ └─────────────────────────────────────────────────────────────────────────────┘
+ v
+ ┌─────────────────────────────────────────────────────────────────────────────┐
+ │ STEP 3: CHUNK TIME RANGES │
+ │ Each bracket's time range -> multiple chunks │
+ │ │
+ │ Example (Bracket 1 with Jan-Jul range, 14-day chunks): │
+ │ -> Chunk 1.1: Jan 1-15 with filter {"sensor_id": "64a..."} │
+ │ -> Chunk 1.2: Jan 15-29 with filter {"sensor_id": "64a..."} │
+ │ -> ... │
+ │ -> Chunk 1.13: Jun 17 - Jul 1 │
+ │ │
+ │ Total: 13 chunks x 2 brackets = 26 work items │
+ └─────────────────────────────────────────────────────────────────────────────┘
+ v
+ ┌─────────────────────────────────────────────────────────────────────────────┐
+ │ STEP 4: PARALLEL RUST FETCH (rust_backend.fetch_chunks_bson) │
+ │ Rust backend processes all chunks concurrently in parallel workers │
+ │ │
+ │ Worker 0: Grabs Chunk 1 -> Fetch 45K docs -> Write part_0000.parquet │
+ │ Worker 1: Grabs Chunk 2 -> Fetch 52K docs -> Write part_0001.parquet │
+ │ ... │
+ │ Worker 9: Grabs Chunk 10 -> Fetch 38K docs -> Write part_0009.parquet │
+ │ │
+ │ All I/O happens in Rust (GIL-free, tokio async MongoDB client) │
+ │ Workers pull more chunks as they finish until the queue is empty │
+ └─────────────────────────────────────────────────────────────────────────────┘
+ v
+ ┌─────────────────────────────────────────────────────────────────────────────┐
+ │ STEP 5: RETURN STATS │
+ │ { │
+ │ "total_docs": 500000, │
+ │ "total_files": 26, │
+ │ "duration_s": 12.5, │
+ │ "workers": 10 │
+ │ } │
+ └─────────────────────────────────────────────────────────────────────────────┘
+
+ ================================================================================
+ """
+
+ import json
+ import logging
+ import warnings
+ from collections import defaultdict
+ from datetime import datetime, timedelta
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
+
+ from bson import encode as bson_encode
+
+ from xlr8.analysis import chunk_time_range
+ from xlr8.analysis.brackets import build_brackets_for_find
+ from xlr8.analysis.inspector import validate_query_for_chunking
+ from xlr8.execution.planner import build_execution_plan
+ from xlr8.schema import Schema
+
+ logger = logging.getLogger(__name__)
+
+
+ def execute_parallel_stream_to_cache(
+     pymongo_collection,
+     filter_dict: Dict[str, Any],
+     schema: Schema,
+     cache_manager: Any,
+     *,
+     projection: Optional[Dict[str, int]] = None,
+     approx_document_size_bytes: int = 500,
+     available_ram_gb: Optional[float] = None,
+     max_workers: int = 4,
+     peak_ram_limit_mb: int = 512,
+     chunking_granularity: Optional[timedelta] = None,
+     mongo_uri: Union[str, Callable[[], str], None] = None,
+     sort_spec: Optional[List[Tuple[str, int]]] = None,
+     row_group_size: Optional[int] = None,
+ ) -> Dict[str, Any]:
+     """Execute query with streaming to Parquet cache.
+
+     Uses bracket-based chunking and memory-aware execution planning.
+     Streams results directly to cache shards.
+
+     Args:
+         pymongo_collection: PyMongo collection instance
+         filter_dict: MongoDB query filter
+         schema: Schema for Parquet encoding
+         cache_manager: CacheManager instance
+         projection: MongoDB projection
+         approx_document_size_bytes: Average doc size for RAM planning
+         available_ram_gb: Override RAM detection
+         max_workers: Maximum workers (default: 4)
+         peak_ram_limit_mb: RAM budget (default: 512)
+         chunking_granularity: Time granularity for chunking (e.g.,
+             `timedelta(minutes=10)`). If None, uses single-worker mode
+             without chunking.
+         mongo_uri: MongoDB connection string or callable that returns one.
+         sort_spec: Sort specification for pre-sorting during cache write.
+             Format: `[(field, direction), ...]` where direction is `1` (ASC)
+             or `-1` (DESC), e.g. `[("timestamp", 1)]`.
+         row_group_size: Optional Parquet row group size forwarded to the
+             Rust writer.
+
+     Returns:
+         Dict with total_docs, total_files, duration_s, workers, and cache_dir
+     """
+
+     # High-level safety: reject forbidden operators / nested $or
+     is_valid, reason = validate_query_for_chunking(
+         filter_dict,
+         schema.time_field,
+     )
+
+     # NOTE: We no longer use a single-worker fallback. All queries go through
+     # the parallel Rust path, even if they have no time bounds (unbounded).
+     # The brackets algorithm handles unbounded queries as unchunked brackets.
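+     # For example, a filter like {"sensor_id": "64a..."} with no timestamp
+     # bounds becomes a single unchunked bracket executed as one query
+     # (illustrative value).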
+
+     if not is_valid:
+         # Reject queries with forbidden operators (geospatial, $expr, etc.)
+         raise ValueError(f"Query not executable: {reason}")
+
+     # Build brackets (static_filter + timerange) from the query
+     ok, bracket_reason, brackets, _ = build_brackets_for_find(
+         filter_dict, schema.time_field
+     )
+     if not ok:
+         warnings.warn(
+             (
+                 f"Query not chunkable ({bracket_reason}). "
+                 "Falling back to single-worker mode."
+             ),
+             UserWarning,
+             stacklevel=2,
+         )
+         # Fall back to single-worker, unchunked execution
+         max_workers = 1
+         chunking_granularity = None
+         # When the query is not chunkable, brackets will be empty and we
+         # execute the entire query without brackets.
+
+     if not brackets:
+         # No brackets or non-chunkable query - execute full query in single worker
+         if max_workers == 1 and chunking_granularity is None:
+             # This is expected for non-chunkable queries.
+             # Create a single "bracket" with the original filter, no time chunking.
+             logger.info("Non-chunkable query - executing as single unchunked query")
+             # We'll handle this as a special case below
+             brackets = []
+             start_time = None
+             end_time = None
+         else:
+             raise ValueError("No time ranges found in chunkable query")
+     else:
+         # Separate full and partial brackets for planning
+         full_brackets_plan = [b for b in brackets if b.timerange.is_full]
+
+         # Derive the time span ONLY from full brackets (partial ones are
+         # executed unchunked)
+         if full_brackets_plan:
+             los = [
+                 b.timerange.lo
+                 for b in full_brackets_plan
+                 if b.timerange.lo is not None
+             ]
+             his = [
+                 b.timerange.hi
+                 for b in full_brackets_plan
+                 if b.timerange.hi is not None
+             ]
+             start_time = min(los)
+             end_time = max(his)
+         else:
+             # All brackets are unchunked (partial or unbounded)
+             start_time = None
+             end_time = None
+             logger.info(
+                 "All brackets are unchunked (partial/unbounded) - "
+                 "will execute as single queries"
+             )
+
+     # Get avg doc size from schema (default to approx_document_size_bytes)
+     avg_doc_size = getattr(schema, "avg_doc_size_bytes", approx_document_size_bytes)
+
+     effective_peak_ram_mb = peak_ram_limit_mb
+     if available_ram_gb is not None:
+         effective_peak_ram_mb = int(available_ram_gb * 1024)
+
+     # Count unchunked queries (partial + unbounded brackets).
+     # Non-chunkable queries (empty brackets) count as 1 unchunked query.
+     unchunked_brackets_count = (
+         sum(1 for b in brackets if not b.timerange.is_full) if brackets else 1
+     )
+
+     exec_plan = build_execution_plan(
+         start_time=start_time,
+         end_time=end_time,
+         avg_doc_size_bytes=avg_doc_size,
+         max_workers=max_workers,
+         peak_ram_limit_mb=effective_peak_ram_mb,
+         chunking_granularity=chunking_granularity,
+         # Always passed: the planner combines unchunked queries with time chunks.
+         num_unchunked_queries=unchunked_brackets_count,
+     )
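+
+     # The resulting plan drives everything below. Illustrative field values
+     # (they depend on the time span and RAM budget):
+     #   exec_plan.worker_count=10, exec_plan.batch_size_docs=50_000,
+     #   exec_plan.chunk_size=timedelta(days=1)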
+
+     # Build chunks (optimization): group brackets by time range.
+     # If multiple $or branches share the same time range, combine them into one
+     # $or query per chunk instead of creating separate chunks for each branch,
+     # as sketched below.
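+     #
+     # Illustrative shape (made-up field and values): two brackets sharing
+     # Jan 1-15 would yield one chunk filter such as
+     #   {"$or": [{"sensor_id": "64a..."}, {"sensor_id": "64b..."}],
+     #    "timestamp": {"$gte": <Jan 1>, "$lt": <Jan 15>}}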
+
+     # Handle non-chunkable queries (empty brackets)
+     if not brackets:
+         # Execute entire query as single unchunked query
+         chunks = [(filter_dict, 0, None, None)]
+         logger.info("Executing non-chunkable query as single chunk")
+     else:
+         # Separate full and unchunked brackets
+         full_brackets: List = []
+         unchunked_brackets: List = []
+
+         for b in brackets:
+             if b.timerange.is_full:
+                 full_brackets.append(b)
+             else:
+                 unchunked_brackets.append(b)
+
+         brackets_by_timerange: Dict[Tuple[datetime, datetime], List] = defaultdict(
+             list
+         )
+
+         for b in full_brackets:
+             brackets_by_timerange[(b.timerange.lo, b.timerange.hi)].append(b)
+
+         time_chunks_per_bracket: List[
+             Tuple[Dict[str, Any], int, datetime, datetime]
+         ] = []
+         chunk_index = 0
+
+         # Process full brackets - chunk them
+         for (lo, hi), bracket_group in brackets_by_timerange.items():
+             br_chunks = chunk_time_range(
+                 start=lo,
+                 end=hi,
+                 chunk_size=exec_plan.chunk_size,
+             )
+
+             for c_start, c_end in br_chunks:
+                 # Determine if this is the last chunk so we can preserve the
+                 # original end-boundary operator.
+                 is_last_chunk = c_end == hi
+
+                 if len(bracket_group) == 1:
+                     # Single bracket - simple filter
+                     chunk_filter = dict(bracket_group[0].static_filter)
+                     time_clause = {}
+
+                     # Lower bound: always $gte for chunk starts
+                     time_clause["$gte"] = c_start
+
+                     # Upper bound: use original operator if last chunk, else $lt
+                     if is_last_chunk and bracket_group[0].timerange.hi_inclusive:
+                         time_clause["$lte"] = c_end
+                     else:
+                         time_clause["$lt"] = c_end
+
+                     chunk_filter[schema.time_field] = time_clause
+                 else:
+                     # Multiple brackets with the same time range - combine with $or
+                     or_branches = []
+                     for b in bracket_group:
+                         branch = dict(b.static_filter)
+                         or_branches.append(branch)
+
+                     time_clause = {}
+                     time_clause["$gte"] = c_start
+
+                     # Use original operator if last chunk and ANY bracket is inclusive
+                     if is_last_chunk and any(
+                         b.timerange.hi_inclusive for b in bracket_group
+                     ):
+                         time_clause["$lte"] = c_end
+                     else:
+                         time_clause["$lt"] = c_end
+
+                     chunk_filter = {"$or": or_branches, schema.time_field: time_clause}
+
+                 time_chunks_per_bracket.append(
+                     (chunk_filter, chunk_index, c_start, c_end)
+                 )
+                 chunk_index += 1
+
+         # Process unchunked brackets (partial + unbounded) - execute as single
+         # queries
+         for b in unchunked_brackets:
+             chunk_filter = dict(b.static_filter)
+             time_clause = {}
+             if b.timerange.lo is not None:
+                 if b.timerange.lo_inclusive:
+                     time_clause["$gte"] = b.timerange.lo
+                 else:
+                     time_clause["$gt"] = b.timerange.lo
+             if b.timerange.hi is not None:
+                 if b.timerange.hi_inclusive:
+                     time_clause["$lte"] = b.timerange.hi
+                 else:
+                     time_clause["$lt"] = b.timerange.hi
+             if time_clause:
+                 chunk_filter[schema.time_field] = time_clause
+
+             time_chunks_per_bracket.append(
+                 (chunk_filter, chunk_index, b.timerange.lo, b.timerange.hi)
+             )
+             chunk_index += 1
+             logger.info("Added unchunked bracket as single query: %s", chunk_filter)
+
+     chunks = time_chunks_per_bracket
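+     # chunks is a list of (filter, chunk_index, start, end) tuples; an
+     # illustrative entry (made-up values):
+     #   ({"sensor_id": "64a...", "timestamp": {"$gte": ..., "$lt": ...}}, 0,
+     #    datetime(2024, 1, 1), datetime(2024, 1, 15))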
+
+     # Cap worker count to the actual number of chunks - no point having idle workers
+     actual_worker_count = min(exec_plan.worker_count, len(chunks))
+
+     logger.debug("\n[Plan] Execution Plan:")
+     if start_time is not None and end_time is not None:
+         logger.debug(f" - Date range: {start_time.date()} to {end_time.date()}")
+         logger.debug(f" - Total days: {(end_time - start_time).days}")
+     else:
+         logger.debug(" - Mode: Unchunked queries (no time range)")
+     logger.debug(f" - Chunks: {len(chunks)}")
+     logger.debug(f" - Workers: {actual_worker_count}")
+     chunk_seconds = int(exec_plan.chunk_size.total_seconds())
+     if chunk_seconds > 0:
+         if chunk_seconds >= 86400:
+             logger.debug(f" - Chunk size: {chunk_seconds // 86400} day(s)")
+         elif chunk_seconds >= 3600:
+             logger.debug(f" - Chunk size: {chunk_seconds // 3600} hour(s)")
+         elif chunk_seconds >= 60:
+             logger.debug(f" - Chunk size: {chunk_seconds // 60} minute(s)")
+         else:
+             logger.debug(f" - Chunk size: {chunk_seconds} second(s)")
+     logger.debug(f" - Batch size: {exec_plan.batch_size_docs:,}")
+     logger.debug(f" - Flush trigger/worker: {exec_plan.flush_trigger_mb} MB")
+     logger.debug(f" - Max Estimated RAM Usage: {exec_plan.estimated_ram_mb:,} MB")
+
+     # =========================================================================
+     # RUST BACKEND EXECUTION (Phase 2 - Full GIL-free implementation)
+     # =========================================================================
+     # All MongoDB fetching, BSON decoding, Arrow conversion, and Parquet writing
+     # happens in Rust with NO Python GIL contention.
+     #
+     # Python's role: memory planning, chunking, BSON serialization, result reading
+     # Rust's role: MongoDB client, async/parallel fetch, BSON->Arrow->Parquet
+     # =========================================================================
+
+     # Validate that mongo_uri is provided
+     if mongo_uri is None:
+         raise ValueError(
+             "mongo_uri is required for Rust backend execution. "
+             "Pass it to XLR8Collection constructor or accelerate()."
+         )
+
+     # Resolve callable if needed
+     resolved_uri = mongo_uri() if callable(mongo_uri) else mongo_uri
+
+     # Ensure cache directory exists
+     cache_manager.ensure_cache_dir()
+
+     # Serialize chunks to BSON (handles ObjectId, datetime, etc.)
+     chunks_bson = serialize_chunks_for_rust(chunks)
+
+     logger.info(
+         "Serialized %d chunks to %d BSON bytes for Rust backend",
+         len(chunks),
+         len(chunks_bson),
+     )
+
+     # Get MongoDB connection details
+     db_name = pymongo_collection.database.name
+     collection_name = pymongo_collection.name
+
+     # Prepare schema JSON for Rust
+     schema_json = json.dumps(schema.to_spec())
+
+     logger.debug(" - Mode: RUST BACKEND (GIL-free, tokio async)")
+
+     # Call the Rust backend directly!
+     from xlr8 import rust_backend
+
+     rust_kwargs: Dict[str, Any] = {
+         "mongodb_uri": resolved_uri,
+         "db_name": db_name,
+         "collection_name": collection_name,
+         "chunks_bson": chunks_bson,
+         "schema_json": schema_json,
+         "cache_dir": str(cache_manager.cache_dir),
+         "num_workers": actual_worker_count,
+         "batch_size": exec_plan.batch_size_docs,
+         "flush_trigger_mb": exec_plan.flush_trigger_mb,
+         # Use the getattr() fallback computed above so a schema without an
+         # avg_doc_size_bytes attribute cannot raise AttributeError here.
+         "avg_doc_size_bytes": avg_doc_size,
+         "sort_spec_json": json.dumps(sort_spec) if sort_spec else "null",
+         "time_field": schema.time_field,
+         "projection_json": json.dumps(projection) if projection else "null",
+     }
+     if row_group_size is not None:
+         rust_kwargs["row_group_size"] = row_group_size
+
+     result = rust_backend.fetch_chunks_bson(**rust_kwargs)
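+     # Illustrative result shape (the keys are the ones read below; the values
+     # are made up):
+     #   {"total_docs": 500000, "total_files": 26, "duration_secs": 12.5}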
+
+     logger.info(
+         "Rust execution complete: %d docs, %d files, %.2fs",
+         result["total_docs"],
+         result["total_files"],
+         result["duration_secs"],
+     )
+
+     # Convert the Rust result format to match the Python pool format
+     # (for compatibility with existing result-reading code)
+     result["workers"] = actual_worker_count
+     result["duration_s"] = result["duration_secs"]  # Add Python format key
+     result["cache_dir"] = str(cache_manager.cache_dir)
+
+     return result
+
+
+ # ============================================================================
+ # RUST BACKEND INTEGRATION (Phase 2)
+ # ============================================================================
+
+
+ def serialize_chunks_for_rust(
+     chunks: Sequence[
+         Tuple[Dict[str, Any], int, Optional[datetime], Optional[datetime]]
+     ],
+ ) -> bytes:
+     """
+     Serialize chunks to BSON bytes for the Rust backend.
+
+     This function converts Python chunks (which may contain ObjectId, datetime,
+     and other BSON types) into BSON bytes that Rust can deserialize correctly.
+
+     Args:
+         chunks: List of (filter, chunk_idx, c_start, c_end) tuples from the
+             executor
+
+     Returns:
+         BSON-encoded bytes ready for Rust's fetch_chunks_bson()
+
+     Example:
+         chunks = [
+             ({"metadata.instrument": "AUD_CAD", "timestamp": {...}}, 0, start, end)
+         ]
+         bson_bytes = serialize_chunks_for_rust(chunks)
+         # Pass bson_bytes to rust_backend.fetch_chunks_bson(chunks_bson=bson_bytes)
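+
+         # A round-trip sanity check (bson.decode mirrors bson.encode; the
+         # chunk values above are made up):
+         from bson import decode as bson_decode
+         assert bson_decode(bson_bytes)["chunks"][0]["chunk_idx"] == 0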
+     """
+     bson_chunks = []
+
+     for chunk_filter, chunk_idx, c_start, c_end in chunks:
+         chunk_doc = {
+             "filter": chunk_filter,  # Contains ObjectId, datetime, etc.
+             "chunk_idx": chunk_idx,
+         }
+         # Handle None timestamps for partial brackets (unbounded queries)
+         if c_start is not None:
+             chunk_doc["start_ms"] = int(c_start.timestamp() * 1000)
+         if c_end is not None:
+             chunk_doc["end_ms"] = int(c_end.timestamp() * 1000)
+         bson_chunks.append(chunk_doc)
+
+     # Wrap in a document (Rust expects {"chunks": [...]})
+     wrapper = {"chunks": bson_chunks}
+
+     # Encode to BSON bytes using pymongo's bson module
+     return bson_encode(wrapper)
+
+
+ __all__ = [
+     "execute_parallel_stream_to_cache",
+     "serialize_chunks_for_rust",
+ ]