xlr8 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,377 @@
+ """
+ Execution Planner for XLR8.
+ 
+ ================================================================================
+ MEMORY MODEL FOR RUST BACKEND
+ ================================================================================
+ 
+ The Rust backend uses a memory-aware buffering system to control RAM usage
+ during parallel MongoDB fetches. Key concepts:
+ 
+ 1. BSON DOCUMENT MEMORY OVERHEAD (15x Multiplier)
+    When MongoDB sends documents over the wire (avg_doc_size_bytes), they expand
+    to ~15x their serialized size in memory due to heap allocations, pointers,
+    and HashMap structures. Measured at 14.8x; rounded up to 15x for safety.
+ 
+ 2. BUFFER MANAGEMENT
+    Each async worker maintains its own MemoryAwareBuffer that:
+    - Tracks estimated memory using the 15x multiplier
+    - Flushes to Parquet when estimated memory >= flush_trigger_mb
+    - Dynamically calibrates after the first 10 documents
+ 
+ 3. MEMORY FORMULA
+    Given the user's flush_ram_limit_mb and max_workers:
+ 
+    Per-Worker Allocation:
+        available_ram     = flush_ram_limit_mb - BASELINE_MB
+        cursor_overhead   = max_workers * CURSOR_OVERHEAD_MB_PER_WORKER
+        ram_for_data      = available_ram - cursor_overhead
+        worker_allocation = ram_for_data / max_workers
+        flush_trigger_mb  = worker_allocation  # Rust handles 15x internally
+ 
+ ================================================================================
+ """
+ 
+ import logging
+ from dataclasses import dataclass
+ from datetime import datetime, timedelta
+ from enum import Enum
+ from typing import Optional, Union
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ # =============================================================================
+ # BACKEND CONFIGURATION
+ # =============================================================================
+ 
+ 
+ class Backend(Enum):
+     """Supported execution backends."""
+ 
+     RUST = "rust"
+     PYTHON = "python"  # Future use
+ 
+ 
+ @dataclass(frozen=True)
+ class BackendConfig:
+     """
+     Configuration constants for a specific backend.
+ 
+     All values are empirically measured. See:
+     - Rust: rust/xlr8_rust/tests/doc_memory_test.rs
+     - Python: tests/test_schema_memory.py
+     """
+ 
+     # Baseline memory before any data processing
+     baseline_mb: int
+ 
+     # Memory expansion factor during flush/encoding
+     # - Python: Arrow conversion spike (lists + arrays coexist)
+     # - Rust: BSON Document heap overhead (15x serialized size)
+     memory_multiplier: float
+ 
+     # Per-worker MongoDB cursor overhead
+     cursor_overhead_mb: int
+ 
+     # Memory retention factor (Python GC holds onto freed memory)
+     retention_factor: float
+ 
+     # Description for logging
+     description: str
+ 
+ 
+ # Rust backend: optimized for async workers in single process
+ RUST_CONFIG = BackendConfig(
+     baseline_mb=7,  # Minimal Rust runtime overhead
+     memory_multiplier=15.0,  # BSON Document heap overhead (measured 14.8x)
+     cursor_overhead_mb=8,  # Async MongoDB cursor buffer
+     retention_factor=1.0,  # Rust drops immediately, no GC retention
+     description="Rust async (single process, tokio threads)",
+ )
+ 
+ # Python backend: reserved for future non-Rust implementation
+ PYTHON_CONFIG = BackendConfig(
+     baseline_mb=120,  # pymongo + pandas + pyarrow imports
+     memory_multiplier=3.0,  # Arrow conversion spike
+     cursor_overhead_mb=16,  # Python async cursor overhead
+     retention_factor=1.25,  # Python GC retention
+     description="Python async (future implementation)",
+ )
+ 
+ # Default backend for current implementation
+ DEFAULT_BACKEND = Backend.RUST
+ DEFAULT_CONFIG = RUST_CONFIG
+ 
+ 
+ # =============================================================================
+ # SHARED CONSTANTS
+ # =============================================================================
+ 
+ # MongoDB cursor efficiency: below this, network overhead dominates
+ MIN_BATCH_SIZE = 2_000
+ 
+ # Buffer headroom for in-flight batch (flush check happens after batch added)
+ BATCH_HEADROOM_RATIO = 0.2
+ 
+ 
+ # =============================================================================
+ # EXECUTION PLAN
+ # =============================================================================
+ 
+ 
+ @dataclass
+ class ExecutionPlan:
+     """
+     Execution plan for parallel query execution.
+ 
+     Attributes:
+         worker_count: Number of parallel workers
+         batch_size_docs: Documents per MongoDB cursor batch
+         chunk_size: Time chunk size as timedelta
+         estimated_ram_mb: Estimated peak RAM usage
+         flush_trigger_mb: Memory threshold to trigger buffer flush (per worker)
+     """
+ 
+     worker_count: int
+     batch_size_docs: int
+     chunk_size: timedelta
+     estimated_ram_mb: int
+     flush_trigger_mb: int
+ 
+ 
+ # =============================================================================
+ # MEMORY CALCULATION
+ # =============================================================================
+ 
+ 
+ def calculate_flush_trigger(
+     peak_ram_limit_mb: int,
+     worker_count: int,
+     avg_doc_size_bytes: int,
+     config: BackendConfig = DEFAULT_CONFIG,
+ ) -> tuple[int, int]:
+     """
+     Calculate flush trigger and batch size from memory constraints.
+ 
+     This is the core memory planning function. It divides available RAM
+     among workers while accounting for baseline overhead and cursor buffers.
+ 
+     Args:
+         peak_ram_limit_mb: Total RAM budget from user
+         worker_count: Number of parallel workers
+         avg_doc_size_bytes: Average document size for batch sizing
+         config: Backend-specific memory constants
+ 
+     Returns:
+         Tuple of (flush_trigger_mb, batch_size_docs)
+ 
+     Example:
+         >>> trigger, batch = calculate_flush_trigger(5000, 16, 250)
+         >>> print(f"Per-worker: {trigger}MB, batch: {batch} docs")
+         Per-worker: 243MB, batch: 255066 docs
+     """
+     # Available RAM after baseline overhead
+     available_ram_mb = peak_ram_limit_mb - config.baseline_mb
+ 
+     if available_ram_mb <= 0:
+         raise ValueError(
+             f"peak_ram_limit_mb ({peak_ram_limit_mb} MB) must be greater than "
+             f"baseline ({config.baseline_mb} MB). "
+             f"Minimum viable: {config.baseline_mb + 50} MB."
+         )
+ 
+     # Account for GC retention (Python holds onto freed memory)
+     effective_ram_mb = available_ram_mb / config.retention_factor
+ 
+     # Subtract cursor overhead (each worker has a live MongoDB cursor)
+     cursor_overhead_total = worker_count * config.cursor_overhead_mb
+     ram_for_data = effective_ram_mb - cursor_overhead_total
+ 
+     # Ensure we have at least some RAM for data
+     ram_for_data = max(ram_for_data, worker_count * 1)  # At least 1 MB per worker
+ 
+     # Each worker's allocation
+     worker_allocation_mb = ram_for_data / worker_count
+ 
+     # For Rust backend: the 15x multiplier is handled INSIDE the Rust buffer
+     # So flush_trigger_mb is the actual MB limit the buffer should use
+     # No need to divide by memory_multiplier here - Rust does that internally
+ 
+     # Split: 80% flush trigger, 20% batch headroom
+     flush_trigger_mb = worker_allocation_mb * (1 - BATCH_HEADROOM_RATIO)
+     batch_headroom_mb = worker_allocation_mb * BATCH_HEADROOM_RATIO
+ 
+     # Batch size from headroom
+     batch_headroom_bytes = batch_headroom_mb * 1024 * 1024
+     batch_size_docs = int(batch_headroom_bytes / avg_doc_size_bytes)
+ 
+     # Floor at MIN_BATCH_SIZE for MongoDB efficiency
+     batch_size_docs = max(MIN_BATCH_SIZE, batch_size_docs)
+ 
+     # Floor flush trigger at 1 MB (sanity check)
+     flush_trigger_mb = max(1, int(flush_trigger_mb))
+ 
+     return flush_trigger_mb, batch_size_docs
+ 
+ 
+ def build_execution_plan(
+     start_time: Union[datetime, None],
+     end_time: Union[datetime, None],
+     avg_doc_size_bytes: int,
+     max_workers: int = 4,
+     peak_ram_limit_mb: int = 512,
+     chunking_granularity: Optional[timedelta] = timedelta(hours=8),
+     num_unchunked_queries: Optional[int] = None,
+     backend: Backend = DEFAULT_BACKEND,
+ ) -> ExecutionPlan:
+     """
+     Build an execution plan for a time-range query, unchunked queries, or both.
+ 
+     All parameters are derived from user inputs and empirically measured
+     constants; there are no arbitrary hardcoded values.
+ 
+     Work items = (time chunks from full brackets) + (unchunked queries)
+ 
+     Unchunked queries include:
+     - Partial brackets: one-sided time bound (e.g., $gte only)
+     - Unbounded brackets: no time bounds at all
+ 
+     Args:
+         start_time: Query start time (None if no chunkable brackets)
+         end_time: Query end time (None if no chunkable brackets)
+         avg_doc_size_bytes: Average document size from the schema
+         max_workers: Maximum workers (user-specified)
+         peak_ram_limit_mb: Total RAM budget (user-specified)
+         chunking_granularity: Time chunk size (optional, for time-range mode)
+         num_unchunked_queries: Number of unchunked queries
+             (partial + unbounded brackets)
+         backend: Execution backend (RUST or PYTHON)
+ 
+     Returns:
+         ExecutionPlan with memory-safe settings
+ 
+     Raises:
+         ValueError: If the RAM budget is too low for a single worker
+         ValueError: If there are no work items (neither a time range nor unchunked queries)
+     """
+     config = RUST_CONFIG if backend == Backend.RUST else PYTHON_CONFIG
+ 
+     # ==========================================================================
+     # CALCULATE TOTAL WORK ITEMS
+     # ==========================================================================
+     # Total work = (time chunks from full brackets) + (unchunked queries)
+     # Unchunked queries = partial brackets + unbounded brackets
+     # ==========================================================================
+     chunk_size_seconds: Optional[int] = None
+     time_chunks = 0
+ 
+     if start_time is not None and end_time is not None:
+         # Calculate time chunks from chunkable (full) brackets
+         time_range = end_time - start_time
+         time_range_seconds = max(1, time_range.total_seconds())
+ 
+         if chunking_granularity is not None:
+             chunk_size_seconds = int(chunking_granularity.total_seconds())
+             time_chunks = max(
+                 1,
+                 int(
+                     (time_range_seconds + chunk_size_seconds - 1) // chunk_size_seconds
+                 ),
+             )
+         else:
+             # No granularity specified, treat as single chunk
+             time_chunks = 1
+ 
+     # Add unchunked queries (partial + unbounded brackets)
+     unchunked = num_unchunked_queries or 0
+     num_chunks = time_chunks + unchunked
+ 
+     if num_chunks == 0:
+         raise ValueError(
+             "No work items found. Either (start_time, end_time) "
+             "or num_unchunked_queries must be provided to determine work distribution."
+         )
+ 
+     # ==========================================================================
+     # DETERMINE WORKER COUNT
+     # ==========================================================================
+     # Can't have more workers than chunks
+     worker_count = min(max_workers, num_chunks)
+     worker_count = max(1, worker_count)
+ 
+     # Check if we have enough RAM for this many workers
+     available_ram_mb = peak_ram_limit_mb - config.baseline_mb
+     effective_ram = available_ram_mb / config.retention_factor
+     min_ram_per_worker = config.cursor_overhead_mb + 1  # Cursor + 1MB buffer
+     max_workers_for_ram = max(1, int(effective_ram / min_ram_per_worker))
+ 
+     if worker_count > max_workers_for_ram:
+         logger.warning(
+             "RAM budget too tight for %d workers. Reducing to %d workers. "
+             "Consider increasing peak_ram_limit_mb from %d MB.",
+             worker_count,
+             max_workers_for_ram,
+             peak_ram_limit_mb,
+         )
+         worker_count = max_workers_for_ram
+ 
+     # ==========================================================================
+     # CALCULATE MEMORY PARAMETERS
+     # ==========================================================================
+     flush_trigger_mb, batch_size_docs = calculate_flush_trigger(
+         peak_ram_limit_mb=peak_ram_limit_mb,
+         worker_count=worker_count,
+         avg_doc_size_bytes=avg_doc_size_bytes,
+         config=config,
+     )
+ 
+     # Warn if flush trigger is very small
+     if flush_trigger_mb < 5:
+         logger.warning(
+             "Low memory budget results in %d MB flush trigger per worker. "
+             "This may create many small Parquet files. Consider reducing max_workers "
+             "from %d or increasing peak_ram_limit_mb from %d MB.",
+             flush_trigger_mb,
+             max_workers,
+             peak_ram_limit_mb,
+         )
+ 
+     # ==========================================================================
+     # ESTIMATE PEAK RAM USAGE
+     # ==========================================================================
+     cursor_overhead_total = worker_count * config.cursor_overhead_mb
+     # For Rust: memory_multiplier is handled inside buffer, not here
+     # Estimate is: baseline + cursors + (flush_trigger x workers)
+     data_buffers = worker_count * flush_trigger_mb
+ 
+     allocated = cursor_overhead_total + data_buffers
+     estimated_ram_mb = int(config.baseline_mb + allocated * config.retention_factor)
+     estimated_ram_mb = min(estimated_ram_mb, peak_ram_limit_mb)
+ 
+     # Store chunk size as timedelta
+     chunk_size_td = (
+         timedelta(seconds=chunk_size_seconds)
+         if chunk_size_seconds is not None
+         else timedelta(days=1)
+     )
+ 
+     return ExecutionPlan(
+         worker_count=worker_count,
+         batch_size_docs=batch_size_docs,
+         chunk_size=chunk_size_td,
+         estimated_ram_mb=estimated_ram_mb,
+         flush_trigger_mb=flush_trigger_mb,
+     )
+ 
+ 
+ __all__ = [
+     "Backend",
+     "BackendConfig",
+     "RUST_CONFIG",
+     "PYTHON_CONFIG",
+     "DEFAULT_BACKEND",
+     "DEFAULT_CONFIG",
+     "ExecutionPlan",
+     "calculate_flush_trigger",
+     "build_execution_plan",
+ ]
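
To make the memory formula concrete, here is a small self-contained sketch that mirrors the Rust-backend constants (RUST_CONFIG) and the arithmetic of calculate_flush_trigger and the work-item count in build_execution_plan. It does not import the package (the file path of the planner module is not shown in this diff), and all input values are hypothetical.

import math
from datetime import timedelta

# Constants as defined in the planner above for the Rust backend
BASELINE_MB = 7
CURSOR_OVERHEAD_MB = 8
RETENTION_FACTOR = 1.0
BATCH_HEADROOM_RATIO = 0.2
MIN_BATCH_SIZE = 2_000

# Hypothetical user inputs
peak_ram_limit_mb = 2048
max_workers = 8
avg_doc_size_bytes = 512

# Work items: a 26-hour window at 8-hour granularity plus 2 unchunked queries.
# math.ceil here is equivalent to the integer ceiling used in build_execution_plan.
time_chunks = math.ceil(timedelta(hours=26) / timedelta(hours=8))  # 4
num_chunks = time_chunks + 2                                       # 6 work items
worker_count = min(max_workers, num_chunks)                        # 6 workers

# Per-worker allocation, following calculate_flush_trigger
available_ram = peak_ram_limit_mb - BASELINE_MB                    # 2041
effective_ram = available_ram / RETENTION_FACTOR                   # 2041.0 (no GC retention in Rust)
ram_for_data = effective_ram - worker_count * CURSOR_OVERHEAD_MB   # 1993.0
worker_allocation = ram_for_data / worker_count                    # ~332.2 MB

flush_trigger_mb = int(worker_allocation * (1 - BATCH_HEADROOM_RATIO))  # 265 MB
headroom_bytes = worker_allocation * BATCH_HEADROOM_RATIO * 1024 * 1024
batch_size_docs = max(MIN_BATCH_SIZE, int(headroom_bytes / avg_doc_size_bytes))  # ~136,000 docs

print(worker_count, flush_trigger_mb, batch_size_docs)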
xlr8/py.typed ADDED
@@ -0,0 +1 @@
+ # Marker file for PEP 561 - indicates this package supports type checking
xlr8/rust_backend.py ADDED
@@ -0,0 +1,42 @@
+ """Python bridge for the native Rust backend.
+ 
+ This module provides the Rust-accelerated functions for XLR8.
+ The Rust backend is REQUIRED - not optional.
+ 
+ Why Rust?
+ ---------
+ - 10-15x faster than Python for encoding/decoding operations
+ - GIL-free execution enables true parallelism
+ - Zero-copy Arrow operations
+ - Memory-efficient BSON processing
+ 
+ Key Functions:
+ --------------
+ - fetch_chunks_bson: Parallel MongoDB fetches with GIL-free execution
+ - encode_any_values_to_arrow: Fast encoding for Types.Any fields
+ - decode_any_struct_arrow: Fast decoding from Arrow structs
+ 
+ All functions are implemented in Rust (see rust/xlr8_rust/) and
+ exposed via PyO3 bindings.
+ 
+ Usage:
+ ------
+     from xlr8.rust_backend import encode_any_values_to_arrow
+ 
+     values = [42.5, "hello", None, True]
+     arrow_array = encode_any_values_to_arrow(values)
+ """
+ 
+ import _xlr8_rust as _native  # type: ignore[import-not-found]
+ 
+ # GIL-FREE: BSON-based chunks (Phase 1 integration)
+ # Accepts BSON-serialized chunks for proper ObjectId, datetime handling
+ fetch_chunks_bson = _native.fetch_chunks_bson
+ 
+ # Fast Arrow-native decoder - takes PyArrow StructArray directly
+ # Operates on Arrow memory directly for ~44x speedup vs Python iteration
+ decode_any_struct_arrow = _native.decode_any_struct_arrow
+ 
+ # Fast Arrow-native encoder - takes Python list, returns PyArrow StructArray
+ # Operates directly in Rust for ~10x speedup vs Python iteration
+ encode_any_values_to_arrow = _native.encode_any_values_to_arrow
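
A minimal round-trip sketch of the bridge functions, based on the usage shown in the module docstring. It assumes the compiled _xlr8_rust extension is installed alongside the package; the Arrow layout is the 13-field "Any" struct described in the type stubs that follow, and whether round-tripped values compare exactly equal is not shown in this diff.

from xlr8.rust_backend import decode_any_struct_arrow, encode_any_values_to_arrow

values = [42.5, "hello", None, True]

# Encode mixed Python values into the "Any" StructArray.
arrow_array = encode_any_values_to_arrow(values)

# Decode back to plain Python values (documented as the inverse operation).
decoded = decode_any_struct_arrow(arrow_array)
print(decoded)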
xlr8/rust_backend.pyi ADDED
@@ -0,0 +1,71 @@
+ """Type stubs for rust_backend module.
+ 
+ Provides type hints for Rust-compiled functions to enable intellisense.
+ """
+ 
+ from typing import Any, Dict, List
+ 
+ def fetch_chunks_bson(
+     mongodb_uri: str,
+     db_name: str,
+     collection_name: str,
+     chunks_bson: bytes,
+     schema_json: str,
+     cache_dir: str,
+     num_workers: int,
+     batch_size: int,
+     flush_trigger_mb: int,
+     avg_doc_size_bytes: int,
+     sort_spec_json: str,
+     time_field: str,
+     projection_json: str,
+     row_group_size: int | None = None,
+ ) -> Dict[str, Any]:
+     """Fetch MongoDB documents in parallel chunks and write to Parquet.
+ 
+     Args:
+         mongodb_uri: MongoDB connection string
+         db_name: Database name
+         collection_name: Collection name
+         chunks_bson: BSON-encoded chunk definitions
+         schema_json: JSON string describing Arrow schema
+         cache_dir: Directory where Parquet files will be written
+         num_workers: Number of parallel workers
+         batch_size: Documents per MongoDB batch
+         flush_trigger_mb: Memory threshold for flushing to disk (MB)
+         avg_doc_size_bytes: Average document size for memory estimation
+         sort_spec_json: JSON sort specification
+         time_field: Field name containing timestamps
+         projection_json: MongoDB projection as JSON
+         row_group_size: Parquet row group size (None = use Arrow default)
+ 
+     Returns:
+         Dictionary with total_docs, total_files, duration_secs
+     """
+     ...
+ 
+ def decode_any_struct_arrow(
+     arrow_array: Any,  # pyarrow.StructArray
+ ) -> List[Any]:
+     """Decode PyArrow StructArray (Any type) to Python values.
+ 
+     Args:
+         arrow_array: PyArrow StructArray with 13-field Any encoding
+ 
+     Returns:
+         List of decoded Python values
+     """
+     ...
+ 
+ def encode_any_values_to_arrow(
+     values: List[Any],
+ ) -> Any:  # pyarrow.StructArray
+     """Encode Python values to PyArrow StructArray (Any type).
+ 
+     Args:
+         values: List of Python values to encode
+ 
+     Returns:
+         PyArrow StructArray with 13-field Any encoding
+     """
+     ...
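
The stub above lists every parameter the native fetch expects. As a hedged illustration of how the planner's output might feed this call, here is a hypothetical mapping helper; the field-to-parameter correspondence and all literal values are assumptions, and the chunks_bson/schema_json payload formats are defined on the Rust side and not shown in this diff.

from typing import Any, Dict

def plan_to_fetch_kwargs(plan: Any, chunks_bson: bytes, schema_json: str) -> Dict[str, Any]:
    # Hypothetical wiring sketch, not taken from the package.
    return {
        "mongodb_uri": "mongodb://localhost:27017",  # placeholder
        "db_name": "mydb",                           # placeholder
        "collection_name": "events",                 # placeholder
        "chunks_bson": chunks_bson,
        "schema_json": schema_json,
        "cache_dir": "/tmp/xlr8_cache",              # placeholder
        "num_workers": plan.worker_count,            # assumed mapping from ExecutionPlan
        "batch_size": plan.batch_size_docs,          # assumed mapping
        "flush_trigger_mb": plan.flush_trigger_mb,   # assumed mapping
        "avg_doc_size_bytes": 512,                   # placeholder
        "sort_spec_json": "{}",
        "time_field": "timestamp",                   # placeholder
        "projection_json": "{}",
        "row_group_size": None,
    }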
@@ -0,0 +1,42 @@
+ """
+ Schema system for XLR8.
+ 
+ Provides types, schema definitions, and value encoding for MongoDB documents.
+ """
+ 
+ # Import types module for Types.X syntax
+ from . import types as Types
+ from .encoder import ValueEncoder
+ from .schema import Schema
+ from .types import (
+     Any,
+     BaseType,
+     Bool,
+     Float,
+     Int,
+     List,
+     ObjectId,
+     String,
+     Struct,
+     Timestamp,
+ )
+ 
+ __all__ = [
+     # Types module for Types.X syntax
+     "Types",
+     # Individual type classes
+     "BaseType",
+     "String",
+     "Int",
+     "Float",
+     "Bool",
+     "Timestamp",
+     "ObjectId",
+     "Any",
+     "Struct",
+     "List",
+     # Schema
+     "Schema",
+     # Encoder
+     "ValueEncoder",
+ ]
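
This __init__ exposes the same type classes two ways: through the Types namespace and as direct imports. A small sketch of both access styles follows; it assumes this hunk is the xlr8/schema package __init__ (the file header for this hunk is missing from the diff), and how these types are passed to Schema is not shown here, so no schema is constructed.

# Import path is a guess based on the "xlr8/..." paths of the other files above.
from xlr8.schema import Int, String, Timestamp, Types

# "Types" is the types module re-exported under an alias, so Types.X and the
# directly imported classes refer to the same objects.
assert Types.Int is Int
assert Types.String is String
assert Types.Timestamp is Timestamp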