xlr8-0.1.7b2-cp313-cp313-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlr8/__init__.py +113 -0
- xlr8/_xlr8_rust.cpython-313-darwin.so +0 -0
- xlr8/_xlr8_rust.pyi +71 -0
- xlr8/analysis/__init__.py +58 -0
- xlr8/analysis/brackets.py +1201 -0
- xlr8/analysis/chunker.py +118 -0
- xlr8/analysis/inspector.py +1889 -0
- xlr8/collection/__init__.py +6 -0
- xlr8/collection/cursor.py +2155 -0
- xlr8/collection/cursor.pyi +104 -0
- xlr8/collection/wrapper.py +399 -0
- xlr8/collection/wrapper.pyi +61 -0
- xlr8/constants.py +24 -0
- xlr8/execution/__init__.py +43 -0
- xlr8/execution/callback.py +792 -0
- xlr8/execution/executor.py +500 -0
- xlr8/execution/planner.py +377 -0
- xlr8/py.typed +1 -0
- xlr8/rust_backend.py +40 -0
- xlr8/rust_backend.pyi +71 -0
- xlr8/schema/__init__.py +42 -0
- xlr8/schema/encoder.py +235 -0
- xlr8/schema/schema.py +265 -0
- xlr8/schema/types.py +239 -0
- xlr8/storage/__init__.py +17 -0
- xlr8/storage/cache.py +228 -0
- xlr8/storage/reader.py +1369 -0
- xlr8-0.1.7b2.dist-info/METADATA +176 -0
- xlr8-0.1.7b2.dist-info/RECORD +31 -0
- xlr8-0.1.7b2.dist-info/WHEEL +4 -0
- xlr8-0.1.7b2.dist-info/licenses/LICENSE +201 -0
xlr8/collection/cursor.pyi
ADDED

@@ -0,0 +1,104 @@
"""Type stubs for XLR8 Cursor.

Inherits from PyMongoCursor for full IDE autocomplete of PyMongo methods.
Adds 4 XLR8-specific methods for DataFrame/Polars conversion.
"""

from __future__ import annotations

from datetime import date, datetime, timedelta
from typing import (
    Any,
    Callable,
    Dict,
    Generator,
    List,
    Literal,
    Optional,
    Tuple,
    Union,
)

import pandas as pd
import polars as pl
import pyarrow as pa
from pymongo.cursor import Cursor as PyMongoCursor

class XLR8Cursor(PyMongoCursor):
    """PyMongo-compatible cursor with optional acceleration.

    Inherits all PyMongo cursor methods. Adds 4 XLR8-specific methods:
    - to_dataframe(): Convert to Pandas DataFrame with acceleration
    - to_polars(): Convert to Polars DataFrame with acceleration
    - to_dataframe_batches(): Memory-efficient batch streaming
    - stream_to_callback(): Partitioned PyArrow table callbacks
    """

    def __init__(
        self,
        collection: Any,
        query_filter: Dict[str, Any],
        projection: Optional[Dict[str, Any]] = None,
        skip: int = 0,
        limit: int = 0,
        sort: Optional[List[Tuple[str, int]]] = None,
        batch_size: int = 1000,
    ) -> None: ...

    # XLR8-specific accelerated methods
    def to_dataframe(
        self,
        accelerate: bool = True,
        cache_read: bool = True,
        cache_write: bool = True,
        start_date: Optional[Union[datetime, date, str]] = None,
        end_date: Optional[Union[datetime, date, str]] = None,
        coerce: Literal["raise", "error"] = "raise",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        flush_ram_limit_mb: int = 512,
    ) -> pd.DataFrame: ...
    def to_polars(
        self,
        accelerate: bool = True,
        cache_read: bool = True,
        cache_write: bool = True,
        start_date: Optional[Union[datetime, date, str]] = None,
        end_date: Optional[Union[datetime, date, str]] = None,
        coerce: Literal["raise", "error"] = "raise",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
        flush_ram_limit_mb: int = 512,
    ) -> pl.DataFrame: ...
    def to_dataframe_batches(
        self,
        batch_size: int = 10000,
        cache_read: bool = True,
        cache_write: bool = True,
        start_date: Optional[Union[datetime, date, str]] = None,
        end_date: Optional[Union[datetime, date, str]] = None,
        coerce: Literal["raise", "error"] = "raise",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        flush_ram_limit_mb: int = 512,
    ) -> Generator[pd.DataFrame, None, None]: ...
    def stream_to_callback(
        self,
        callback: Callable[[pa.Table, Dict[str, Any]], None],
        *,
        partition_time_delta: timedelta,
        partition_by: Optional[Union[str, List[str]]] = None,
        any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        flush_ram_limit_mb: int = 512,
        cache_read: bool = True,
        cache_write: bool = True,
    ) -> Dict[str, Any]: ...
    def raw_cursor(self) -> PyMongoCursor: ...
    def explain_acceleration(self) -> Dict[str, Any]: ...
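Taken together, the stub above defines the entire accelerated read surface. For orientation, a short usage sketch derived only from those signatures; `col`, `start`, `end`, `handle_batch`, and `write_partition` are placeholders, not names from the package:

from datetime import timedelta

import pyarrow as pa

# Placeholders: `col` is an accelerated collection, `start`/`end` are datetimes.
cursor = col.find({"timestamp": {"$gte": start, "$lt": end}})

# Eager conversion with per-query tuning knobs
df = cursor.to_dataframe(max_workers=8, flush_ram_limit_mb=1024)
pl_df = cursor.to_polars(any_type_strategy="string")

# Memory-efficient streaming: one pandas DataFrame per batch
for batch in cursor.to_dataframe_batches(batch_size=50_000):
    handle_batch(batch)  # hypothetical consumer

# Partitioned PyArrow callbacks (parameters after `callback` are keyword-only)
def on_partition(table: pa.Table, meta: dict) -> None:
    write_partition(table, meta)  # hypothetical sink

stats = cursor.stream_to_callback(
    on_partition,
    partition_time_delta=timedelta(days=1),
    partition_by="sensor_id",
)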
xlr8/collection/wrapper.py
ADDED

@@ -0,0 +1,399 @@
"""
XLR8 collection wrapper with PyMongo compatibility.

================================================================================
DATA FLOW - COLLECTION WRAPPER
================================================================================

This module wraps pymongo.collection.Collection to provide the `accelerate()`
function - the main entry point for users.

TYPICAL USAGE FLOW:
────────────────────────────────────────────────────────────────────────────────

1. USER WRAPS A COLLECTION:
┌────────────────────────────────────────────────────────────────────────────┐
│ from xlr8 import accelerate, Schema, Types                                 │
│                                                                            │
│ schema = Schema(                                                           │
│     time_field="timestamp",                                                │
│     fields={                                                               │
│         "timestamp": Types.Timestamp("ms", tz="UTC"),                      │
│         "metadata.device_id": Types.ObjectId(),                            │
│         "metadata.sensor_id": Types.ObjectId(),                            │
│         "value": Types.Any(),  # Polymorphic - can be int, float, str etc. │
│     }                                                                      │
│ )                                                                          │
│                                                                            │
│ xlr8_col = accelerate(pymongo_collection, schema, mongo_uri)               │
└────────────────────────────────────────────────────────────────────────────┘

2. USER CALLS find() - RETURNS XLR8Cursor (NOT PYMONGO CURSOR):
┌────────────────────────────────────────────────────────────────────────────┐
│ cursor = xlr8_col.find({                                                   │
│     "timestamp": {"$gte": start, "$lt": end},                              │
│     "metadata.device_id": ObjectId("64a..."),                              │
│ })                                                                         │
│ # cursor is XLR8Cursor, wrapping the query params                          │
└────────────────────────────────────────────────────────────────────────────┘

3. USER CALLS to_dataframe() - TRIGGERS ACCELERATION:
┌────────────────────────────────────────────────────────────────────────────┐
│ df = cursor.to_dataframe()                                                 │
│ # This triggers:                                                           │
│ #   1. Query analysis (can we chunk by time?)                              │
│ #   2. Check cache (have we fetched this before?)                          │
│ #   3. Parallel fetch via Rust async backend                               │
│ #   4. Stream to Parquet cache                                             │
│ #   5. Read back and return DataFrame                                      │
└────────────────────────────────────────────────────────────────────────────┘

KEY CONFIG OPTIONS:
────────────────────────────────────────────────────────────────────────────────
- schema: Required for type-aware encoding (especially Types.Any)
- mongo_uri: Required for accelerated execution (workers create connections)
- cache_dir: Where to store Parquet cache (default: .xlr8_cache)

PER-QUERY OPTIONS (via to_dataframe):
────────────────────────────────────────────────────────────────────────────────
- max_workers: Number of parallel workers (default: 4)
- flush_ram_limit_mb: RAM budget for batch sizing (default: 512)
- chunking_granularity: Time chunk size (e.g., timedelta(days=7))

================================================================================
"""

from typing import Any, Callable, Dict, List, Optional, Union

from pymongo.collection import Collection as PyMongoCollection

from xlr8.collection.cursor import XLR8Cursor
from xlr8.schema import Schema


class XLR8Collection:
    """
    PyMongo-compatible collection wrapper with acceleration.

    Drop-in replacement for pymongo.collection.Collection that transparently
    accelerates analytical queries through parallel execution and caching.

    All write operations (insert, update, delete) pass through to PyMongo.
    Read operations (find, aggregate) can be accelerated if:
    - Schema is provided
    - Query has time-range predicates
    - Query doesn't use complex operators ($nor, $where, etc.)

    Example:
        >>> import pymongo
        >>> from xlr8 import XLR8Collection, Schema, Types
        >>>
        >>> # Create schema
        >>> schema = Schema(
        ...     time_field="timestamp",
        ...     fields={
        ...         "timestamp": Types.Timestamp(),
        ...         "value": Types.Float(),
        ...         "sensor_id": Types.String(),
        ...     }
        ... )
        >>>
        >>> # Wrap collection with mongo_uri for accelerated execution
        >>> client = pymongo.MongoClient("mongodb://localhost:27017")
        >>> pymongo_col = client.mydb.mycollection
        >>> col = XLR8Collection(pymongo_col, schema=schema, mongo_uri="mongodb://localhost:27017")
        >>>
        >>> # Use like regular PyMongo
        >>> cursor = col.find({"timestamp": {"$gte": start, "$lt": end}})
        >>> df = cursor.to_dataframe(flush_ram_limit_mb=2000)
    """

    def __init__(
        self,
        pymongo_collection,
        schema: Optional[Schema] = None,
        mongo_uri: Union[str, Callable[[], str], None] = None,
        approx_document_size_bytes: int = 500,
    ):
        """
        Initialize the XLR8 collection wrapper.

        Args:
            pymongo_collection: PyMongo Collection instance
            schema: Optional schema definition for acceleration
            mongo_uri: MongoDB connection string (str) or callable that returns one.
                Required for accelerated execution. Can be:
                - A string: "mongodb://localhost:27017"
                - A callable: lambda: os.environ["MONGODB_URI"]
            approx_document_size_bytes: Approximate size of each document in bytes
                (default: 500). Used for memory budget calculations.

        Note:
            Cache directory is auto-managed based on query hash.
            flush_ram_limit_mb and max_workers are parameters of to_dataframe(),
            to_polars(), etc. for per-query control.
        """
        self._pymongo_collection = pymongo_collection
        self._schema = schema
        self._mongo_uri = mongo_uri
        self._approx_document_size_bytes = approx_document_size_bytes

    def raw_collection(self) -> PyMongoCollection:
        """
        Get direct access to the underlying PyMongo collection.

        This is an escape hatch for power users who need direct access to PyMongo
        collection methods that may not be available through delegation.

        Returns:
            pymongo.collection.Collection: The underlying PyMongo collection

        Example:
            >>> xlr8_col = accelerate(collection, schema=schema)
            >>> xlr8_col.raw_collection().watch()  # Use MongoDB change streams
            >>> xlr8_col.raw_collection().list_indexes()  # Direct PyMongo access
        """
        return self._pymongo_collection

    # PyMongo pass-through properties
    @property
    def name(self) -> str:
        """Collection name."""
        return self._pymongo_collection.name

    @property
    def full_name(self) -> str:
        """Full collection name (database.collection)."""
        return self._pymongo_collection.full_name

    @property
    def database(self):
        """Parent database."""
        return self._pymongo_collection.database

    # Public accessor properties for cursor usage

    @property
    def schema(self):
        """Schema definition for acceleration."""
        return self._schema

    @property
    def pymongo_collection(self):
        """Underlying PyMongo collection instance."""
        return self._pymongo_collection

    @property
    def mongo_uri(self):
        """MongoDB connection URI for accelerated execution."""
        return self._mongo_uri

    @property
    def approx_document_size_bytes(self) -> int:
        """Approximate size of each document in bytes."""
        return self._approx_document_size_bytes

    def __getattr__(self, name: str):
        """
        Delegate unknown methods to the PyMongo collection.

        Why:
            Provides full PyMongo compatibility without manually implementing
            every collection method (insert, update, delete, indexes, etc.).

        Example:
            >>> xlr8_col.insert_one({...})  # Works via delegation
            >>> xlr8_col.create_index("timestamp")  # Works via delegation
            >>> count = xlr8_col.count_documents({})  # Works via delegation
        """
        return getattr(self._pymongo_collection, name)

    # Read operations (can be accelerated)
    def find(
        self,
        filter: Optional[Dict[str, Any]] = None,
        projection: Optional[Dict[str, Any]] = None,
        skip: int = 0,
        limit: int = 0,
        sort: Optional[List[tuple]] = None,
        batch_size: int = 1000,
        **kwargs,
    ) -> XLR8Cursor:
        """
        Query the collection with optional acceleration.

        Returns an XLR8Cursor, which is PyMongo-compatible but can accelerate
        to_dataframe() / to_polars() conversions.

        DATA FLOW EXAMPLE:

        INPUT (filter parameter):
            {
                "$or": [
                    {"metadata.sensor_id": ObjectId("64a...")},
                    {"metadata.sensor_id": ObjectId("64b...")},
                ],
                "timestamp": {"$gte": datetime(2024, 1, 1), "$lt": datetime(...)}
            }

        OUTPUT: XLR8Cursor object containing:
            - _filter: The query dict (unchanged)
            - _collection: Reference back to this XLR8Collection
            - _projection, _skip, _limit, _sort: Query modifiers

        NEXT STEP: User calls cursor.to_dataframe() which triggers:
            1. Query analysis in analysis/brackets.py
            2. Execution planning in execution/planner.py
            3. Parallel fetch in execution/executor.py

        Args:
            filter: Query filter dict
            projection: Field projection dict
            skip: Number of documents to skip
            limit: Maximum documents to return
            sort: Sort specification
            batch_size: Batch size for iteration
            **kwargs: Additional PyMongo cursor options

        Returns:
            XLR8Cursor instance

        Example:
            >>> # Simple query
            >>> cursor = col.find({"status": "active"})
            >>>
            >>> # Query with time range (accelerated)
            >>> cursor = col.find({
            ...     "timestamp": {"$gte": start, "$lt": end},
            ...     "sensor_id": "sensor_1"
            ... })
            >>> df = cursor.to_dataframe()
        """
        if filter is None:
            filter = {}

        return XLR8Cursor(
            collection=self,
            query_filter=filter,
            projection=projection,
            skip=skip,
            limit=limit,
            sort=sort,
            batch_size=batch_size,
        )

    # XLR8-specific methods

    def set_schema(self, schema: Schema) -> None:
        """
        Set or update the schema used for acceleration.

        Args:
            schema: Schema definition
        """
        self._schema = schema

    def get_schema(self) -> Optional[Schema]:
        """
        Get the current schema.

        Returns:
            Schema or None
        """
        return self._schema


def accelerate(
    pymongo_collection: PyMongoCollection,
    schema: Schema,
    mongo_uri: Union[str, Callable[[], str]],
    approx_document_size_bytes: int = 500,
) -> XLR8Collection:
    """
    Convenience function to wrap a PyMongo collection with acceleration.

    DATA FLOW EXAMPLE - MAIN ENTRY POINT:

    INPUT:
        - pymongo_collection: client["main"]["sensorData"]
        - schema: Schema(time_field="timestamp", fields={...})
        - mongo_uri: Connection string used by accelerated workers

    Example:
        accelerate(
            collection,
            schema,
            mongo_uri="mongodb://localhost:27017",  # Or callable
        )

    OUTPUT: XLR8Collection wrapper that:
        - Wraps the pymongo collection for transparent pass-through
        - Stores schema for type-aware Parquet encoding
        - Stores mongo_uri for workers to create their own connections

    WHAT HAPPENS NEXT:
        1. User calls: xlr8_col.find({...})
        2. Returns XLR8Cursor (wraps query params)
        3. User calls: cursor.to_dataframe()
        4. Workers use mongo_uri to create their own connections

    Args:
        pymongo_collection: PyMongo Collection instance
        schema: Schema definition
        mongo_uri: MongoDB connection string (str) or callable that returns one.
            Required for accelerated execution. Can be:
            - A string: "mongodb://localhost:27017"
            - A callable: lambda: os.environ["MONGODB_URI"]
        approx_document_size_bytes: Approximate size of each document in bytes
            (default: 500). Used for memory budget calculations.

    Returns:
        XLR8Collection wrapper

    Note:
        Cache directory is auto-managed based on query hash.
        flush_ram_limit_mb and max_workers are parameters of to_dataframe(),
        to_polars(), etc. for per-query control.

    Example:
        >>> import pymongo
        >>> from xlr8 import accelerate, Schema, Types
        >>>
        >>> # Connection string or callable
        >>> MONGO_URI = "mongodb://localhost:27017"
        >>> # OR: get_uri = lambda: os.environ["MONGODB_URI"]
        >>>
        >>> client = pymongo.MongoClient(MONGO_URI)
        >>> col = client.mydb.sensor_logs
        >>>
        >>> schema = Schema(
        ...     time_field="timestamp",
        ...     fields={
        ...         "timestamp": Types.Timestamp(),
        ...         "sensor_id": Types.String(),
        ...         "value": Types.Float(),
        ...     },
        ... )
        >>>
        >>> # Pass mongo_uri for accelerated workers
        >>> accelerated_col = accelerate(col, schema, mongo_uri=MONGO_URI)
        >>>
        >>> # max_workers and flush_ram_limit_mb are per-query
        >>> from datetime import timedelta
        >>> df = accelerated_col.find({
        ...     "timestamp": {"$gte": start, "$lt": end}
        ... }).to_dataframe(
        ...     max_workers=8,
        ...     chunking_granularity=timedelta(days=1),
        ...     flush_ram_limit_mb=2000,
        ... )
    """
    return XLR8Collection(
        pymongo_collection=pymongo_collection,
        schema=schema,
        mongo_uri=mongo_uri,
        approx_document_size_bytes=approx_document_size_bytes,
    )
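One detail worth noting from the docstrings above: accelerated workers create their own connections from `mongo_uri`, so passing a callable rather than a string lets the URI be resolved at connection time. A minimal sketch expanding the docstring's `lambda: os.environ["MONGODB_URI"]` example:

import os

import pymongo

from xlr8 import accelerate, Schema, Types

def get_uri() -> str:
    # Presumably invoked when a worker opens its connection, so an
    # updated environment value is picked up without re-wrapping.
    return os.environ["MONGODB_URI"]

schema = Schema(
    time_field="timestamp",
    fields={"timestamp": Types.Timestamp(), "value": Types.Float()},
)

client = pymongo.MongoClient(get_uri())
xlr8_col = accelerate(client.mydb.sensor_logs, schema, mongo_uri=get_uri)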
xlr8/collection/wrapper.pyi
ADDED

@@ -0,0 +1,61 @@
"""Type stubs for XLR8 Collection.

Inherits from PyMongoCollection for full IDE autocomplete.
Only overrides find() to return XLR8Cursor instead of PyMongo cursor.
"""

from typing import Any, Dict, List, Optional, Tuple

from pymongo.collection import Collection as PyMongoCollection

from .cursor import XLR8Cursor

class XLR8Collection(PyMongoCollection):
    """PyMongo-compatible collection with optional acceleration.

    Inherits all PyMongo collection methods. Only find() is overridden
    to return XLR8Cursor for accelerated DataFrame/Polars conversion.
    """

    def __init__(
        self,
        pymongo_collection: PyMongoCollection,
        schema: Optional[Any] = None,
        mongo_uri: Optional[str] = None,
        approx_document_size_bytes: int = 500,
    ) -> None: ...

    # Override find() to return XLR8Cursor
    def find(
        self,
        filter: Optional[Dict[str, Any]] = None,
        projection: Optional[Dict[str, Any]] = None,
        skip: int = 0,
        limit: int = 0,
        sort: Optional[List[Tuple[str, int]]] = None,
        batch_size: int = 1000,
        **kwargs: Any,
    ) -> XLR8Cursor: ...

    # XLR8-specific methods
    def raw_collection(self) -> PyMongoCollection: ...
    def set_schema(self, schema: Any) -> None: ...
    def get_schema(self) -> Optional[Any]: ...
    def clear_cache(self) -> None: ...

    # Properties
    @property
    def schema(self) -> Optional[Any]: ...
    @property
    def pymongo_collection(self) -> PyMongoCollection: ...
    @property
    def mongo_uri(self) -> Optional[str]: ...
    @property
    def approx_document_size_bytes(self) -> int: ...

def accelerate(
    pymongo_collection: PyMongoCollection,
    schema: Any,
    mongo_uri: Any,
    approx_document_size_bytes: int = 500,
) -> XLR8Collection: ...
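Note the deliberate mismatch with the runtime module: `wrapper.py` wraps the PyMongo collection and delegates via `__getattr__`, while this stub declares inheritance so type checkers and IDEs see every PyMongo method plus the overridden `find()` returning `XLR8Cursor`. A small sketch of what that buys (`reveal_type` is a mypy-only construct):

from xlr8.collection.wrapper import XLR8Collection

def typed_flow(col: XLR8Collection) -> None:
    cursor = col.find({"status": "active"})
    reveal_type(cursor)  # mypy: XLR8Cursor
    df = cursor.to_dataframe()  # accepted: declared on XLR8Cursor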
xlr8/constants.py
ADDED
@@ -0,0 +1,24 @@
"""
XLR8 constants and configuration values.

Centralized constants to avoid magic numbers scattered throughout the codebase.
All tuneable performance parameters should be defined here.
"""

# =============================================================================
# PARQUET FILE SETTINGS
# =============================================================================

# Default row group size for compression; can be overridden via the argument
# passed to the special cursor methods, e.g. to_dataframe(row_group_size=...)
PARQUET_ROW_GROUP_SIZE = 100_000

# Default compression codec for Parquet files
DEFAULT_COMPRESSION = "zstd"

# =============================================================================
# BATCH PROCESSING
# =============================================================================

# Default batch size for DataFrame operations
DEFAULT_BATCH_SIZE = 10_000
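Both defaults map to per-query parameters on the cursor methods: `row_group_size=None` in `to_dataframe()` and `to_polars()` presumably falls back to `PARQUET_ROW_GROUP_SIZE`, and `to_dataframe_batches()` declares `batch_size: int = 10000`, matching `DEFAULT_BATCH_SIZE`. Overriding them per query:

# Per-query overrides; `cursor` is a placeholder XLR8Cursor from find().
df = cursor.to_dataframe(row_group_size=250_000)  # instead of PARQUET_ROW_GROUP_SIZE

for chunk in cursor.to_dataframe_batches(batch_size=25_000):  # instead of DEFAULT_BATCH_SIZE
    ...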
xlr8/execution/__init__.py
ADDED

@@ -0,0 +1,43 @@
"""
Execution engine for parallel query execution via the Rust backend.

All parallel execution now goes through the Rust backend for GIL-free performance.

Components:
- executor: High-level parallel execution (execute_parallel_stream_to_cache)
- callback: Partitioned streaming for data lake population
- planner: Memory-aware execution planning and worker configuration

Python handles:
- Query planning and bracketing
- Memory budget calculations
- Result reading and DataFrame construction

Rust backend handles:
- Parallel MongoDB fetches (GIL-free)
- BSON decoding and Arrow encoding
- Memory-aware buffering
- Parquet writing
"""

from .callback import PartitionWorkItem, execute_partitioned_callback
from .executor import execute_parallel_stream_to_cache
from .planner import (
    Backend,
    BackendConfig,
    ExecutionPlan,
    build_execution_plan,
)

__all__ = [
    # Executor
    "execute_parallel_stream_to_cache",
    # Callback
    "PartitionWorkItem",
    "execute_partitioned_callback",
    # Planner
    "Backend",
    "BackendConfig",
    "ExecutionPlan",
    "build_execution_plan",
]
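These re-exports are the package's public execution surface. The hunks for `planner.py`, `executor.py`, and `callback.py` are not shown in this section, so the sketch below demonstrates only the import surface; the commented calls are hypothetical, since their signatures are unknown from this diff:

from xlr8.execution import (
    Backend,
    BackendConfig,
    ExecutionPlan,
    build_execution_plan,
    execute_parallel_stream_to_cache,
)

# Hypothetical division of labor per the module docstring: Python builds the
# memory-aware plan, the Rust backend does the GIL-free fetch/encode/write.
# plan: ExecutionPlan = build_execution_plan(...)   # signature not shown here
# execute_parallel_stream_to_cache(plan, ...)       # signature not shown here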