xlr8 0.1.7b3__cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlr8/__init__.py +113 -0
- xlr8/_xlr8_rust.cpython-311-aarch64-linux-gnu.so +0 -0
- xlr8/_xlr8_rust.pyi +71 -0
- xlr8/analysis/__init__.py +58 -0
- xlr8/analysis/brackets.py +1201 -0
- xlr8/analysis/chunker.py +118 -0
- xlr8/analysis/inspector.py +1889 -0
- xlr8/collection/__init__.py +6 -0
- xlr8/collection/cursor.py +2161 -0
- xlr8/collection/cursor.pyi +179 -0
- xlr8/collection/wrapper.py +400 -0
- xlr8/collection/wrapper.pyi +420 -0
- xlr8/constants.py +24 -0
- xlr8/execution/__init__.py +43 -0
- xlr8/execution/callback.py +792 -0
- xlr8/execution/executor.py +500 -0
- xlr8/execution/planner.py +377 -0
- xlr8/py.typed +1 -0
- xlr8/rust_backend.py +40 -0
- xlr8/rust_backend.pyi +71 -0
- xlr8/schema/__init__.py +42 -0
- xlr8/schema/encoder.py +235 -0
- xlr8/schema/schema.py +265 -0
- xlr8/schema/types.py +239 -0
- xlr8/storage/__init__.py +17 -0
- xlr8/storage/cache.py +228 -0
- xlr8/storage/reader.py +1369 -0
- xlr8-0.1.7b3.dist-info/METADATA +176 -0
- xlr8-0.1.7b3.dist-info/RECORD +31 -0
- xlr8-0.1.7b3.dist-info/WHEEL +5 -0
- xlr8-0.1.7b3.dist-info/licenses/LICENSE +201 -0
xlr8/__init__.py
ADDED
@@ -0,0 +1,113 @@
+# XLR8: High-Performance MongoDB Acceleration Layer
+
+"""
+XLR8 - Accelerate MongoDB analytical queries with Parquet caching.
+
+XLR8 is a high-performance wrapper for PyMongo that:
+- Decomposes queries into brackets for parallel execution
+- Executes parallel async MongoDB fetches with Rust backend
+- Caches results in compressed Parquet files
+- Reconstructs pandas/Polars DataFrames efficiently using Rust backend.
+
+Quick Start:
+    ```python
+    from pymongo import MongoClient
+    from xlr8 import accelerate, Schema, Types
+
+    # Define your schema with time field and typed fields
+    schema = Schema(
+        time_field="recordedAt",
+        fields={
+            "recordedAt": Types.Timestamp("ms", tz="UTC"),
+            "metadata.device_id": Types.ObjectId(),
+            "metadata.sensor_id": Types.ObjectId(),
+            "value": Types.Any(),  # Polymorphic - int, float, str, etc.
+        },
+        avg_doc_size_bytes=250,
+    )
+
+    # Connect to MongoDB and get collection
+    client = MongoClient("mongodb://localhost:27017")
+    collection = client["main"]["sensorData"]
+
+    # Wrap collection with schema for acceleration
+    xlr8_collection = accelerate(
+        collection,
+        schema=schema,
+        mongo_uri="mongodb://localhost:27017"
+    )
+
+    # Use like normal PyMongo - find() returns XLR8Cursor
+    cursor = xlr8_collection.find({
+        "recordedAt": {"$gte": start_date, "$lt": end_date}
+    }).sort("recordedAt", 1)
+
+    # Accelerated DataFrame construction
+    df = cursor.to_dataframe()
+
+    # Clean up
+    client.close()
+    ```
+"""
+
+from __future__ import annotations
+
+from importlib.metadata import version as _get_version
+from typing import TYPE_CHECKING
+
+from .schema import types as Types
+from .schema.schema import Schema
+
+try:
+    __version__ = _get_version("xlr8")
+except Exception:
+    __version__ = "0.0.0.dev"  # Fallback for editable installs without metadata
+
+# Lazy loader for rust_backend to avoid import errors when Rust isn't built yet
+# This allows `from xlr8 import rust_backend` to work without importing at module load
+_rust_backend_cached = None
+_collection_exports_cached: dict[str, object] | None = None
+
+
+if TYPE_CHECKING:
+    from .collection.cursor import XLR8Cursor as XLR8Cursor
+    from .collection.wrapper import XLR8Collection as XLR8Collection
+    from .collection.wrapper import accelerate as accelerate
+
+
+def __getattr__(name: str):
+    global _rust_backend_cached
+    global _collection_exports_cached
+    if name == "rust_backend":
+        if _rust_backend_cached is None:
+            # Import the module directly to avoid recursion
+            import importlib
+
+            _rust_backend_cached = importlib.import_module(
+                ".rust_backend", package="xlr8"
+            )
+        return _rust_backend_cached
+
+    if name in {"XLR8Cursor", "XLR8Collection", "accelerate"}:
+        if _collection_exports_cached is None:
+            from .collection.cursor import XLR8Cursor
+            from .collection.wrapper import XLR8Collection, accelerate
+
+            _collection_exports_cached = {
+                "XLR8Cursor": XLR8Cursor,
+                "XLR8Collection": XLR8Collection,
+                "accelerate": accelerate,
+            }
+        return _collection_exports_cached[name]
+
+    raise AttributeError(f"module 'xlr8' has no attribute '{name}'")
+
+
+__all__ = [
+    "Schema",
+    "Types",
+    "rust_backend",
+    "accelerate",
+    "XLR8Collection",
+    "XLR8Cursor",
+]
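The `__getattr__` hook above is the PEP 562 module-level attribute pattern: the Rust-backed submodules are imported only on first access and cached for later lookups, so `import xlr8` stays cheap even before the compiled extension is built. A minimal, self-contained sketch of the same pattern (the `lazy_pkg` package and its `heavy` submodule are hypothetical, not part of xlr8):

```python
# lazy_pkg/__init__.py - PEP 562 lazy-import sketch (hypothetical package)
import importlib
from typing import Any

_cache: dict[str, Any] = {}

def __getattr__(name: str) -> Any:
    # Only the first access pays the import cost; later accesses hit the cache.
    if name == "heavy":
        if name not in _cache:
            _cache[name] = importlib.import_module(".heavy", package=__name__)
        return _cache[name]
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

Note that `from lazy_pkg import heavy` also works: the import machinery falls back to the module's `__getattr__` when the name is not found as a regular attribute, which is what makes `from xlr8 import rust_backend` safe at module load.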
xlr8/_xlr8_rust.cpython-311-aarch64-linux-gnu.so
ADDED
Binary file
xlr8/_xlr8_rust.pyi
ADDED
@@ -0,0 +1,71 @@
+"""Type stubs for _xlr8_rust native module.
+
+This is the compiled Rust extension. Type hints enable intellisense.
+"""
+
+from typing import Any, Dict, List
+
+def fetch_chunks_bson(
+    mongodb_uri: str,
+    db_name: str,
+    collection_name: str,
+    chunks_bson: bytes,
+    schema_json: str,
+    cache_dir: str,
+    num_workers: int,
+    batch_size: int,
+    flush_trigger_mb: int,
+    avg_doc_size_bytes: int,
+    sort_spec_json: str,
+    time_field: str,
+    projection_json: str,
+    row_group_size: int | None = None,
+) -> Dict[str, Any]:
+    """Fetch MongoDB documents in parallel chunks and write to Parquet.
+
+    Args:
+        mongodb_uri: MongoDB connection string
+        db_name: Database name
+        collection_name: Collection name
+        chunks_bson: BSON-encoded chunk definitions
+        schema_json: JSON string describing Arrow schema
+        cache_dir: Directory where Parquet files will be written
+        num_workers: Number of parallel workers
+        batch_size: Documents per MongoDB batch
+        flush_trigger_mb: Memory threshold for flushing to disk (MB)
+        avg_doc_size_bytes: Average document size for memory estimation
+        sort_spec_json: JSON sort specification
+        time_field: Field name containing timestamps
+        projection_json: MongoDB projection as JSON
+        row_group_size: Parquet row group size (None = use Arrow default)
+
+    Returns:
+        Dictionary with total_docs, total_files, duration_secs
+    """
+    ...
+
+def decode_any_struct_arrow(
+    arrow_array: Any,  # pyarrow.StructArray
+) -> List[Any]:
+    """Decode PyArrow StructArray (Any type) to Python values.
+
+    Args:
+        arrow_array: PyArrow StructArray with 13-field Any encoding
+
+    Returns:
+        List of decoded Python values
+    """
+    ...
+
+def encode_any_values_to_arrow(
+    values: List[Any],
+) -> Any:  # pyarrow.StructArray
+    """Encode Python values to PyArrow StructArray (Any type).
+
+    Args:
+        values: List of Python values to encode
+
+    Returns:
+        PyArrow StructArray with 13-field Any encoding
+    """
+    ...
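The two `Any`-codec stubs imply a simple round trip through Arrow. A hedged sketch of how a caller might exercise them, assuming the wheel is installed and that `xlr8.rust_backend` re-exports the extension's functions (its stub, `rust_backend.pyi`, mirrors `_xlr8_rust.pyi` in this diff); the exact 13-field struct layout is internal to the extension:

```python
# Round-trip polymorphic ("Any"-typed) values through the Rust codec.
# Assumes an installed xlr8 wheel with its compiled extension; the value
# list is illustrative, not exhaustive of the 13 encoded variants.
from xlr8 import rust_backend

values = [42, 3.14, "sensor-7", True, None]

struct_array = rust_backend.encode_any_values_to_arrow(values)  # pyarrow.StructArray
decoded = rust_backend.decode_any_struct_arrow(struct_array)    # list of Python values

# A faithful codec should round-trip these values unchanged.
assert decoded == values
```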
xlr8/analysis/__init__.py
ADDED
@@ -0,0 +1,58 @@
+"""
+Query analysis and execution planning.
+
+This module provides utilities for analyzing MongoDB queries and
+creating optimal execution plans for parallel processing.
+"""
+
+from .brackets import (
+    build_brackets_for_find,
+)
+from .chunker import (
+    chunk_time_range,
+)
+from .inspector import (
+    # Operator classification sets
+    ALWAYS_ALLOWED,
+    CONDITIONAL,
+    NEVER_ALLOWED,
+    # Validation
+    ValidationResult,
+    check_conditional_operators,
+    extract_time_bounds_recursive,
+    generate_sort_sql,
+    get_sort_field_info,
+    has_forbidden_ops,
+    is_chunkable_query,
+    normalize_datetime,
+    # Query analysis
+    or_depth,
+    split_global_and,
+    validate_query_for_chunking,
+    validate_sort_field,
+)
+
+__all__ = [
+    # inspector - operator sets
+    "ALWAYS_ALLOWED",
+    "CONDITIONAL",
+    "NEVER_ALLOWED",
+    # inspector - validation
+    "ValidationResult",
+    "has_forbidden_ops",
+    "validate_query_for_chunking",
+    "validate_sort_field",
+    "get_sort_field_info",
+    "generate_sort_sql",
+    "check_conditional_operators",
+    # inspector - analysis
+    "or_depth",
+    "split_global_and",
+    "normalize_datetime",
+    "extract_time_bounds_recursive",
+    "is_chunkable_query",
+    # brackets
+    "build_brackets_for_find",
+    # chunker
+    "chunk_time_range",
+]
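The exports above pair query validation (`is_chunkable_query`, `validate_query_for_chunking`) with time-range chunking (`chunk_time_range`), which is what lets a single bounded `find()` fan out across parallel workers. This diff does not show those functions' signatures, so the following is only a conceptual sketch of time-range chunking with hypothetical names, not xlr8's actual API:

```python
from datetime import datetime

def split_time_range(start: datetime, end: datetime, n: int) -> list[tuple[datetime, datetime]]:
    """Split [start, end) into n contiguous sub-ranges (conceptual sketch only)."""
    step = (end - start) / n
    # n interior edges plus the exact end bound, so no drift at the boundary
    edges = [start + step * i for i in range(n)] + [end]
    return list(zip(edges[:-1], edges[1:]))

# Each (lo, hi) pair can back a filter like {"recordedAt": {"$gte": lo, "$lt": hi}}
# handed to a separate worker - the idea behind chunk_time_range and the brackets.
chunks = split_time_range(datetime(2024, 1, 1), datetime(2024, 2, 1), 6)
```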