xlr8 0.1.7b3__cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlr8/__init__.py +113 -0
- xlr8/_xlr8_rust.cpython-311-x86_64-linux-gnu.so +0 -0
- xlr8/_xlr8_rust.pyi +71 -0
- xlr8/analysis/__init__.py +58 -0
- xlr8/analysis/brackets.py +1201 -0
- xlr8/analysis/chunker.py +118 -0
- xlr8/analysis/inspector.py +1889 -0
- xlr8/collection/__init__.py +6 -0
- xlr8/collection/cursor.py +2161 -0
- xlr8/collection/cursor.pyi +179 -0
- xlr8/collection/wrapper.py +400 -0
- xlr8/collection/wrapper.pyi +420 -0
- xlr8/constants.py +24 -0
- xlr8/execution/__init__.py +43 -0
- xlr8/execution/callback.py +792 -0
- xlr8/execution/executor.py +500 -0
- xlr8/execution/planner.py +377 -0
- xlr8/py.typed +1 -0
- xlr8/rust_backend.py +40 -0
- xlr8/rust_backend.pyi +71 -0
- xlr8/schema/__init__.py +42 -0
- xlr8/schema/encoder.py +235 -0
- xlr8/schema/schema.py +265 -0
- xlr8/schema/types.py +239 -0
- xlr8/storage/__init__.py +17 -0
- xlr8/storage/cache.py +228 -0
- xlr8/storage/reader.py +1369 -0
- xlr8-0.1.7b3.dist-info/METADATA +176 -0
- xlr8-0.1.7b3.dist-info/RECORD +31 -0
- xlr8-0.1.7b3.dist-info/WHEEL +5 -0
- xlr8-0.1.7b3.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Type stubs for XLR8 Cursor.
|
|
2
|
+
|
|
3
|
+
Provides IDE autocomplete for all PyMongo cursor methods plus XLR8 extensions.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from datetime import date, datetime, timedelta
|
|
9
|
+
from typing import (
|
|
10
|
+
Any,
|
|
11
|
+
Callable,
|
|
12
|
+
Dict,
|
|
13
|
+
Generator,
|
|
14
|
+
Generic,
|
|
15
|
+
List,
|
|
16
|
+
Literal,
|
|
17
|
+
Mapping,
|
|
18
|
+
Optional,
|
|
19
|
+
Tuple,
|
|
20
|
+
TypeVar,
|
|
21
|
+
Union,
|
|
22
|
+
overload,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
import pandas as pd
|
|
26
|
+
import polars as pl
|
|
27
|
+
import pyarrow as pa
|
|
28
|
+
from bson.code import Code
|
|
29
|
+
from pymongo.cursor_shared import _Hint, _Sort
|
|
30
|
+
from pymongo.synchronous.client_session import ClientSession
|
|
31
|
+
from pymongo.synchronous.collection import Collection
|
|
32
|
+
from pymongo.synchronous.cursor import Cursor as PyMongoCursor
|
|
33
|
+
from pymongo.typings import _CollationIn
|
|
34
|
+
|
|
35
|
+
_DocumentType = TypeVar("_DocumentType", bound=Mapping[str, Any])
|
|
36
|
+
|
|
37
|
+
class XLR8Cursor(Generic[_DocumentType]):
    """PyMongo-compatible cursor with optional acceleration.

    All PyMongo cursor methods work via delegation.
    Adds 4 XLR8-specific methods for DataFrame conversion.
    """

    # Stub file: signatures only — the runtime implementation lives in
    # cursor.py.  Everything above the "XLR8-specific" marker mirrors the
    # pymongo.cursor.Cursor API for IDE autocomplete.
    def __init__(
        self,
        collection: Any,  # XLR8Collection (Any to avoid a circular import)
        query_filter: Dict[str, Any],
        projection: Optional[Dict[str, Any]] = None,
        skip: int = 0,
        limit: int = 0,
        sort: Optional[List[Tuple[str, int]]] = None,
        batch_size: int = 1000,
        # PyMongo compatibility parameters (passed through to PyMongo cursor)
        no_cursor_timeout: bool = False,
        cursor_type: int = ...,
        allow_partial_results: bool = False,
        oplog_replay: bool = False,
        collation: Optional[Dict[str, Any]] = None,
        hint: Optional[Any] = None,
        max_scan: Optional[int] = None,
        max_time_ms: Optional[int] = None,
        max: Optional[List[Tuple[str, Any]]] = None,
        min: Optional[List[Tuple[str, Any]]] = None,
        return_key: Optional[bool] = None,
        show_record_id: Optional[bool] = None,
        snapshot: Optional[bool] = None,
        comment: Optional[Any] = None,
        session: Optional[Any] = None,
        allow_disk_use: Optional[bool] = None,
        let: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None: ...
    # --- read-only properties mirroring pymongo.cursor.Cursor ---
    @property
    def collection(self) -> Collection[_DocumentType]: ...
    @property
    def retrieved(self) -> int: ...
    # --- chainable query modifiers (each returns the same cursor type) ---
    def clone(self) -> XLR8Cursor[_DocumentType]: ...
    def add_option(self, mask: int) -> XLR8Cursor[_DocumentType]: ...
    def remove_option(self, mask: int) -> XLR8Cursor[_DocumentType]: ...
    def allow_disk_use(self, allow_disk_use: bool) -> XLR8Cursor[_DocumentType]: ...
    def limit(self, limit: int) -> XLR8Cursor[_DocumentType]: ...
    def batch_size(self, batch_size: int) -> XLR8Cursor[_DocumentType]: ...
    def skip(self, skip: int) -> XLR8Cursor[_DocumentType]: ...
    def max_time_ms(self, max_time_ms: int | None) -> XLR8Cursor[_DocumentType]: ...
    def max_await_time_ms(
        self, max_await_time_ms: int | None
    ) -> XLR8Cursor[_DocumentType]: ...
    # int index -> single document; slice -> narrowed cursor (PyMongo semantics)
    @overload
    def __getitem__(self, index: int) -> _DocumentType: ...
    @overload
    def __getitem__(self, index: slice) -> XLR8Cursor[_DocumentType]: ...
    def max_scan(self, max_scan: int | None) -> XLR8Cursor[_DocumentType]: ...
    def max(self, spec: _Sort) -> XLR8Cursor[_DocumentType]: ...
    def min(self, spec: _Sort) -> XLR8Cursor[_DocumentType]: ...
    def sort(
        self, key_or_list: _Hint, direction: int | str | None = None
    ) -> XLR8Cursor[_DocumentType]: ...
    def explain(self) -> _DocumentType: ...
    def hint(self, index: _Hint | None) -> XLR8Cursor[_DocumentType]: ...
    def comment(self, comment: Any) -> XLR8Cursor[_DocumentType]: ...
    def where(self, code: str | Code) -> XLR8Cursor[_DocumentType]: ...
    def collation(
        self, collation: _CollationIn | None
    ) -> XLR8Cursor[_DocumentType]: ...
    # --- cursor state / lifecycle ---
    @property
    def alive(self) -> bool: ...
    @property
    def cursor_id(self) -> int | None: ...
    @property
    def address(self) -> tuple[str, Any] | None: ...
    @property
    def session(self) -> ClientSession | None: ...
    def close(self) -> None: ...
    def distinct(self, key: str) -> list[Any]: ...
    def rewind(self) -> XLR8Cursor[_DocumentType]: ...
    # --- iteration protocol ---
    def next(self) -> _DocumentType: ...
    def __next__(self) -> _DocumentType: ...
    def __iter__(self) -> XLR8Cursor[_DocumentType]: ...
    def __enter__(self) -> XLR8Cursor[_DocumentType]: ...
    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ...
    def to_list(self, length: int | None = None) -> list[_DocumentType]: ...

    # XLR8-specific accelerated methods
    # NOTE(review): coerce is typed Literal["raise", "error"] in all three
    # conversion methods below — "error" looks like it may be intended as
    # "coerce"; confirm against the runtime implementation in cursor.py.
    def to_dataframe(
        self,
        accelerate: bool = True,
        cache_read: bool = True,
        cache_write: bool = True,
        start_date: Optional[Union[datetime, date, str]] = None,
        end_date: Optional[Union[datetime, date, str]] = None,
        coerce: Literal["raise", "error"] = "raise",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        flush_ram_limit_mb: int = 512,
    ) -> pd.DataFrame: ...
    def to_polars(
        self,
        accelerate: bool = True,
        cache_read: bool = True,
        cache_write: bool = True,
        start_date: Optional[Union[datetime, date, str]] = None,
        end_date: Optional[Union[datetime, date, str]] = None,
        coerce: Literal["raise", "error"] = "raise",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
        flush_ram_limit_mb: int = 512,
    ) -> pl.DataFrame: ...
    # Streaming variant: yields pandas DataFrames of up to batch_size rows.
    def to_dataframe_batches(
        self,
        batch_size: int = 10000,
        cache_read: bool = True,
        cache_write: bool = True,
        start_date: Optional[Union[datetime, date, str]] = None,
        end_date: Optional[Union[datetime, date, str]] = None,
        coerce: Literal["raise", "error"] = "raise",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        flush_ram_limit_mb: int = 512,
    ) -> Generator[pd.DataFrame, None, None]: ...
    # Push-based variant: invokes callback with (arrow_table, partition_info)
    # per time/value partition; returns a summary dict.
    def stream_to_callback(
        self,
        callback: Callable[[pa.Table, Dict[str, Any]], None],
        *,
        partition_time_delta: timedelta,
        partition_by: Optional[Union[str, List[str]]] = None,
        any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
        max_workers: int = 4,
        chunking_granularity: Optional[timedelta] = None,
        row_group_size: Optional[int] = None,
        flush_ram_limit_mb: int = 512,
        cache_read: bool = True,
        cache_write: bool = True,
    ) -> Dict[str, Any]: ...
    # Escape hatch: the underlying plain PyMongo cursor.
    def raw_cursor(self) -> PyMongoCursor[_DocumentType]: ...
    # Diagnostic: explains whether/why this query can be accelerated.
    def explain_acceleration(self) -> Dict[str, Any]: ...
|
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
"""
|
|
2
|
+
XLR8 collection wrapper with PyMongo compatibility.
|
|
3
|
+
|
|
4
|
+
================================================================================
|
|
5
|
+
DATA FLOW - COLLECTION WRAPPER
|
|
6
|
+
================================================================================
|
|
7
|
+
|
|
8
|
+
This module wraps pymongo.collection.Collection to provide the `accelerate()`
|
|
9
|
+
function - the main entry point for users.
|
|
10
|
+
|
|
11
|
+
TYPICAL USAGE FLOW:
|
|
12
|
+
────────────────────────────────────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
1. USER WRAPS A COLLECTION:
|
|
15
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
16
|
+
│ from xlr8 import accelerate, Schema, Types │
|
|
17
|
+
│ │
|
|
18
|
+
│ schema = Schema( │
|
|
19
|
+
│ time_field="timestamp", │
|
|
20
|
+
│ fields={ │
|
|
21
|
+
│ "timestamp": Types.Timestamp("ms", tz="UTC"), │
|
|
22
|
+
│ "metadata.device_id": Types.ObjectId(), │
|
|
23
|
+
│ "metadata.sensor_id": Types.ObjectId(), │
|
|
24
|
+
│ "value": Types.Any(), # Polymorphic - can be int, float, str etc..│
|
|
25
|
+
│ } │
|
|
26
|
+
│ ) │
|
|
27
|
+
│ │
|
|
28
|
+
│ xlr8_col = accelerate(pymongo_collection, schema, mongo_uri) │
|
|
29
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
30
|
+
|
|
31
|
+
2. USER CALLS find() - RETURNS XLR8Cursor (NOT PYMONGO CURSOR):
|
|
32
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
33
|
+
│ cursor = xlr8_col.find({ │
|
|
34
|
+
│ "timestamp": {"$gte": start, "$lt": end}, │
|
|
35
|
+
│ "metadata.device_id": ObjectId("64a..."), │
|
|
36
|
+
│ }) │
|
|
37
|
+
│ # cursor is XLR8Cursor, wrapping the query params │
|
|
38
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
39
|
+
|
|
40
|
+
3. USER CALLS to_dataframe() - TRIGGERS ACCELERATION:
|
|
41
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
42
|
+
│ df = cursor.to_dataframe() │
|
|
43
|
+
│ # This triggers: │
|
|
44
|
+
│ # 1. Query analysis (can we chunk by time?) │
|
|
45
|
+
│ # 2. Check cache (have we fetched this before?) │
|
|
46
|
+
│ # 3. Parallel fetch via Rust async backend │
|
|
47
|
+
│ # 4. Stream to Parquet cache │
|
|
48
|
+
│ # 5. Read back and return DataFrame │
|
|
49
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
50
|
+
|
|
51
|
+
KEY CONFIG OPTIONS:
|
|
52
|
+
────────────────────────────────────────────────────────────────────────────────
|
|
53
|
+
- schema: Required for type-aware encoding (especially Types.Any)
|
|
54
|
+
- mongo_uri: Required for accelerated execution (workers create connections)
|
|
55
|
+
- cache_dir: Where to store Parquet cache (default: .xlr8_cache)
|
|
56
|
+
|
|
57
|
+
PER-QUERY OPTIONS (via to_dataframe):
|
|
58
|
+
────────────────────────────────────────────────────────────────────────────────
|
|
59
|
+
- max_workers: Number of parallel workers (default: 4)
|
|
60
|
+
- flush_ram_limit_mb: RAM budget for batch sizing (default: 512)
|
|
61
|
+
- chunking_granularity: Time chunk size (e.g., timedelta(days=7))
|
|
62
|
+
|
|
63
|
+
================================================================================
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
|
67
|
+
|
|
68
|
+
from pymongo.collection import Collection as PyMongoCollection
|
|
69
|
+
|
|
70
|
+
from xlr8.collection.cursor import XLR8Cursor
|
|
71
|
+
from xlr8.schema import Schema
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class XLR8Collection:
    """
    PyMongo-compatible collection wrapper with acceleration.

    Drop-in replacement for pymongo.collection.Collection that transparently
    accelerates analytical queries through parallel execution and caching.

    All write operations (insert, update, delete) pass through to PyMongo.
    Read operations (find, aggregate) can be accelerated if:
    - Schema is provided
    - Query has time-range predicates
    - Query doesn't use complex operators ($nor, $where, etc.)

    Example:
        >>> import pymongo
        >>> from xlr8 import XLR8Collection, Schema, Types
        >>>
        >>> # Create schema
        >>> schema = Schema(
        ...     time_field="timestamp",
        ...     fields={
        ...         "timestamp": Types.Timestamp(),
        ...         "value": Types.Float(),
        ...         "sensor_id": Types.String(),
        ...     }
        ... )
        >>>
        >>> # Wrap collection with mongo_uri for accelerated execution
        >>> client = pymongo.MongoClient("mongodb://localhost:27017")
        >>> pymongo_col = client.mydb.mycollection
        >>> col = XLR8Collection(pymongo_col, schema=schema, mongo_uri="mongodb://localhost:27017")
        >>>
        >>> # Use like regular PyMongo
        >>> cursor = col.find({"timestamp": {"$gte": start, "$lt": end}})
        >>> df = cursor.to_dataframe(flush_ram_limit_mb=2000)
    """

    def __init__(
        self,
        pymongo_collection,
        schema: Optional[Schema] = None,
        mongo_uri: Union[str, Callable[[], str], None] = None,
        approx_document_size_bytes: int = 500,
    ):
        """
        Initialize XLR8 collection wrapper.

        Args:
            pymongo_collection: PyMongo Collection instance
            schema: Optional schema definition for acceleration
            mongo_uri: MongoDB connection string (str) or callable that returns one.
                Required for accelerated execution. Can be:
                - A string: "mongodb://localhost:27017"
                - A callable: lambda: os.environ["MONGODB_URI"]
            approx_document_size_bytes: Approximate size of each document in bytes
                (default: 500). Used for memory budget calculations.

        Note:
            Cache directory is auto-managed based on query hash.
            flush_ram_limit_mb and max_workers are parameters of to_dataframe(),
            to_polars(), etc. for per-query control.
        """
        self._pymongo_collection = pymongo_collection
        self._schema = schema
        self._mongo_uri = mongo_uri
        self._approx_document_size_bytes = approx_document_size_bytes

    def raw_collection(self) -> PyMongoCollection:
        """
        Get direct access to underlying PyMongo collection.

        This is an escape hatch for power users who need direct access to PyMongo
        collection methods that may not be available through delegation.

        Returns:
            pymongo.collection.Collection: The underlying PyMongo collection

        Example:
            >>> xlr8_col = accelerate(collection, schema=schema)
            >>> xlr8_col.raw_collection().watch()  # Use MongoDB change streams
            >>> xlr8_col.raw_collection().list_indexes()  # Direct PyMongo access
        """
        return self._pymongo_collection

    # PyMongo pass-through properties
    @property
    def name(self) -> str:
        """Collection name."""
        return self._pymongo_collection.name

    @property
    def full_name(self) -> str:
        """Full collection name (database.collection)."""
        return self._pymongo_collection.full_name

    @property
    def database(self):
        """Parent database."""
        return self._pymongo_collection.database

    # Public accessor properties for cursor usage

    @property
    def schema(self):
        """Schema definition for acceleration."""
        return self._schema

    @property
    def pymongo_collection(self):
        """Underlying PyMongo collection instance."""
        return self._pymongo_collection

    @property
    def mongo_uri(self):
        """MongoDB connection URI for accelerated execution."""
        return self._mongo_uri

    @property
    def approx_document_size_bytes(self) -> int:
        """Approximate size of each document in bytes."""
        return self._approx_document_size_bytes

    def __getattr__(self, name: str):
        """
        Delegate unknown methods to PyMongo collection.

        Why:
            Provides full PyMongo compatibility without manually implementing
            every collection method (insert, update, delete, indexes, etc.).

        Example:
            >>> xlr8_col.insert_one({...})  # Works via delegation
            >>> xlr8_col.create_index("timestamp")  # Works via delegation
            >>> count = xlr8_col.count_documents({})  # Works via delegation
        """
        # Guard against infinite recursion: if __init__ has not run yet
        # (e.g. object created via __new__, unpickling, or copy.copy),
        # "_pymongo_collection" is missing and looking it up below would
        # re-enter __getattr__ forever. Raise AttributeError as Python expects.
        if name == "_pymongo_collection":
            raise AttributeError(name)
        return getattr(self._pymongo_collection, name)

    # Read operations (can be accelerated)
    def find(
        self,
        filter: Optional[Dict[str, Any]] = None,
        projection: Optional[Dict[str, Any]] = None,
        skip: int = 0,
        limit: int = 0,
        sort: Optional[List[tuple]] = None,
        batch_size: int = 1000,
        **kwargs,
    ) -> XLR8Cursor:
        """
        Query collection with optional acceleration.

        Returns XLR8Cursor which is PyMongo-compatible but can accelerate
        to_dataframe() / to_polars() conversions.

        The returned cursor only captures the query parameters; no work is
        done until iteration or a conversion method is called. to_dataframe()
        then triggers query analysis, cache lookup, and parallel fetch.

        Args:
            filter: Query filter dict
            projection: Field projection dict
            skip: Number of documents to skip
            limit: Maximum documents to return
            sort: Sort specification
            batch_size: Batch size for iteration
            **kwargs: Additional PyMongo cursor options

        Returns:
            XLR8Cursor instance

        Example:
            >>> # Simple query
            >>> cursor = col.find({"status": "active"})
            >>>
            >>> # Query with time range (accelerated)
            >>> cursor = col.find({
            ...     "timestamp": {"$gte": start, "$lt": end},
            ...     "sensor_id": "sensor_1"
            ... })
            >>> df = cursor.to_dataframe()
        """
        if filter is None:
            filter = {}

        return XLR8Cursor(
            collection=self,
            query_filter=filter,
            projection=projection,
            skip=skip,
            limit=limit,
            sort=sort,
            batch_size=batch_size,
            **kwargs,  # Pass through all PyMongo cursor options
        )

    # XLR8-specific methods

    def set_schema(self, schema: Schema) -> None:
        """
        Set or update schema for acceleration.

        Args:
            schema: Schema definition
        """
        self._schema = schema

    def get_schema(self) -> Optional[Schema]:
        """
        Get current schema.

        Returns:
            Schema or None
        """
        return self._schema
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def accelerate(
    pymongo_collection: PyMongoCollection,
    schema: Schema,
    mongo_uri: Union[str, Callable[[], str]],
    approx_document_size_bytes: int = 500,
) -> XLR8Collection:
    """Wrap a PyMongo collection in an :class:`XLR8Collection`.

    This is the main user-facing entry point. The returned wrapper passes
    writes straight through to PyMongo, stores ``schema`` for type-aware
    Parquet encoding, and keeps ``mongo_uri`` so accelerated workers can
    open their own connections.

    Typical flow after wrapping:
        1. ``xlr8_col.find({...})`` returns an XLR8Cursor holding the query.
        2. ``cursor.to_dataframe()`` triggers analysis, caching, and a
           parallel fetch using ``mongo_uri``.

    Args:
        pymongo_collection: PyMongo Collection instance to wrap.
        schema: Schema definition (time field + typed fields).
        mongo_uri: Connection string, or a zero-arg callable producing one
            (e.g. ``lambda: os.environ["MONGODB_URI"]``). Required for
            accelerated execution.
        approx_document_size_bytes: Estimated per-document size in bytes
            (default 500), used for memory budgeting.

    Returns:
        XLR8Collection wrapping ``pymongo_collection``.

    Note:
        The Parquet cache directory is managed automatically per query hash.
        Per-query knobs such as ``max_workers``, ``chunking_granularity`` and
        ``flush_ram_limit_mb`` belong to ``to_dataframe()`` / ``to_polars()``.

    Example:
        >>> from datetime import timedelta
        >>> accelerated = accelerate(col, schema, mongo_uri="mongodb://localhost:27017")
        >>> df = accelerated.find(
        ...     {"timestamp": {"$gte": start, "$lt": end}}
        ... ).to_dataframe(
        ...     max_workers=8,
        ...     chunking_granularity=timedelta(days=1),
        ...     flush_ram_limit_mb=2000,
        ... )
    """
    wrapper = XLR8Collection(
        pymongo_collection,
        schema=schema,
        mongo_uri=mongo_uri,
        approx_document_size_bytes=approx_document_size_bytes,
    )
    return wrapper
|