xlr8-0.1.7b3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

xlr8/schema/types.py ADDED
@@ -0,0 +1,239 @@
+ """
+ Type definitions for XLR8's schema system.
+
+ This module provides type classes that define how MongoDB BSON values
+ are mapped to Parquet types. These types form the foundation of XLR8's
+ schema system, enabling efficient storage and querying of MongoDB data.
+
+ Key Features:
+ - **Type Safety**: Explicit type definitions for MongoDB document schemas
+ - **Arrow Integration**: Seamless conversion between MongoDB BSON and Apache
+   Arrow types
+ - **Flexible Schema**: Support for both strict and flexible schemas via Types.Any
+
+ Supported Types:
+ - Primitives: String, Int, Float, Bool, Timestamp, DateTime, ObjectId
+   TODO: include all BSON types
+ - Complex: Struct (nested documents), List (arrays)
+
+ Schema Behavior:
+ - Fields defined in the schema are type-checked and converted to Arrow types.
+ - Fields not in the schema are discarded when writing to Parquet.
+ - Types.Any provides a flexible escape hatch for dynamic/unknown fields,
+   which are stored as structs in Parquet and later decoded back to their
+   original BSON types by the Rust backend.
+ """
+
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Dict, Optional
+
+ import pyarrow as pa
+
+
+ class BaseType(ABC):
+     """Base class for all XLR8 types."""
+
+     @abstractmethod
+     def to_arrow(self) -> pa.DataType:
+         """Convert to PyArrow data type."""
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}()"
+
+     def __eq__(self, other) -> bool:
+         """Compare types for equality."""
+         return isinstance(other, self.__class__)
+
+     def __hash__(self) -> int:
+         """Make types hashable for use in sets/dicts."""
+         return hash(self.__class__.__name__)
+
+
+ class String(BaseType):
+     """String type."""
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.string()
+
+
+ class Int(BaseType):
+     """Integer type (always 64-bit)."""
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.int64()
+
+
+ class Float(BaseType):
+     """Floating-point type (always 64-bit)."""
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.float64()
+
+
+ class Bool(BaseType):
+     """Boolean type."""
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.bool_()
+
+
+ @dataclass(frozen=True)
+ class Timestamp(BaseType):
+     """Timestamp type."""
+
+     unit: str = "ns"
+     tz: Optional[str] = "UTC"
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.timestamp(self.unit, tz=self.tz)
+
+
+ @dataclass(frozen=True)
+ class DateTime(BaseType):
+     """
+     DateTime type - convenience wrapper for MongoDB ISODate fields.
+
+     Automatically uses millisecond precision (MongoDB's standard format).
+     For custom precision, use Timestamp() directly.
+
+     Args:
+         tz: Timezone (default: "UTC")
+
+     Example:
+         >>> Schema(
+         ...     time_field="createdAt",
+         ...     fields={
+         ...         "createdAt": Types.DateTime(),  # MongoDB ISODate
+         ...         "customTime": Types.Timestamp("s", tz="UTC"),  # Custom unit
+         ...     }
+         ... )
+     """
+
+     tz: Optional[str] = "UTC"
+
+     def to_arrow(self) -> pa.DataType:
+         # MongoDB stores ISODate as milliseconds since the epoch
+         return pa.timestamp("ms", tz=self.tz)
+
+
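+ # A minimal sketch of the resulting Arrow types (illustrative only; assumes
+ # pyarrow is installed and this module has been imported):
+ #
+ # >>> print(DateTime().to_arrow())
+ # timestamp[ms, tz=UTC]
+ # >>> print(Timestamp("s", tz=None).to_arrow())
+ # timestamp[s]
+
+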
+ class ObjectId(BaseType):
+     """MongoDB ObjectId type (stored as string in Parquet)."""
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.string()
+
+
+ class Any(BaseType):
+     """
+     Polymorphic type - can hold any MongoDB value.
+
+     Stored in Parquet as a union-style struct with one field per possible
+     type; a given value populates exactly one field. The Rust backend
+     handles encoding/decoding for performance.
+
+     Supports ALL MongoDB BSON types:
+     - Double (float64)
+     - Int32 (int32)
+     - Int64 (int64)
+     - String (utf8)
+     - ObjectId (hex string)
+     - Decimal128 (string)
+     - Regex (pattern string)
+     - Binary (base64 string)
+     - Document (JSON string)
+     - Array (JSON string)
+     - Boolean (bool)
+     - Date (timestamp[ms])
+     - Null (bool indicator)
+     """
+
+     def to_arrow(self) -> pa.DataType:
+         """Return the Arrow struct type for polymorphic values.
+
+         This schema must match the Rust backend's encode_any_values_to_arrow
+         and decode_any_struct_arrow functions exactly.
+         """
+         return pa.struct(
+             [
+                 ("float_value", pa.float64()),
+                 ("int32_value", pa.int32()),
+                 ("int64_value", pa.int64()),
+                 ("string_value", pa.string()),
+                 ("objectid_value", pa.string()),
+                 ("decimal128_value", pa.string()),
+                 ("regex_value", pa.string()),
+                 ("binary_value", pa.string()),
+                 ("document_value", pa.string()),
+                 ("array_value", pa.string()),
+                 ("bool_value", pa.bool_()),
+                 ("datetime_value", pa.timestamp("ms")),
+                 ("null_value", pa.bool_()),
+             ]
+         )
+
+
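+ # A minimal sketch of how one polymorphic value lands in this struct
+ # (illustrative only; the real encoding is done by the Rust backend):
+ #
+ # >>> import pyarrow as pa
+ # >>> arr = pa.array([{"string_value": "hello"}], type=Any().to_arrow())
+ # >>> arr[0]["string_value"]
+ # <pyarrow.StringScalar: 'hello'>
+ # >>> arr[0]["int64_value"]  # every other slot stays null
+ # <pyarrow.Int64Scalar: None>
+
+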
+ class Struct(BaseType):
+     """Nested struct type."""
+
+     def __init__(self, fields: Dict[str, BaseType]):
+         """
+         Args:
+             fields: Dict mapping field name to type
+         """
+         self.fields = fields
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.struct(
+             [(name, field_type.to_arrow()) for name, field_type in self.fields.items()]
+         )
+
+     def __repr__(self) -> str:
+         field_str = ", ".join(f"{k}: {v}" for k, v in self.fields.items())
+         return f"Struct({{{field_str}}})"
+
+     def __eq__(self, other) -> bool:
+         if not isinstance(other, Struct):
+             return False
+         if set(self.fields.keys()) != set(other.fields.keys()):
+             return False
+         return all(self.fields[k] == other.fields[k] for k in self.fields)
+
+     def __hash__(self) -> int:
+         # Defining __eq__ would otherwise set __hash__ to None; hash the
+         # field name/type pairs order-insensitively to stay consistent
+         # with __eq__.
+         return hash(frozenset(self.fields.items()))
+
+
+ class List(BaseType):
+     """List type."""
+
+     def __init__(self, element_type: BaseType):
+         """
+         Args:
+             element_type: Type of list elements
+         """
+         self.element_type = element_type
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.list_(self.element_type.to_arrow())
+
+     def __repr__(self) -> str:
+         return f"List({self.element_type})"
+
+     def __eq__(self, other) -> bool:
+         return isinstance(other, List) and self.element_type == other.element_type
+
+     def __hash__(self) -> int:
+         # Keep hashability in line with __eq__ (see the BaseType docstring).
+         return hash((self.__class__.__name__, self.element_type))
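+
+
+ # A minimal composition sketch (illustrative; assumes pyarrow is installed):
+ #
+ # >>> user = Struct({"name": String(), "tags": List(String()), "age": Int()})
+ # >>> print(user.to_arrow())
+ # struct<name: string, tags: list<item: string>, age: int64>
+ # >>> user == Struct({"age": Int(), "tags": List(String()), "name": String()})
+ # True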
xlr8/storage/__init__.py ADDED
@@ -0,0 +1,17 @@
+ """
+ Parquet storage layer for XLR8.
+
+ Provides efficient storage components for MongoDB query results:
+
+ - Reader: Batch-aware Parquet reader for DataFrame construction
+ - Cache: Query-specific cache management with deterministic hashing
+ """
+
+ from .cache import CacheManager, hash_query
+ from .reader import ParquetReader
+
+ __all__ = [
+     "ParquetReader",
+     "CacheManager",
+     "hash_query",
+ ]
xlr8/storage/cache.py ADDED
@@ -0,0 +1,228 @@
+ """
+ Cache management for XLR8 Parquet storage.
+
+ This module provides query-specific caching for MongoDB results:
+
+ 1. Query Hashing (hash_query):
+    - Creates a deterministic MD5 hash from the query parameters
+      (filter, projection, sort)
+    - Normalizes datetimes to ISO format and ObjectIds to strings
+    - Recursively sorts dicts for determinism
+    - The same query always produces the same hash
+
+ 2. Cache Lifecycle (CacheManager):
+    - Each query gets a unique directory: .cache/{query_hash}/
+    - Manages Parquet file storage per query
+    - Provides cache existence checking, file listing, and cleanup
+
+ Usage:
+     # Hash a query
+     query_hash = hash_query(filter_dict={"timestamp": {"$gte": start_date}})
+
+     # Manage the cache lifecycle
+     cache = CacheManager(filter_dict={"timestamp": {"$gte": start_date}})
+     cache.ensure_cache_dir()
+     # ... write parquet files to cache.cache_dir ...
+     if cache.exists():
+         files = cache.list_parquet_files()
+     cache.clean()  # Remove when done
+ """
+
+ import hashlib
+ import json
+ import shutil
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Dict, Optional
+
+ from bson import ObjectId
+
+
+ def hash_query(
+     filter_dict: Dict[str, Any],
+     projection: Optional[Dict[str, Any]] = None,
+     sort: Optional[list] = None,
+ ) -> str:
+     """
+     Create a deterministic hash of query parameters.
+
+     Uses an MD5 hash of canonicalized JSON to create a unique cache
+     directory name. The same query parameters always produce the same hash.
+
+     Args:
+         filter_dict: MongoDB filter dictionary
+         projection: Field projection
+         sort: Sort specification
+
+     Returns:
+         Hex string hash (32 characters)
+
+     Example:
+         >>> hash_query({"timestamp": {"$gte": "2024-01-01"}})
+         'a3f5c9d2e1b4f6a8c7e9d1b3f5a7c9e1'
+     """
+
+     def normalize_value(obj):
+         """
+         Recursively normalize query values for deterministic hashing.
+
+         Converts datetimes to ISO strings and ObjectIds to strings, and
+         sorts dict keys so the same query always hashes identically.
+         """
+         if isinstance(obj, datetime):
+             return obj.isoformat()
+         elif isinstance(obj, ObjectId):
+             return str(obj)
+         elif isinstance(obj, dict):
+             return {k: normalize_value(v) for k, v in sorted(obj.items())}
+         elif isinstance(obj, list):
+             return [normalize_value(v) for v in obj]
+         return obj
+
+     # Build canonical representation
+     query_repr = {
+         "filter": normalize_value(filter_dict),
+     }
+
+     if projection:
+         query_repr["projection"] = normalize_value(projection)
+
+     if sort:
+         query_repr["sort"] = normalize_value(sort)
+
+     # Create deterministic JSON (sorted keys)
+     json_str = json.dumps(query_repr, sort_keys=True, separators=(",", ":"))
+
+     # Hash it
+     return hashlib.md5(json_str.encode("utf-8")).hexdigest()
+
+
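+ # A quick determinism check (illustrative; key order must not affect the hash):
+ #
+ # >>> a = hash_query({"b": 1, "a": {"y": 2, "x": 1}})
+ # >>> b = hash_query({"a": {"x": 1, "y": 2}, "b": 1})
+ # >>> a == b
+ # True
+
+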
+ class CacheManager:
+     """
+     Manages the Parquet cache lifecycle for a specific query.
+
+     Each query gets a unique cache directory based on its query hash:
+         .cache/{query_hash}/
+
+     Provides:
+     - Cache directory creation
+     - Cache existence checking
+     - Cache cleanup
+
+     Example:
+         >>> cache = CacheManager(filter_dict={"timestamp": {"$gte": start}})
+         >>> cache.ensure_cache_dir()
+         >>> # ... write parquet files to cache.cache_dir ...
+         >>> cache.clean()  # Remove cache when done
+     """
+
+     def __init__(
+         self,
+         filter_dict: Dict[str, Any],
+         projection: Optional[Dict[str, Any]] = None,
+         sort: Optional[list] = None,
+         cache_root: Path = Path(".cache"),
+     ):
+         """
+         Initialize the cache manager for a query.
+
+         Args:
+             filter_dict: MongoDB filter
+             projection: Field projection
+             sort: Sort specification
+             cache_root: Root directory for all caches (default: .cache)
+         """
+         self.filter_dict = filter_dict
+         self.projection = projection
+         self.sort = sort
+         self.cache_root = Path(cache_root)
+
+         # Generate the query hash
+         self.query_hash = hash_query(filter_dict, projection, sort)
+
+         # Cache directory for this specific query
+         self.cache_dir = self.cache_root / self.query_hash
+
+     def ensure_cache_dir(self) -> Path:
+         """
+         Create the cache directory if it doesn't exist.
+
+         Returns:
+             Path to the cache directory
+         """
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         return self.cache_dir
+
+     def exists(self) -> bool:
+         """
+         Check whether the cache directory exists and contains parquet files.
+
+         Returns:
+             True if the cache exists with at least one .parquet file
+         """
+         if not self.cache_dir.exists():
+             return False
+
+         # Check for at least one parquet file
+         parquet_files = list(self.cache_dir.glob("*.parquet"))
+         return len(parquet_files) > 0
+
+     def list_parquet_files(self) -> list[Path]:
+         """
+         List all parquet files in the cache directory.
+
+         Returns:
+             List of parquet file paths, sorted by name
+         """
+         if not self.cache_dir.exists():
+             return []
+
+         files = sorted(self.cache_dir.glob("*.parquet"))
+         return files
+
+     def clean(self) -> bool:
+         """
+         Remove the cache directory and all of its contents.
+
+         Use after downloading data to free disk space.
+
+         Returns:
+             True if the cache was removed, False if it didn't exist
+         """
+         if not self.cache_dir.exists():
+             return False
+
+         shutil.rmtree(self.cache_dir)
+         return True
+
+     def get_metadata(self) -> Dict[str, Any]:
+         """
+         Get cache metadata.
+
+         Returns:
+             Dict with keys:
+             - query_hash (str): Full hash of the query
+             - cache_dir (str): Path to the cache directory
+             - exists (bool): Whether the cache has parquet files
+             - file_count (int): Number of parquet files
+             - total_size_mb (float): Total size in megabytes
+         """
+         parquet_files = self.list_parquet_files()
+
+         total_size = sum(f.stat().st_size for f in parquet_files)
+         total_size_mb = total_size / (1024 * 1024)
+
+         return {
+             "query_hash": self.query_hash,
+             "cache_dir": str(self.cache_dir),
+             "exists": self.exists(),
+             "file_count": len(parquet_files),
+             "total_size_mb": round(total_size_mb, 2),
+         }
+
+     def __repr__(self) -> str:
+         meta = self.get_metadata()
+         return (
+             f"CacheManager(hash={self.query_hash[:8]}..., "
+             f"exists={meta['exists']}, files={meta['file_count']}, "
+             f"size={meta['total_size_mb']:.1f}MB)"
+         )
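+
+
+ # A minimal end-to-end sketch of the cache lifecycle (illustrative; the
+ # filter below is a hypothetical example):
+ #
+ # >>> cache = CacheManager(filter_dict={"sensor": "a1"})
+ # >>> cache_dir = cache.ensure_cache_dir()   # e.g. .cache/<32-char md5>/
+ # >>> cache.exists()                         # no .parquet files yet
+ # False
+ # >>> meta = cache.get_metadata()
+ # >>> (meta["file_count"], meta["total_size_mb"])
+ # (0, 0.0)
+ # >>> cache.clean()                          # removes the directory
+ # True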