xlr8-0.1.7b3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

xlr8/schema/types.py ADDED
@@ -0,0 +1,239 @@
+ """
+ Type definitions for XLR8's schema system.
+
+ This module provides type classes that define how MongoDB BSON values
+ are mapped to Parquet types. These types form the foundation of XLR8's
+ schema system, enabling efficient storage and querying of MongoDB data.
+
+ Key Features:
+ - **Type Safety**: Explicit type definitions for MongoDB document schemas
+ - **Arrow Integration**: Seamless conversion between MongoDB BSON and Apache
+   Arrow types
+ - **Flexible Schema**: Support for both strict and flexible schemas via Types.Any
+
+ Supported Types:
+ - Primitives: String, Int, Float, Bool, Timestamp, DateTime, ObjectId
+   TODO: include all BSON types
+ - Complex: Struct (nested documents), List (arrays)
+
+ Schema Behavior:
+ - Fields defined in the schema are type-checked and converted to Arrow types.
+ - Fields not in the schema are discarded when writing to Parquet.
+ - Types.Any provides a flexible escape hatch for dynamic/unknown fields,
+   which are stored as structs in Parquet and later decoded back to their
+   original BSON types by the Rust backend.
+ """
+
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Dict, Optional
+
+ import pyarrow as pa
+
+
+ class BaseType(ABC):
+     """Base class for all XLR8 types."""
+
+     @abstractmethod
+     def to_arrow(self) -> pa.DataType:
+         """Convert to PyArrow data type."""
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}()"
+
+     def __eq__(self, other) -> bool:
+         """Compare types for equality."""
+         return isinstance(other, self.__class__)
+
+     def __hash__(self) -> int:
+         """Make types hashable for use in sets/dicts."""
+         return hash(self.__class__.__name__)
+
+
+ class String(BaseType):
+     """String type."""
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.string()
+
+
+ class Int(BaseType):
+     """Integer type (always 64-bit)."""
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.int64()
+
+
+ class Float(BaseType):
+     """Floating-point type (always 64-bit)."""
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.float64()
+
+
+ class Bool(BaseType):
+     """Boolean type."""
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.bool_()
+
+
+ @dataclass(frozen=True)
+ class Timestamp(BaseType):
+     """Timestamp type."""
+
+     unit: str = "ns"
+     tz: Optional[str] = "UTC"
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.timestamp(self.unit, tz=self.tz)
+
+
+ @dataclass(frozen=True)
+ class DateTime(BaseType):
+     """
+     DateTime type - convenience wrapper for MongoDB ISODate fields.
+
+     Automatically uses millisecond precision (MongoDB's standard format).
+     For custom precision, use Timestamp() directly.
+
+     Args:
+         tz: Timezone (default: "UTC")
+
+     Example:
+         >>> Schema(
+         ...     time_field="createdAt",
+         ...     fields={
+         ...         "createdAt": Types.DateTime(),  # MongoDB ISODate
+         ...         "customTime": Types.Timestamp("s", tz="UTC"),  # Custom unit
+         ...     }
+         ... )
+     """
+
+     tz: Optional[str] = "UTC"
+
+     def to_arrow(self) -> pa.DataType:
+         # MongoDB stores ISODate as milliseconds since the epoch
+         return pa.timestamp("ms", tz=self.tz)
+
+
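+ # A minimal sketch of the resulting Arrow types (illustrative only; assumes
+ # pyarrow is installed and this module has been imported):
+ #
+ # >>> print(DateTime().to_arrow())
+ # timestamp[ms, tz=UTC]
+ # >>> print(Timestamp("s", tz=None).to_arrow())
+ # timestamp[s]
+
+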
+ class ObjectId(BaseType):
+     """MongoDB ObjectId type (stored as string in Parquet)."""
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.string()
+
+
+ class Any(BaseType):
+     """
+     Polymorphic type - can hold any MongoDB value.
+
+     Stored in Parquet as a union-style struct with one field per possible
+     type; a given value populates exactly one field. The Rust backend
+     handles encoding/decoding for performance.
+
+     Supports ALL MongoDB BSON types:
+     - Double (float64)
+     - Int32 (int32)
+     - Int64 (int64)
+     - String (utf8)
+     - ObjectId (hex string)
+     - Decimal128 (string)
+     - Regex (pattern string)
+     - Binary (base64 string)
+     - Document (JSON string)
+     - Array (JSON string)
+     - Boolean (bool)
+     - Date (timestamp[ms])
+     - Null (bool indicator)
+     """
+
+     def to_arrow(self) -> pa.DataType:
+         """Return the Arrow struct type for polymorphic values.
+
+         This schema must match the Rust backend's encode_any_values_to_arrow
+         and decode_any_struct_arrow functions exactly.
+         """
+         return pa.struct(
+             [
+                 ("float_value", pa.float64()),
+                 ("int32_value", pa.int32()),
+                 ("int64_value", pa.int64()),
+                 ("string_value", pa.string()),
+                 ("objectid_value", pa.string()),
+                 ("decimal128_value", pa.string()),
+                 ("regex_value", pa.string()),
+                 ("binary_value", pa.string()),
+                 ("document_value", pa.string()),
+                 ("array_value", pa.string()),
+                 ("bool_value", pa.bool_()),
+                 ("datetime_value", pa.timestamp("ms")),
+                 ("null_value", pa.bool_()),
+             ]
+         )
+
+
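+ # A minimal sketch of how one polymorphic value lands in this struct
+ # (illustrative only; the real encoding is done by the Rust backend):
+ #
+ # >>> import pyarrow as pa
+ # >>> arr = pa.array([{"string_value": "hello"}], type=Any().to_arrow())
+ # >>> arr[0]["string_value"]
+ # <pyarrow.StringScalar: 'hello'>
+ # >>> arr[0]["int64_value"]  # every other slot stays null
+ # <pyarrow.Int64Scalar: None>
+
+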
+ class Struct(BaseType):
+     """Nested struct type."""
+
+     def __init__(self, fields: Dict[str, BaseType]):
+         """
+         Args:
+             fields: Dict mapping field name to type
+         """
+         self.fields = fields
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.struct(
+             [(name, field_type.to_arrow()) for name, field_type in self.fields.items()]
+         )
+
+     def __repr__(self) -> str:
+         field_str = ", ".join(f"{k}: {v}" for k, v in self.fields.items())
+         return f"Struct({{{field_str}}})"
+
+     def __eq__(self, other) -> bool:
+         if not isinstance(other, Struct):
+             return False
+         if set(self.fields.keys()) != set(other.fields.keys()):
+             return False
+         return all(self.fields[k] == other.fields[k] for k in self.fields)
+
+     def __hash__(self) -> int:
+         # Defining __eq__ would otherwise set __hash__ to None; hash the
+         # field name/type pairs order-insensitively to stay consistent
+         # with __eq__.
+         return hash(frozenset(self.fields.items()))
+
+
+ class List(BaseType):
+     """List type."""
+
+     def __init__(self, element_type: BaseType):
+         """
+         Args:
+             element_type: Type of list elements
+         """
+         self.element_type = element_type
+
+     def to_arrow(self) -> pa.DataType:
+         return pa.list_(self.element_type.to_arrow())
+
+     def __repr__(self) -> str:
+         return f"List({self.element_type})"
+
+     def __eq__(self, other) -> bool:
+         return isinstance(other, List) and self.element_type == other.element_type
+
+     def __hash__(self) -> int:
+         # Keep hashability in line with __eq__ (see the BaseType docstring).
+         return hash((self.__class__.__name__, self.element_type))
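+
+
+ # A minimal composition sketch (illustrative; assumes pyarrow is installed):
+ #
+ # >>> user = Struct({"name": String(), "tags": List(String()), "age": Int()})
+ # >>> print(user.to_arrow())
+ # struct<name: string, tags: list<item: string>, age: int64>
+ # >>> user == Struct({"age": Int(), "tags": List(String()), "name": String()})
+ # True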
xlr8/storage/__init__.py ADDED
@@ -0,0 +1,17 @@
+ """
+ Parquet storage layer for XLR8.
+
+ Provides efficient storage components for MongoDB query results:
+
+ - Reader: Batch-aware Parquet reader for DataFrame construction
+ - Cache: Query-specific cache management with deterministic hashing
+ """
+
+ from .cache import CacheManager, hash_query
+ from .reader import ParquetReader
+
+ __all__ = [
+     "ParquetReader",
+     "CacheManager",
+     "hash_query",
+ ]
xlr8/storage/cache.py ADDED
@@ -0,0 +1,228 @@
+ """
+ Cache management for XLR8 Parquet storage.
+
+ This module provides query-specific caching for MongoDB results:
+
+ 1. Query Hashing (hash_query):
+    - Creates a deterministic MD5 hash from the query parameters
+      (filter, projection, sort)
+    - Normalizes datetimes to ISO format and ObjectIds to strings
+    - Recursively sorts dicts for determinism
+    - The same query always produces the same hash
+
+ 2. Cache Lifecycle (CacheManager):
+    - Each query gets a unique directory: .cache/{query_hash}/
+    - Manages Parquet file storage per query
+    - Provides cache existence checking, file listing, and cleanup
+
+ Usage:
+     # Hash a query
+     query_hash = hash_query(filter_dict={"timestamp": {"$gte": start_date}})
+
+     # Manage the cache lifecycle
+     cache = CacheManager(filter_dict={"timestamp": {"$gte": start_date}})
+     cache.ensure_cache_dir()
+     # ... write parquet files to cache.cache_dir ...
+     if cache.exists():
+         files = cache.list_parquet_files()
+     cache.clean()  # Remove when done
+ """
+
+ import hashlib
+ import json
+ import shutil
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Dict, Optional
+
+ from bson import ObjectId
+
+
+ def hash_query(
+     filter_dict: Dict[str, Any],
+     projection: Optional[Dict[str, Any]] = None,
+     sort: Optional[list] = None,
+ ) -> str:
+     """
+     Create a deterministic hash of query parameters.
+
+     Uses an MD5 hash of canonicalized JSON to create a unique cache
+     directory name. The same query parameters always produce the same hash.
+
+     Args:
+         filter_dict: MongoDB filter dictionary
+         projection: Field projection
+         sort: Sort specification
+
+     Returns:
+         Hex string hash (32 characters)
+
+     Example:
+         >>> hash_query({"timestamp": {"$gte": "2024-01-01"}})
+         'a3f5c9d2e1b4f6a8c7e9d1b3f5a7c9e1'
+     """
+
+     def normalize_value(obj):
+         """
+         Recursively normalize query values for deterministic hashing.
+
+         Converts datetimes to ISO strings and ObjectIds to strings, and
+         sorts dict keys so the same query always hashes identically.
+         """
+         if isinstance(obj, datetime):
+             return obj.isoformat()
+         elif isinstance(obj, ObjectId):
+             return str(obj)
+         elif isinstance(obj, dict):
+             return {k: normalize_value(v) for k, v in sorted(obj.items())}
+         elif isinstance(obj, list):
+             return [normalize_value(v) for v in obj]
+         return obj
+
+     # Build canonical representation
+     query_repr = {
+         "filter": normalize_value(filter_dict),
+     }
+
+     if projection:
+         query_repr["projection"] = normalize_value(projection)
+
+     if sort:
+         query_repr["sort"] = normalize_value(sort)
+
+     # Create deterministic JSON (sorted keys)
+     json_str = json.dumps(query_repr, sort_keys=True, separators=(",", ":"))
+
+     # Hash it
+     return hashlib.md5(json_str.encode("utf-8")).hexdigest()
+
+
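+ # A quick determinism check (illustrative; key order must not affect the hash):
+ #
+ # >>> a = hash_query({"b": 1, "a": {"y": 2, "x": 1}})
+ # >>> b = hash_query({"a": {"x": 1, "y": 2}, "b": 1})
+ # >>> a == b
+ # True
+
+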
+ class CacheManager:
+     """
+     Manages the Parquet cache lifecycle for a specific query.
+
+     Each query gets a unique cache directory based on its query hash:
+         .cache/{query_hash}/
+
+     Provides:
+     - Cache directory creation
+     - Cache existence checking
+     - Cache cleanup
+
+     Example:
+         >>> cache = CacheManager(filter_dict={"timestamp": {"$gte": start}})
+         >>> cache.ensure_cache_dir()
+         >>> # ... write parquet files to cache.cache_dir ...
+         >>> cache.clean()  # Remove cache when done
+     """
+
+     def __init__(
+         self,
+         filter_dict: Dict[str, Any],
+         projection: Optional[Dict[str, Any]] = None,
+         sort: Optional[list] = None,
+         cache_root: Path = Path(".cache"),
+     ):
+         """
+         Initialize the cache manager for a query.
+
+         Args:
+             filter_dict: MongoDB filter
+             projection: Field projection
+             sort: Sort specification
+             cache_root: Root directory for all caches (default: .cache)
+         """
+         self.filter_dict = filter_dict
+         self.projection = projection
+         self.sort = sort
+         self.cache_root = Path(cache_root)
+
+         # Generate the query hash
+         self.query_hash = hash_query(filter_dict, projection, sort)
+
+         # Cache directory for this specific query
+         self.cache_dir = self.cache_root / self.query_hash
+
+     def ensure_cache_dir(self) -> Path:
+         """
+         Create the cache directory if it doesn't exist.
+
+         Returns:
+             Path to the cache directory
+         """
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         return self.cache_dir
+
+     def exists(self) -> bool:
+         """
+         Check whether the cache directory exists and contains parquet files.
+
+         Returns:
+             True if the cache exists with at least one .parquet file
+         """
+         if not self.cache_dir.exists():
+             return False
+
+         # Check for at least one parquet file
+         parquet_files = list(self.cache_dir.glob("*.parquet"))
+         return len(parquet_files) > 0
+
+     def list_parquet_files(self) -> list[Path]:
+         """
+         List all parquet files in the cache directory.
+
+         Returns:
+             List of parquet file paths, sorted by name
+         """
+         if not self.cache_dir.exists():
+             return []
+
+         files = sorted(self.cache_dir.glob("*.parquet"))
+         return files
+
+     def clean(self) -> bool:
+         """
+         Remove the cache directory and all of its contents.
+
+         Use after downloading data to free disk space.
+
+         Returns:
+             True if the cache was removed, False if it didn't exist
+         """
+         if not self.cache_dir.exists():
+             return False
+
+         shutil.rmtree(self.cache_dir)
+         return True
+
+     def get_metadata(self) -> Dict[str, Any]:
+         """
+         Get cache metadata.
+
+         Returns:
+             Dict with keys:
+             - query_hash (str): Full hash of the query
+             - cache_dir (str): Path to the cache directory
+             - exists (bool): Whether the cache has parquet files
+             - file_count (int): Number of parquet files
+             - total_size_mb (float): Total size in megabytes
+         """
+         parquet_files = self.list_parquet_files()
+
+         total_size = sum(f.stat().st_size for f in parquet_files)
+         total_size_mb = total_size / (1024 * 1024)
+
+         return {
+             "query_hash": self.query_hash,
+             "cache_dir": str(self.cache_dir),
+             "exists": self.exists(),
+             "file_count": len(parquet_files),
+             "total_size_mb": round(total_size_mb, 2),
+         }
+
+     def __repr__(self) -> str:
+         meta = self.get_metadata()
+         return (
+             f"CacheManager(hash={self.query_hash[:8]}..., "
+             f"exists={meta['exists']}, files={meta['file_count']}, "
+             f"size={meta['total_size_mb']:.1f}MB)"
+         )
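+
+
+ # A minimal end-to-end sketch of the cache lifecycle (illustrative; the
+ # filter below is a hypothetical example):
+ #
+ # >>> cache = CacheManager(filter_dict={"sensor": "a1"})
+ # >>> cache_dir = cache.ensure_cache_dir()   # e.g. .cache/<32-char md5>/
+ # >>> cache.exists()                         # no .parquet files yet
+ # False
+ # >>> meta = cache.get_metadata()
+ # >>> (meta["file_count"], meta["total_size_mb"])
+ # (0, 0.0)
+ # >>> cache.clean()                          # removes the directory
+ # True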