xlr8 0.1.7b3__cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xlr8/schema/encoder.py ADDED
@@ -0,0 +1,235 @@
1
+ """
2
+ Value encoder for XLR8 (Python reference implementation).
3
+
4
+ NOTE: This module provides a pure Python implementation for reference
5
+ and testing. The Rust backend (encode_any_values_to_arrow) provides
6
+ an optimized version that is much faster.
7
+
8
+ ================================================================================
9
+ DATA FLOW - POLYMORPHIC VALUE ENCODING (Types.Any)
10
+ ================================================================================
11
+
12
+ This module handles encoding/decoding of polymorphic values (Types.Any) to/from
13
+ union structs. This is the "bitmap struct" pattern from cetolib.
14
+
15
+ THE PROBLEM:
16
+ ────────────────────────────────────────────────────────────────────────────────
17
+
18
+ MongoDB "value" fields often contain mixed types:
19
+ doc1: {"value": 42.5} # float
20
+ doc2: {"value": 100} # int
21
+ doc3: {"value": "active"} # string
22
+ doc4: {"value": true} # bool
23
+
24
+ Parquet columns must be homogeneous (one type per column).
25
+ How do we store mixed types?
26
+
27
+ THE SOLUTION - UNION STRUCT:
28
+ ────────────────────────────────────────────────────────────────────────────────
29
+
30
+ Store as a struct with ONE field populated, others null:
31
+
32
+ ┌─────────────────────────────────────────────────────────────────────────────┐
33
+ │ ENCODE: Python value -> Struct │
34
+ │ │
35
+ │ encode_any(42.5) returns: │
36
+ │ { │
37
+ │ "float_value": 42.5, ← VALUE IS HERE │
38
+ │ "int_value": null, │
39
+ │ "string_value": null, │
40
+ │ "bool_value": null, │
41
+ │ "datetime_value": null, │
42
+ │ "objectid_value": null, │
43
+ │ "null_value": null, │
44
+ │ } │
45
+ │ │
46
+ │ encode_any("active") returns: │
47
+ │ { │
48
+ │ "float_value": null, │
49
+ │ "int_value": null, │
50
+ │ "string_value": "active", ← VALUE IS HERE │
51
+ │ "bool_value": null, │
52
+ │ "datetime_value": null, │
53
+ │ "objectid_value": null, │
54
+ │ "null_value": null, │
55
+ │ } │
56
+ └─────────────────────────────────────────────────────────────────────────────┘
57
+
58
+ ┌─────────────────────────────────────────────────────────────────────────────┐
59
+ │ DECODE: Struct -> Python value │
60
+ │ │
61
+ │ decode_any({"float_value": 42.5, ...others null}) returns: 42.5 │
62
+ │ decode_any({"string_value": "active", ...others null}) returns: "active" │
63
+ │ │
64
+ │ Algorithm: Check each field in order, return first non-null │
65
+ └─────────────────────────────────────────────────────────────────────────────┘
66
+
67
+ TYPE MAPPING:
68
+ ────────────────────────────────────────────────────────────────────────────────
69
+
70
+ Python type -> Struct field
71
+ ──────────────────────────────────
72
+ None -> null_value: True
73
+ bool -> bool_value (CHECK BEFORE int!)
74
+ int -> int_value
75
+ float -> float_value
76
+ str -> string_value
77
+ datetime -> datetime_value
78
+ ObjectId -> objectid_value (as string)
79
+ other -> string_value (JSON serialized)
80
+
81
+ NOTE: bool must be checked BEFORE int because isinstance(True, int) is True!
82
+
83
+ ================================================================================
84
+ """
85
+
86
+ import json
87
+ from datetime import datetime
88
+ from typing import Any as AnyPython
89
+ from typing import Dict
90
+
91
+ from bson import ObjectId
92
+
93
+ __all__ = [
94
+ "ValueEncoder",
95
+ ]
96
+
97
+
98
class ValueEncoder:
    """
    Encode and decode polymorphic (Types.Any) values as union structs.

    A union struct is a plain dict with a fixed set of keys, of which exactly
    one is populated according to the Python type of the encoded value; all
    other keys are None.

    Example:
        encoder = ValueEncoder()

        # Encode different types
        encoder.encode_any(42.5)     # {"float_value": 42.5, ...others None}
        encoder.encode_any("hello")  # {"string_value": "hello", ...others None}
        encoder.encode_any(True)     # {"bool_value": True, ...others None}

        # Decode back
        struct = {"float_value": 42.5, "int_value": None, ...}
        encoder.decode_any(struct)   # Returns: 42.5
    """

    # Keys of the union struct, in the order they appear in every encoded dict.
    _STRUCT_FIELDS = (
        "float_value",
        "int_value",
        "string_value",
        "bool_value",
        "datetime_value",
        "objectid_value",
        "null_value",
    )

    # Fields whose stored payload is returned unchanged by decode_any, listed
    # in the order decode_any probes them.
    _PASSTHROUGH_FIELDS = (
        "float_value",
        "int_value",
        "bool_value",
        "datetime_value",
    )

    @staticmethod
    def encode_any(value: AnyPython) -> Dict[str, AnyPython]:
        """
        Encode a polymorphic value to a union struct.

        Type mapping:
            - None     -> null_value: True
            - bool     -> bool_value
            - int      -> int_value
            - float    -> float_value
            - str      -> string_value
            - datetime -> datetime_value
            - ObjectId -> objectid_value (as string)
            - other    -> string_value (JSON serialized, str() as last resort)

        Args:
            value: Python value to encode

        Returns:
            Dict with one field populated, others None
        """
        result: Dict[str, AnyPython] = dict.fromkeys(ValueEncoder._STRUCT_FIELDS)

        if value is None:
            result["null_value"] = True
        elif isinstance(value, bool):
            # bool is a subclass of int, so it must be tested before int.
            result["bool_value"] = value
        elif isinstance(value, int):
            result["int_value"] = value
        elif isinstance(value, float):
            result["float_value"] = value
        elif isinstance(value, str):
            result["string_value"] = value
        elif isinstance(value, datetime):
            result["datetime_value"] = value
        elif isinstance(value, ObjectId):
            result["objectid_value"] = str(value)
        else:
            # Fallback for complex types (dicts, lists, ...): JSON text, with
            # non-serializable leaves stringified via default=str.
            try:
                result["string_value"] = json.dumps(value, default=str)
            except (TypeError, ValueError):
                result["string_value"] = str(value)

        return result

    @staticmethod
    def decode_any(struct_value: Dict[str, AnyPython]) -> AnyPython:
        """
        Decode a union struct back to a Python value.

        A truthy null_value wins; otherwise fields are probed in the order
        float, int, bool, datetime, objectid, string and the first non-None
        payload is returned. objectid_value is converted back to ObjectId.
        Values that were JSON-serialized by encode_any come back as the
        JSON string, not as the original object.

        Args:
            struct_value: Dict with union struct fields. None is tolerated
                (e.g. a null struct entry read back from Arrow/Parquet) and
                decodes to None instead of raising AttributeError.

        Returns:
            Decoded Python value, or None when the struct is None, marks a
            null, or has no populated field.
        """
        if struct_value is None:
            # A null struct carries no fields at all; treat it as a null value.
            return None

        if struct_value.get("null_value"):
            return None

        for field_name in ValueEncoder._PASSTHROUGH_FIELDS:
            payload = struct_value.get(field_name)
            if payload is not None:
                return payload

        object_id_text = struct_value.get("objectid_value")
        if object_id_text is not None:
            return ObjectId(object_id_text)

        # string_value is checked last; if it is also None, every field was
        # empty (shouldn't happen with valid data) and None is returned.
        return struct_value.get("string_value")

    @staticmethod
    def encode_batch(values: list) -> list:
        """
        Encode a batch of values.

        Args:
            values: List of Python values

        Returns:
            List of encoded structs, one per input value, in input order
        """
        return [ValueEncoder.encode_any(v) for v in values]

    @staticmethod
    def decode_batch(struct_values: list) -> list:
        """
        Decode a batch of struct values.

        Args:
            struct_values: List of union structs (None entries are allowed
                and decode to None)

        Returns:
            List of decoded Python values, in input order
        """
        return [ValueEncoder.decode_any(s) for s in struct_values]
xlr8/schema/schema.py ADDED
@@ -0,0 +1,265 @@
1
+ """
2
+ Schema definition for XLR8.
3
+
4
+ Schema describes the structure of MongoDB documents and how they map to Arrow/Parquet.
5
+ """
6
+
7
+ from typing import Dict, List
8
+
9
+ import pyarrow as pa
10
+
11
+ from .types import Any as AnyType
12
+ from .types import BaseType, DateTime, Timestamp
13
+
14
+
15
class Schema:
    """
    Describes MongoDB document structure for XLR8.

    Holds a mapping of field name to XLR8 type plus the name of the time
    field, and can render itself as a PyArrow schema or as a plain-dict
    specification.

    Example:
    ```python
    schema = Schema(
        time_field="timestamp",
        fields={
            "timestamp": Types.Timestamp("ns", tz="UTC"),
            "sensor_id": Types.String(),
            "value": Types.Float(),
            "metadata": Types.Any,  # Polymorphic
        }
    )
    ```
    """

    # Version number written into the dict produced by to_spec().
    SPEC_VERSION = 1

    def __init__(
        self,
        time_field: str,
        fields: Dict[str, BaseType],
        avg_doc_size_bytes: int = 500,
        flatten_nested: bool = True,
    ):
        """
        Create a schema definition and validate it.

        Args:
            time_field: Name of the timestamp field; must be a key of fields
            fields: Dict mapping field name to XLR8 type
            avg_doc_size_bytes: Average BSON document size in bytes (default: 500)
            flatten_nested: Flag stored on the schema; described upstream as
                flattening nested paths like "metadata.user_id"

        Raises:
            ValueError: If time_field is not in fields, or its type is
                neither Timestamp nor DateTime
        """
        self.time_field = time_field
        self.fields = fields
        self.avg_doc_size_bytes = avg_doc_size_bytes
        self.flatten_nested = flatten_nested
        self._validate()

    def _validate(self):
        """Check that time_field exists and is a Timestamp or DateTime."""
        if self.time_field not in self.fields:
            raise ValueError(
                f"time_field '{self.time_field}' must be present in fields. "
                f"Available fields: {list(self.fields.keys())}"
            )

        declared = self.fields[self.time_field]
        if isinstance(declared, (Timestamp, DateTime)):
            return

        raise ValueError(
            f"time_field '{self.time_field}' must be Timestamp or DateTime type, "
            f"got {type(declared).__name__}"
        )

    def to_arrow_schema(self) -> pa.Schema:
        """
        Convert to a PyArrow schema.

        Returns:
            PyArrow schema with one (name, type.to_arrow()) entry per field,
            in the iteration order of fields
        """
        arrow_fields = [
            (field_name, xlr8_type.to_arrow())
            for field_name, xlr8_type in self.fields.items()
        ]
        return pa.schema(arrow_fields)

    def get_any_fields(self) -> List[str]:
        """
        Get the names of fields whose type is an Any instance.

        Returns:
            List of field names that are Any type
        """
        any_names: List[str] = []
        for field_name, xlr8_type in self.fields.items():
            if isinstance(xlr8_type, AnyType):
                any_names.append(field_name)
        return any_names

    def get_field_names(self) -> List[str]:
        """
        Get all field names in schema.

        Returns:
            List of field names
        """
        return list(self.fields.keys())

    def has_field(self, name: str) -> bool:
        """
        Check if field exists in schema.

        Args:
            name: Field name to check

        Returns:
            True if field exists
        """
        return name in self.fields

    def get_field_type(self, name: str) -> BaseType:
        """
        Get type for a field.

        Args:
            name: Field name

        Returns:
            XLR8 type object

        Raises:
            KeyError: If field not in schema
        """
        return self.fields[name]

    @staticmethod
    def _list_element_kind(elem_type, types_module) -> str:
        """Map a List element type to its kind string (first match wins)."""
        candidates = (
            (types_module.Float, "float"),
            (types_module.Int, "int"),
            (types_module.String, "string"),
            (types_module.Bool, "bool"),
            (types_module.DateTime, "datetime"),
            (types_module.ObjectId, "objectid"),
        )
        for type_cls, kind in candidates:
            if isinstance(elem_type, type_cls):
                return kind
        raise ValueError(
            f"Unsupported List element type: {type(elem_type).__name__}. "
            f"Supported types: Float, Int, String, Bool, DateTime, ObjectId"
        )

    def to_spec(self) -> Dict[str, object]:
        """Export schema to a plain-dict specification.

        Each field becomes a dict with "name" and "kind" (plus kind-specific
        keys). Kinds are chosen by an explicit isinstance chain, checked in
        this order: Timestamp/DateTime, ObjectId, Any, Int, Float, String,
        Bool, List. Any other type falls back to an "any" kind with a single
        json_blob variant.

        The returned dict contains version, time_field, avg_doc_size_bytes
        and fields; flatten_nested is not part of the spec.

        Returns:
            Dict containing schema version, time field, and field specifications

        Raises:
            ValueError: If a List field has an unsupported element type

        Example:
            >>> schema = Schema(
            ...     time_field="ts",
            ...     fields={"ts": Timestamp("ms"), "value": Float()}
            ... )
            >>> spec = schema.to_spec()
        """
        from . import types as Types  # local to avoid cycles

        fields_spec: List[Dict[str, object]] = []
        for field_name, xlr8_type in self.fields.items():
            spec_entry: Dict[str, object] = {"name": field_name}

            if isinstance(xlr8_type, (Types.Timestamp, Types.DateTime)):
                # Timestamp and DateTime share one serialized form; a missing
                # or falsy tz is written as "UTC".
                spec_entry["kind"] = "timestamp"
                spec_entry["unit"] = getattr(xlr8_type, "unit", "ms")
                spec_entry["tz"] = getattr(xlr8_type, "tz", "UTC") or "UTC"
            elif isinstance(xlr8_type, Types.ObjectId):
                spec_entry["kind"] = "objectid"
            elif isinstance(xlr8_type, Types.Any):
                # Fixed variant table for the Any union layout.
                any_layout: Dict[str, object] = {
                    "variants": [
                        {"name": "int64", "id": 0},
                        {"name": "float64", "id": 1},
                        {"name": "bool", "id": 2},
                        {"name": "string", "id": 3},
                        {"name": "timestamp_ms_utc", "id": 4},
                        {"name": "json_blob", "id": 5},
                    ],
                }

                # Optional explicit field naming is surfaced only when the
                # Any instance defines it.
                bitmap_field = getattr(xlr8_type, "bitmap_field_name", None)
                payload_field = getattr(xlr8_type, "payload_field_name", None)
                if bitmap_field is not None:
                    any_layout["bitmap_field"] = bitmap_field
                if payload_field is not None:
                    any_layout["payload_field"] = payload_field

                spec_entry["kind"] = "any"
                spec_entry["any_layout"] = any_layout
            elif isinstance(xlr8_type, Types.Int):
                spec_entry["kind"] = "int64"
            elif isinstance(xlr8_type, Types.Float):
                spec_entry["kind"] = "float64"
            elif isinstance(xlr8_type, Types.String):
                spec_entry["kind"] = "string"
            elif isinstance(xlr8_type, Types.Bool):
                spec_entry["kind"] = "bool"
            elif isinstance(xlr8_type, Types.List):
                # Lists are encoded as "list:<element kind>".
                elem_kind = Schema._list_element_kind(xlr8_type.element_type, Types)
                spec_entry["kind"] = f"list:{elem_kind}"
            else:
                # Conservative fallback: treat as json_blob-backed Any.
                spec_entry["kind"] = "any"
                spec_entry["any_layout"] = {
                    "variants": [{"name": "json_blob", "id": 0}],
                }

            fields_spec.append(spec_entry)

        return {
            "version": self.SPEC_VERSION,
            "time_field": self.time_field,
            "avg_doc_size_bytes": self.avg_doc_size_bytes,
            "fields": fields_spec,
        }

    def __repr__(self) -> str:
        """Return a multi-line listing of the time field and every field type."""
        listing = "\n".join(
            f" {field_name}: {xlr8_type}"
            for field_name, xlr8_type in self.fields.items()
        )
        return f"Schema(time_field='{self.time_field}',\n" + listing + "\n)"