xlr8 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlr8/__init__.py +109 -0
- xlr8/_xlr8_rust.pyi +71 -0
- xlr8/analysis/__init__.py +58 -0
- xlr8/analysis/brackets.py +1201 -0
- xlr8/analysis/chunker.py +118 -0
- xlr8/analysis/inspector.py +1889 -0
- xlr8/collection/__init__.py +6 -0
- xlr8/collection/cursor.py +2145 -0
- xlr8/collection/cursor.pyi +173 -0
- xlr8/collection/wrapper.py +661 -0
- xlr8/collection/wrapper.pyi +218 -0
- xlr8/constants.py +24 -0
- xlr8/execution/__init__.py +43 -0
- xlr8/execution/callback.py +792 -0
- xlr8/execution/executor.py +500 -0
- xlr8/execution/planner.py +377 -0
- xlr8/py.typed +1 -0
- xlr8/rust_backend.py +42 -0
- xlr8/rust_backend.pyi +71 -0
- xlr8/schema/__init__.py +42 -0
- xlr8/schema/encoder.py +235 -0
- xlr8/schema/schema.py +265 -0
- xlr8/schema/types.py +239 -0
- xlr8/storage/__init__.py +17 -0
- xlr8/storage/cache.py +228 -0
- xlr8/storage/reader.py +1276 -0
- xlr8-0.1.2.dist-info/METADATA +177 -0
- xlr8-0.1.2.dist-info/RECORD +30 -0
- xlr8-0.1.2.dist-info/WHEEL +4 -0
- xlr8-0.1.2.dist-info/licenses/LICENSE +201 -0
xlr8/schema/encoder.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Value encoder for XLR8 (Python reference implementation).
|
|
3
|
+
|
|
4
|
+
NOTE: This module provides a pure Python implementation for reference
|
|
5
|
+
and testing. The Rust backend (encode_any_values_to_arrow) provides
|
|
6
|
+
an optimized version that is much faster.
|
|
7
|
+
|
|
8
|
+
================================================================================
|
|
9
|
+
DATA FLOW - POLYMORPHIC VALUE ENCODING (Types.Any)
|
|
10
|
+
================================================================================
|
|
11
|
+
|
|
12
|
+
This module handles encoding/decoding of polymorphic values (Types.Any) to/from
|
|
13
|
+
union structs. This is the "bitmap struct" pattern from cetolib.
|
|
14
|
+
|
|
15
|
+
THE PROBLEM:
|
|
16
|
+
────────────────────────────────────────────────────────────────────────────────
|
|
17
|
+
|
|
18
|
+
MongoDB "value" fields often contain mixed types:
|
|
19
|
+
doc1: {"value": 42.5} # float
|
|
20
|
+
doc2: {"value": 100} # int
|
|
21
|
+
doc3: {"value": "active"} # string
|
|
22
|
+
doc4: {"value": true} # bool
|
|
23
|
+
|
|
24
|
+
Parquet columns must be homogeneous (one type per column).
|
|
25
|
+
How do we store mixed types?
|
|
26
|
+
|
|
27
|
+
THE SOLUTION - UNION STRUCT:
|
|
28
|
+
────────────────────────────────────────────────────────────────────────────────
|
|
29
|
+
|
|
30
|
+
Store as a struct with ONE field populated, others null:
|
|
31
|
+
|
|
32
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
33
|
+
│ ENCODE: Python value -> Struct │
|
|
34
|
+
│ │
|
|
35
|
+
│ encode_any(42.5) returns: │
|
|
36
|
+
│ { │
|
|
37
|
+
│ "float_value": 42.5, ← VALUE IS HERE │
|
|
38
|
+
│ "int_value": null, │
|
|
39
|
+
│ "string_value": null, │
|
|
40
|
+
│ "bool_value": null, │
|
|
41
|
+
│ "datetime_value": null, │
|
|
42
|
+
│ "objectid_value": null, │
|
|
43
|
+
│ "null_value": null, │
|
|
44
|
+
│ } │
|
|
45
|
+
│ │
|
|
46
|
+
│ encode_any("active") returns: │
|
|
47
|
+
│ { │
|
|
48
|
+
│ "float_value": null, │
|
|
49
|
+
│ "int_value": null, │
|
|
50
|
+
│ "string_value": "active", ← VALUE IS HERE │
|
|
51
|
+
│ "bool_value": null, │
|
|
52
|
+
│ "datetime_value": null, │
|
|
53
|
+
│ "objectid_value": null, │
|
|
54
|
+
│ "null_value": null, │
|
|
55
|
+
│ } │
|
|
56
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
57
|
+
|
|
58
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
59
|
+
│ DECODE: Struct -> Python value │
|
|
60
|
+
│ │
|
|
61
|
+
│ decode_any({"float_value": 42.5, ...others null}) returns: 42.5 │
|
|
62
|
+
│ decode_any({"string_value": "active", ...others null}) returns: "active" │
|
|
63
|
+
│ │
|
|
64
|
+
│ Algorithm: Check each field in order, return first non-null │
|
|
65
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
66
|
+
|
|
67
|
+
TYPE MAPPING:
|
|
68
|
+
────────────────────────────────────────────────────────────────────────────────
|
|
69
|
+
|
|
70
|
+
Python type -> Struct field
|
|
71
|
+
──────────────────────────────────
|
|
72
|
+
None -> null_value: True
|
|
73
|
+
bool -> bool_value (CHECK BEFORE int!)
|
|
74
|
+
int -> int_value
|
|
75
|
+
float -> float_value
|
|
76
|
+
str -> string_value
|
|
77
|
+
datetime -> datetime_value
|
|
78
|
+
ObjectId -> objectid_value (as string)
|
|
79
|
+
other -> string_value (JSON serialized)
|
|
80
|
+
|
|
81
|
+
NOTE: bool must be checked BEFORE int because isinstance(True, int) is True!
|
|
82
|
+
|
|
83
|
+
================================================================================
|
|
84
|
+
"""
|
|
85
|
+
|
|
86
|
+
import json
|
|
87
|
+
from datetime import datetime
|
|
88
|
+
from typing import Any as AnyPython
|
|
89
|
+
from typing import Dict
|
|
90
|
+
|
|
91
|
+
from bson import ObjectId
|
|
92
|
+
|
|
93
|
+
__all__ = [
|
|
94
|
+
"ValueEncoder",
|
|
95
|
+
]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class ValueEncoder:
|
|
99
|
+
"""
|
|
100
|
+
Encodes and decodes values according to schema types.
|
|
101
|
+
|
|
102
|
+
For Types.Any fields, encodes Python values into union structs
|
|
103
|
+
where only one field is populated based on the value's type.
|
|
104
|
+
|
|
105
|
+
Example:
|
|
106
|
+
encoder = ValueEncoder()
|
|
107
|
+
|
|
108
|
+
# Encode different types
|
|
109
|
+
encoder.encode_any(42.5) # {"float_value": 42.5, ...others null}
|
|
110
|
+
encoder.encode_any("hello") # {"string_value": "hello", ...others null}
|
|
111
|
+
encoder.encode_any(True) # {"bool_value": True, ...others null}
|
|
112
|
+
|
|
113
|
+
# Decode back
|
|
114
|
+
struct = {"float_value": 42.5, "int_value": None, ...}
|
|
115
|
+
encoder.decode_any(struct) # Returns: 42.5
|
|
116
|
+
"""
|
|
117
|
+
|
|
118
|
+
@staticmethod
|
|
119
|
+
def encode_any(value: AnyPython) -> Dict[str, AnyPython]:
|
|
120
|
+
"""
|
|
121
|
+
Encode a polymorphic value to union struct.
|
|
122
|
+
|
|
123
|
+
Maps Python types to appropriate struct fields:
|
|
124
|
+
- None -> null_value: True
|
|
125
|
+
- bool -> bool_value
|
|
126
|
+
- int -> int_value
|
|
127
|
+
- float -> float_value
|
|
128
|
+
- str -> string_value
|
|
129
|
+
- datetime -> datetime_value
|
|
130
|
+
- ObjectId -> objectid_value (as string)
|
|
131
|
+
- other -> string_value (JSON serialized)
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
value: Python value to encode
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
Dict with one field populated, others None
|
|
138
|
+
"""
|
|
139
|
+
result: Dict[str, AnyPython] = {
|
|
140
|
+
"float_value": None,
|
|
141
|
+
"int_value": None,
|
|
142
|
+
"string_value": None,
|
|
143
|
+
"bool_value": None,
|
|
144
|
+
"datetime_value": None,
|
|
145
|
+
"objectid_value": None,
|
|
146
|
+
"null_value": None,
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
if value is None:
|
|
150
|
+
result["null_value"] = True
|
|
151
|
+
elif isinstance(value, bool):
|
|
152
|
+
# Check bool BEFORE int (bool is subclass of int in Python)
|
|
153
|
+
result["bool_value"] = value
|
|
154
|
+
elif isinstance(value, int):
|
|
155
|
+
result["int_value"] = value
|
|
156
|
+
elif isinstance(value, float):
|
|
157
|
+
result["float_value"] = value
|
|
158
|
+
elif isinstance(value, str):
|
|
159
|
+
result["string_value"] = value
|
|
160
|
+
elif isinstance(value, datetime):
|
|
161
|
+
result["datetime_value"] = value
|
|
162
|
+
elif isinstance(value, ObjectId):
|
|
163
|
+
result["objectid_value"] = str(value)
|
|
164
|
+
else:
|
|
165
|
+
# Fallback: JSON serialize complex types
|
|
166
|
+
try:
|
|
167
|
+
result["string_value"] = json.dumps(value, default=str)
|
|
168
|
+
except (TypeError, ValueError):
|
|
169
|
+
result["string_value"] = str(value)
|
|
170
|
+
|
|
171
|
+
return result
|
|
172
|
+
|
|
173
|
+
@staticmethod
|
|
174
|
+
def decode_any(struct_value: Dict[str, AnyPython]) -> AnyPython:
|
|
175
|
+
"""
|
|
176
|
+
Decode union struct back to Python value.
|
|
177
|
+
|
|
178
|
+
Checks fields in priority order and returns the first non-null value.
|
|
179
|
+
|
|
180
|
+
Args:
|
|
181
|
+
struct_value: Dict with union struct fields
|
|
182
|
+
|
|
183
|
+
Returns:
|
|
184
|
+
Decoded Python value
|
|
185
|
+
"""
|
|
186
|
+
if struct_value.get("null_value"):
|
|
187
|
+
return None
|
|
188
|
+
|
|
189
|
+
# Check in order of specificity
|
|
190
|
+
if struct_value.get("float_value") is not None:
|
|
191
|
+
return struct_value["float_value"]
|
|
192
|
+
|
|
193
|
+
if struct_value.get("int_value") is not None:
|
|
194
|
+
return struct_value["int_value"]
|
|
195
|
+
|
|
196
|
+
if struct_value.get("bool_value") is not None:
|
|
197
|
+
return struct_value["bool_value"]
|
|
198
|
+
|
|
199
|
+
if struct_value.get("datetime_value") is not None:
|
|
200
|
+
return struct_value["datetime_value"]
|
|
201
|
+
|
|
202
|
+
if struct_value.get("objectid_value") is not None:
|
|
203
|
+
return ObjectId(struct_value["objectid_value"])
|
|
204
|
+
|
|
205
|
+
if struct_value.get("string_value") is not None:
|
|
206
|
+
return struct_value["string_value"]
|
|
207
|
+
|
|
208
|
+
# All fields None (shouldn't happen with valid data)
|
|
209
|
+
return None
|
|
210
|
+
|
|
211
|
+
@staticmethod
|
|
212
|
+
def encode_batch(values: list) -> list:
|
|
213
|
+
"""
|
|
214
|
+
Encode a batch of values.
|
|
215
|
+
|
|
216
|
+
Args:
|
|
217
|
+
values: List of Python values
|
|
218
|
+
|
|
219
|
+
Returns:
|
|
220
|
+
List of encoded structs
|
|
221
|
+
"""
|
|
222
|
+
return [ValueEncoder.encode_any(v) for v in values]
|
|
223
|
+
|
|
224
|
+
@staticmethod
|
|
225
|
+
def decode_batch(struct_values: list) -> list:
|
|
226
|
+
"""
|
|
227
|
+
Decode a batch of struct values.
|
|
228
|
+
|
|
229
|
+
Args:
|
|
230
|
+
struct_values: List of union structs
|
|
231
|
+
|
|
232
|
+
Returns:
|
|
233
|
+
List of decoded Python values
|
|
234
|
+
"""
|
|
235
|
+
return [ValueEncoder.decode_any(s) for s in struct_values]
|
xlr8/schema/schema.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Schema definition for XLR8.
|
|
3
|
+
|
|
4
|
+
Schema describes the structure of MongoDB documents and how they map to Arrow/Parquet.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Dict, List
|
|
8
|
+
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
|
|
11
|
+
from .types import Any as AnyType
|
|
12
|
+
from .types import BaseType, DateTime, Timestamp
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Schema:
    """
    Describes the structure of MongoDB documents for XLR8 acceleration.

    A schema is what lets XLR8:
    - convert MongoDB documents into Arrow tables,
    - persist data compactly in Parquet,
    - rebuild DataFrames with the correct types.

    Example:
        ```python
        schema = Schema(
            time_field="timestamp",
            fields={
                "timestamp": Types.Timestamp("ns", tz="UTC"),
                "sensor_id": Types.String(),
                "value": Types.Float(),
                "metadata": Types.Any,  # Polymorphic
            }
        )
        ```
    """

    # Version tag written into to_spec() output.
    SPEC_VERSION = 1

    def __init__(
        self,
        time_field: str,
        fields: Dict[str, BaseType],
        avg_doc_size_bytes: int = 500,
        flatten_nested: bool = True,
    ):
        """
        Create a schema definition.

        Args:
            time_field: Name of the timestamp field (required for chunking)
            fields: Dict mapping field name to XLR8 type
            avg_doc_size_bytes: Average BSON document size in bytes (default: 500)
                Used for memory-aware batch sizing and execution planning
            flatten_nested: If True, flatten nested paths like "metadata.user_id"

        Raises:
            ValueError: If time_field not in fields or not Timestamp type
        """
        self.time_field = time_field
        self.fields = fields
        self.avg_doc_size_bytes = avg_doc_size_bytes
        self.flatten_nested = flatten_nested
        self._validate()

    def _validate(self):
        """Raise ValueError when the schema configuration is inconsistent."""
        if self.time_field not in self.fields:
            raise ValueError(
                f"time_field '{self.time_field}' must be present in fields. "
                f"Available fields: {list(self.fields.keys())}"
            )

        declared = self.fields[self.time_field]
        if not isinstance(declared, (Timestamp, DateTime)):
            raise ValueError(
                f"time_field '{self.time_field}' must be Timestamp or DateTime type, "
                f"got {type(declared).__name__}"
            )

    def to_arrow_schema(self) -> pa.Schema:
        """
        Convert to PyArrow schema.

        Returns:
            PyArrow schema with one (name, arrow type) column per field.
        """
        columns = []
        for name, field_type in self.fields.items():
            columns.append((name, field_type.to_arrow()))
        return pa.schema(columns)

    def get_any_fields(self) -> List[str]:
        """
        Get list of fields with Types.Any (polymorphic types).

        Returns:
            List of field names whose declared type is Any.
        """
        names: List[str] = []
        for name, field_type in self.fields.items():
            if isinstance(field_type, AnyType):
                names.append(name)
        return names

    def get_field_names(self) -> List[str]:
        """
        Get all field names in schema.

        Returns:
            List of field names, in declaration order.
        """
        return list(self.fields.keys())

    def has_field(self, name: str) -> bool:
        """
        Check if field exists in schema.

        Args:
            name: Field name to check

        Returns:
            True if the field is declared in this schema.
        """
        return name in self.fields

    def get_field_type(self, name: str) -> BaseType:
        """
        Get type for a field.

        Args:
            name: Field name

        Returns:
            XLR8 type object

        Raises:
            KeyError: If field not in schema
        """
        return self.fields[name]

    def to_spec(self) -> Dict[str, object]:
        """Export schema to a JSON-serializable specification.

        The resulting plain dict can be saved to disk, sent over the
        network, or consumed by native backends (e.g. Rust).  The format
        maps each declared field to a "kind" string plus any extra
        metadata the kind needs; unknown field types degrade to a
        json_blob-backed "any" entry rather than failing.

        NOTE(review): the docstring elsewhere mentions a from_spec()
        round-trip, but no such method is visible in this module —
        confirm where deserialization lives.

        Returns:
            Dict containing schema version, time field, avg doc size,
            and the per-field specifications.

        Raises:
            ValueError: For a List field whose element type is unsupported.
        """
        from . import types as Types  # local to avoid cycles

        field_entries: List[Dict[str, object]] = []
        for name, declared in self.fields.items():
            spec: Dict[str, object] = {"name": name}

            if isinstance(declared, (Types.Timestamp, Types.DateTime)):
                # DateTime serializes identically to Timestamp; it is
                # only a convenience wrapper defaulting to "ms".
                spec.update(
                    {
                        "kind": "timestamp",
                        "unit": getattr(declared, "unit", "ms"),
                        "tz": getattr(declared, "tz", "UTC") or "UTC",
                    }
                )
            elif isinstance(declared, Types.ObjectId):
                spec["kind"] = "objectid"
            elif isinstance(declared, Types.Any):
                # Preserve the Any() union/bitmap layout; the concrete
                # encoder decides how to materialize this.
                layout: Dict[str, object] = {
                    "variants": [
                        {"name": "int64", "id": 0},
                        {"name": "float64", "id": 1},
                        {"name": "bool", "id": 2},
                        {"name": "string", "id": 3},
                        {"name": "timestamp_ms_utc", "id": 4},
                        {"name": "json_blob", "id": 5},
                    ],
                }

                # Surface explicit bitmap/payload field naming when the
                # Any type declares it; otherwise the backend chooses
                # sensible defaults.
                for attr, key in (
                    ("bitmap_field_name", "bitmap_field"),
                    ("payload_field_name", "payload_field"),
                ):
                    override = getattr(declared, attr, None)
                    if override is not None:
                        layout[key] = override

                spec.update({"kind": "any", "any_layout": layout})
            elif isinstance(declared, Types.Int):
                spec["kind"] = "int64"
            elif isinstance(declared, Types.Float):
                spec["kind"] = "float64"
            elif isinstance(declared, Types.String):
                spec["kind"] = "string"
            elif isinstance(declared, Types.Bool):
                spec["kind"] = "bool"
            elif isinstance(declared, Types.List):
                # List type: serialize as "list:<element kind>".
                element = declared.element_type
                for elem_cls, elem_kind in (
                    (Types.Float, "float"),
                    (Types.Int, "int"),
                    (Types.String, "string"),
                    (Types.Bool, "bool"),
                    (Types.DateTime, "datetime"),
                    (Types.ObjectId, "objectid"),
                ):
                    if isinstance(element, elem_cls):
                        spec["kind"] = f"list:{elem_kind}"
                        break
                else:
                    # NOTE(review): Types.Timestamp elements are not
                    # accepted here even though top-level Timestamp is —
                    # confirm whether that asymmetry is intentional.
                    raise ValueError(
                        f"Unsupported List element type: {type(element).__name__}. "
                        f"Supported types: Float, Int, String, Bool, DateTime, ObjectId"
                    )
            else:
                # Conservative fallback: treat as json_blob-backed Any.
                spec.update(
                    {
                        "kind": "any",
                        "any_layout": {
                            "variants": [{"name": "json_blob", "id": 0}],
                        },
                    }
                )

            field_entries.append(spec)

        return {
            "version": self.SPEC_VERSION,
            "time_field": self.time_field,
            "avg_doc_size_bytes": self.avg_doc_size_bytes,
            "fields": field_entries,
        }

    def __repr__(self) -> str:
        body = "\n".join(
            f"  {name}: {field_type}" for name, field_type in self.fields.items()
        )
        return f"Schema(time_field='{self.time_field}',\n{body}\n)"
|