xlr8 0.1.7b2__cp313-cp313-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlr8/__init__.py +113 -0
- xlr8/_xlr8_rust.cpython-313-darwin.so +0 -0
- xlr8/_xlr8_rust.pyi +71 -0
- xlr8/analysis/__init__.py +58 -0
- xlr8/analysis/brackets.py +1201 -0
- xlr8/analysis/chunker.py +118 -0
- xlr8/analysis/inspector.py +1889 -0
- xlr8/collection/__init__.py +6 -0
- xlr8/collection/cursor.py +2155 -0
- xlr8/collection/cursor.pyi +104 -0
- xlr8/collection/wrapper.py +399 -0
- xlr8/collection/wrapper.pyi +61 -0
- xlr8/constants.py +24 -0
- xlr8/execution/__init__.py +43 -0
- xlr8/execution/callback.py +792 -0
- xlr8/execution/executor.py +500 -0
- xlr8/execution/planner.py +377 -0
- xlr8/py.typed +1 -0
- xlr8/rust_backend.py +40 -0
- xlr8/rust_backend.pyi +71 -0
- xlr8/schema/__init__.py +42 -0
- xlr8/schema/encoder.py +235 -0
- xlr8/schema/schema.py +265 -0
- xlr8/schema/types.py +239 -0
- xlr8/storage/__init__.py +17 -0
- xlr8/storage/cache.py +228 -0
- xlr8/storage/reader.py +1369 -0
- xlr8-0.1.7b2.dist-info/METADATA +176 -0
- xlr8-0.1.7b2.dist-info/RECORD +31 -0
- xlr8-0.1.7b2.dist-info/WHEEL +4 -0
- xlr8-0.1.7b2.dist-info/licenses/LICENSE +201 -0
xlr8/storage/reader.py
ADDED
@@ -0,0 +1,1369 @@
"""
Parquet file reader for cache-aware loading.

This module reads Parquet files written by the Rust backend and converts them
back into DataFrames with proper value decoding and type reconstruction.

DATA FLOW
=========

STEP 1: DISCOVER RUST-WRITTEN FILES
------------------------------------
The Rust backend (rust_backend.fetch_chunks_bson) writes Parquet files with
timestamp-based naming derived from actual document data:

    cache_dir/.cache/abc123def/
        ts_1704067200_1704070800_part_0000.parquet
        ts_1704070801_1704074400_part_0000.parquet
        ts_1704074401_1704078000_part_0000.parquet
        ts_1704078001_1704081600_part_0000.parquet
        ...

Filename format: ts_{min_sec}_{max_sec}_part_{counter:04}.parquet
    - min_sec: Unix timestamp (seconds) of earliest document in file
    - max_sec: Unix timestamp (seconds) of latest document in file
    - counter: Per-worker sequential counter (0000, 0001, 0002, ...)
      Only increments if the same worker writes multiple files with identical timestamps

How timestamps ensure uniqueness:
    - Each chunk/bracket targets different time ranges
    - Multiple workers process non-overlapping time ranges
    - Natural file separation by actual data timestamps
    - Counter only needed if a worker flushes multiple batches with identical ranges

Fallback format (no timestamps): part_{counter:04}.parquet
    Used when time_field is None or documents lack timestamps


STEP 2: READ & CONCATENATE
---------------------------
Pandas: Read all files sequentially, concatenate into a single DataFrame
Polars: Read all files in parallel (native multi-file support)

Both engines use PyArrow under the hood for efficient Parquet parsing.


STEP 3: DECODE TYPES.ANY STRUCT VALUES
---------------------------------------
Types.Any fields are encoded as Arrow structs by the Rust backend:

    Parquet stores:
        {
            "value": {
                "float_value": 42.5,
                "int_value": null,
                "string_value": null,
                "bool_value": null,
                ...
            }
        }

    After decoding (coalesce first non-null field):
        {"value": 42.5}

This decoding happens in Rust via decode_any_struct_arrow() for maximum
performance.


STEP 4: FLATTEN NESTED STRUCTS
-------------------------------
Convert nested struct columns to dotted field names:

    Before: {"metadata": {"device_id": "123...", "sensor_id": "456..."}}
    After:  {"metadata.device_id": "123...", "metadata.sensor_id": "456..."}


STEP 5: RECONSTRUCT OBJECTIDS
------------------------------
Convert string-encoded ObjectIds back to bson.ObjectId instances:

    "507f1f77bcf86cd799439011" -> ObjectId("507f1f77bcf86cd799439011")


OUTPUT: DataFrame (pandas or Polars, or streamed pyarrow.Table batches)
-----------------
       timestamp         metadata.device_id  value
    0  2024-01-15 12:00   64a1b2c3...         42.5
    1  2024-01-15 12:01   64a1b2c3...         43.1
    2  2024-01-15 12:02   64a1b2c3...         "active"
"""

import logging
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Generator, Iterator, List, Literal, Optional, Tuple, Union

import duckdb
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.parquet as pq
from bson import ObjectId

from xlr8.constants import DEFAULT_BATCH_SIZE

logger = logging.getLogger(__name__)


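# --- Illustrative helper (added for this review; not part of the published
# module). The module docstring describes the Rust writer's naming scheme,
# ts_{min_sec}_{max_sec}_part_{counter:04}.parquet, with part_{counter:04}.parquet
# as the timestamp-less fallback. This sketch shows one way such names could be
# parsed; the keys of the returned dict are our own choice, not an xlr8 API.
def _parse_cache_filename(name: str) -> Optional[Dict[str, int]]:
    """Parse a cache filename into its timestamp range and part counter."""
    import re

    match = re.match(r"^ts_(\d+)_(\d+)_part_(\d{4})\.parquet$", name)
    if match:
        return {
            "min_sec": int(match.group(1)),
            "max_sec": int(match.group(2)),
            "part": int(match.group(3)),
        }
    match = re.match(r"^part_(\d{4})\.parquet$", name)
    if match:
        # Fallback naming: documents carried no usable timestamps
        return {"part": int(match.group(1))}
    return None

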
|
|
109
|
+
def _convert_datetime_for_filter(dt: datetime, target_type: pa.DataType) -> datetime:
|
|
110
|
+
"""Convert datetime to match the target Arrow timestamp type.
|
|
111
|
+
|
|
112
|
+
Handles timezone-aware vs timezone-naive conversions:
|
|
113
|
+
- If target has timezone and input doesn't: assume UTC
|
|
114
|
+
- If target has no timezone and input does: strip timezone
|
|
115
|
+
- Matching types: return as-is
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
dt: Input datetime
|
|
119
|
+
target_type: PyArrow timestamp type from parquet schema
|
|
120
|
+
|
|
121
|
+
Returns:
|
|
122
|
+
datetime compatible with the target type
|
|
123
|
+
"""
|
|
124
|
+
if not isinstance(target_type, pa.TimestampType):
|
|
125
|
+
return dt
|
|
126
|
+
|
|
127
|
+
target_has_tz = target_type.tz is not None
|
|
128
|
+
input_has_tz = dt.tzinfo is not None
|
|
129
|
+
|
|
130
|
+
if target_has_tz and not input_has_tz:
|
|
131
|
+
# Target has tz, input doesn't - assume input is UTC
|
|
132
|
+
from datetime import timezone
|
|
133
|
+
|
|
134
|
+
return dt.replace(tzinfo=timezone.utc)
|
|
135
|
+
elif not target_has_tz and input_has_tz:
|
|
136
|
+
# Target has no tz, input does - strip timezone
|
|
137
|
+
return dt.replace(tzinfo=None)
|
|
138
|
+
else:
|
|
139
|
+
# Both match (both have tz or both don't)
|
|
140
|
+
return dt
|
|
141
|
+
|
|
142
|
+
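# Illustrative usage (added for this review; not part of the published module):
# when the Parquet column is tz-aware but the caller passed a naive datetime,
# the helper above attaches UTC so the comparison is well-defined.
def _demo_convert_datetime_for_filter() -> None:
    naive = datetime(2024, 1, 15, 12, 0, 0)
    utc_ms = pa.timestamp("ms", tz="UTC")
    aware = _convert_datetime_for_filter(naive, utc_ms)
    assert aware.tzinfo is not None  # naive input is interpreted as UTC
    # A non-timestamp target type passes the datetime through unchanged
    assert _convert_datetime_for_filter(naive, pa.int64()) is naive

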
class ParquetReader:
    """
    Reads Parquet files from cache directory.

    Provides streaming and batch reading of documents from Parquet files.
    Supports reading all files in a cache directory or specific partitions.

    Example:
        >>> reader = ParquetReader(cache_dir=".cache/abc123def")
        >>>
        >>> # Stream all documents
        >>> for doc in reader.iter_documents():
        ...     logging.debug(doc)
        >>>
        >>> # Or load to DataFrame
        >>> df = reader.to_dataframe()
    """

    def __init__(self, cache_dir: Union[str, Path]):
        """
        Initialize reader for cache directory.

        Args:
            cache_dir: Directory containing parquet files
        """
        self.cache_dir = Path(cache_dir)

        if not self.cache_dir.exists():
            raise FileNotFoundError(f"Cache directory not found: {cache_dir}")

        # Find all parquet files (may be empty if query returned no results)
        self.parquet_files = sorted(self.cache_dir.glob("*.parquet"))

    def iter_documents(
        self,
        batch_size: int = DEFAULT_BATCH_SIZE,
    ) -> Iterator[Dict[str, Any]]:
        """
        Stream documents from all parquet files.

        Reads in batches to avoid loading entire dataset into memory.

        Args:
            batch_size: Number of rows to read per batch

        Yields:
            Document dictionaries

        Example:
            >>> for doc in reader.iter_documents(batch_size=5000):
            ...     process(doc)
        """
        for parquet_file in self.parquet_files:
            # Read in batches
            parquet_file_obj = pq.ParquetFile(parquet_file)

            for batch in parquet_file_obj.iter_batches(batch_size=batch_size):
                # Convert Arrow batch to pandas then to dicts
                df_batch = batch.to_pandas()

                for _, row in df_batch.iterrows():
                    yield row.to_dict()

    def _is_any_type(self, field_type: Any) -> bool:
        """Check if field_type is an Any type (supports both class and instance)."""
        from xlr8.schema.types import Any as AnyType

        # Support both Types.Any (class) and Types.Any() (instance)
        if isinstance(field_type, AnyType):
            return True
        if isinstance(field_type, type) and issubclass(field_type, AnyType):
            return True
        return False

    def _decode_struct_values(self, df: pd.DataFrame, schema: Any) -> pd.DataFrame:
        """
        Decode struct-encoded Any-typed columns back to actual values.

        For columns marked as Any type in schema, extracts the actual value
        from the struct bitmap representation (float_value, int_value, etc.).

        Uses Rust Arrow-native decoding for maximum performance (~40x faster).

        Note: This is a fallback path. The fast path decodes directly from Arrow
        before to_pandas() conversion, avoiding dict overhead entirely.
        """
        if not hasattr(schema, "fields"):
            return df

        # Import Rust Arrow-native decoder (required)
        from xlr8.rust_backend import decode_any_struct_arrow

        # Find Any-typed fields in schema
        for field_name, field_type in schema.fields.items():
            if self._is_any_type(field_type) and field_name in df.columns:
                # Column contains struct-encoded values (dicts)
                col = df[field_name]

                if len(col) == 0:
                    continue

                # Check if it's a struct (dict) column - skip if already decoded
                first_val = col.iloc[0]
                if not isinstance(first_val, dict):
                    # Already decoded in fast path - skip
                    continue

                # Build struct type dynamically based on the dict keys
                sample_dict = first_val
                struct_fields = []
                field_type_map = {
                    "float_value": pa.float64(),
                    "int32_value": pa.int32(),
                    "int64_value": pa.int64(),
                    "string_value": pa.string(),
                    "objectid_value": pa.string(),
                    "decimal128_value": pa.string(),
                    "regex_value": pa.string(),
                    "binary_value": pa.string(),
                    "document_value": pa.string(),
                    "array_value": pa.string(),
                    "bool_value": pa.bool_(),
                    "datetime_value": pa.timestamp("ms"),  # Use ms for new schema
                    "null_value": pa.bool_(),
                }

                for key in sample_dict.keys():
                    if key in field_type_map:
                        struct_fields.append((key, field_type_map[key]))

                any_struct_type = pa.struct(struct_fields)

                # Convert to PyArrow array - this is a single pass over the data
                arrow_array = pa.array(col.tolist(), type=any_struct_type)

                # Decode in Rust - direct memory access to Arrow memory
                decoded_values = decode_any_struct_arrow(arrow_array)
                df[field_name] = decoded_values

        return df

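    # Illustrative note (added for this review; not part of the published
    # module). Conceptually, decoding collapses each Types.Any struct to its
    # single non-null member, as described in the module docstring:
    #
    #   {"float_value": 42.5, "int_value": None, "string_value": None, ...}  -> 42.5
    #   {"float_value": None, "string_value": "active", ...}                 -> "active"
    #
    # The real coalescing runs in Rust over Arrow memory rather than over
    # Python dicts, which is where the ~40x speedup quoted above comes from.
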
    def _flatten_struct_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Flatten nested struct columns into separate columns.

        Example:
            metadata: {'sensor_id': '...', 'device_id': '...'}
            -> metadata.sensor_id: '...', metadata.device_id: '...'
        """
        if df.empty:
            return df

        struct_cols = []
        for col in df.columns:
            # Check if column contains dicts (structs)
            if len(df) > 0 and isinstance(df[col].iloc[0], dict):
                struct_cols.append(col)

        for col in struct_cols:
            # FAST PATH: Extract struct fields directly using list comprehension
            # This is ~5x faster than pd.json_normalize() for large datasets
            col_values = df[col].tolist()

            # Detect subcolumns from first non-null row
            first_val = col_values[0] if col_values else {}
            subcolumns = list(first_val.keys()) if isinstance(first_val, dict) else []

            # Build new columns efficiently
            new_cols = {}
            for subcol in subcolumns:
                new_col_name = f"{col}.{subcol}"
                new_cols[new_col_name] = [
                    row.get(subcol) if isinstance(row, dict) else None
                    for row in col_values
                ]

            # Drop original struct column
            df = df.drop(columns=[col])

            # Add flattened columns
            for new_col_name, values in new_cols.items():
                df[new_col_name] = values

        return df

    def _reconstruct_objectids(self, df: pd.DataFrame, schema: Any) -> pd.DataFrame:
        """
        Reconstruct ObjectId columns from string representation.

        Converts string ObjectIds back to bson.ObjectId instances.
        """
        from xlr8.schema.types import ObjectId as ObjectIdType

        # Find all ObjectId fields in schema (including nested ones)
        objectid_fields = []

        if hasattr(schema, "fields"):
            for field_name, field_type in schema.fields.items():
                if isinstance(field_type, ObjectIdType):
                    objectid_fields.append(field_name)
                elif hasattr(field_type, "fields"):
                    # Nested struct with ObjectId fields
                    for nested_name, nested_type in field_type.fields.items():
                        if isinstance(nested_type, ObjectIdType):
                            objectid_fields.append(f"{field_name}.{nested_name}")

        # Convert string columns back to ObjectId
        for field in objectid_fields:
            if field in df.columns:
                df[field] = df[field].apply(
                    lambda x: ObjectId(x) if x and pd.notna(x) else x
                )

        return df

    def _decode_struct_values_polars(
        self,
        df: "pl.DataFrame",
        schema: Any,
        any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
    ) -> "pl.DataFrame":
        """
        Decode struct-encoded Any-typed columns back to actual values (Polars).

        Args:
            df: Polars DataFrame
            schema: Schema with field type info
            any_type_strategy: How to decode:
                - "float": Coalesce to Float64, prioritize numeric (default)
                - "string": Convert everything to string (lossless)
                - "keep_struct": Keep raw struct, don't decode
        """
        if not hasattr(schema, "fields"):
            return df

        # Find Any-typed fields in schema
        for field_name, field_type in schema.fields.items():
            if self._is_any_type(field_type) and field_name in df.columns:
                # Check if column is a struct
                col_dtype = df.schema[field_name]
                if str(col_dtype).startswith("Struct"):
                    # Strategy: keep_struct - don't decode at all
                    if any_type_strategy == "keep_struct":
                        continue

                    try:
                        # Get field names from the struct
                        struct_fields = (
                            col_dtype.fields if hasattr(col_dtype, "fields") else []
                        )  # type: ignore[attr-defined]
                        field_names = (
                            [f.name for f in struct_fields] if struct_fields else []
                        )

                        if any_type_strategy == "string":
                            # Convert ALL value types to string
                            coalesce_exprs = []

                            # String first (already string)
                            if "string_value" in field_names:
                                coalesce_exprs.append(
                                    pl.col(field_name).struct.field("string_value")
                                )

                            # Float to string
                            if "float_value" in field_names:
                                coalesce_exprs.append(
                                    pl.col(field_name)
                                    .struct.field("float_value")
                                    .cast(pl.Utf8)
                                )

                            # Int to string
                            for int_name in ["int64_value", "int32_value"]:
                                if int_name in field_names:
                                    coalesce_exprs.append(
                                        pl.col(field_name)
                                        .struct.field(int_name)
                                        .cast(pl.Utf8)
                                    )

                            # Bool to string
                            if "bool_value" in field_names:
                                coalesce_exprs.append(
                                    pl.col(field_name)
                                    .struct.field("bool_value")
                                    .cast(pl.Utf8)
                                )

                            # ObjectId, decimal, etc. (already strings)
                            for str_field in [
                                "objectid_value",
                                "decimal128_value",
                                "regex_value",
                                "binary_value",
                                "document_value",
                                "array_value",
                            ]:
                                if str_field in field_names:
                                    coalesce_exprs.append(
                                        pl.col(field_name).struct.field(str_field)
                                    )

                            if coalesce_exprs:
                                df = df.with_columns(
                                    pl.coalesce(coalesce_exprs).alias(field_name)
                                )

                        else:  # "float" strategy (default)
                            # Coalesce to Float64, prioritize numeric
                            coalesce_exprs = []

                            # Try float first (highest precision)
                            if "float_value" in field_names:
                                coalesce_exprs.append(
                                    pl.col(field_name).struct.field("float_value")
                                )

                            # Try various int types, cast to float
                            for int_name in ["int64_value", "int32_value"]:
                                if int_name in field_names:
                                    coalesce_exprs.append(
                                        pl.col(field_name)
                                        .struct.field(int_name)
                                        .cast(pl.Float64)
                                    )

                            # Try bool (as 0.0/1.0)
                            if "bool_value" in field_names:
                                coalesce_exprs.append(
                                    pl.col(field_name)
                                    .struct.field("bool_value")
                                    .cast(pl.Float64)
                                )

                            if coalesce_exprs:
                                if len(coalesce_exprs) == 1:
                                    df = df.with_columns(
                                        coalesce_exprs[0].alias(field_name)
                                    )
                                else:
                                    df = df.with_columns(
                                        pl.coalesce(coalesce_exprs).alias(field_name)
                                    )
                            else:
                                logger.warning(
                                    "Could not decode struct column '%s': "
                                    "no numeric fields in %s",
                                    field_name,
                                    field_names,
                                )
                    except (AttributeError, KeyError, ValueError) as e:
                        logger.warning("Error decoding struct '%s': %s", field_name, e)

        return df

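    # Illustrative sketch (added for this review; not part of the published
    # module), assuming a toy Any-typed struct column named "value": under the
    # default "float" strategy the struct coalesces to Float64, "string" keeps
    # a lossless Utf8 rendering, and "keep_struct" leaves the raw struct as-is.
    # Roughly, the "float" path reduces to:
    #
    #   df = pl.DataFrame({"value": [
    #       {"float_value": 42.5, "int64_value": None, "bool_value": None},
    #       {"float_value": None, "int64_value": 7, "bool_value": None},
    #   ]})
    #   df.with_columns(
    #       pl.coalesce(
    #           pl.col("value").struct.field("float_value"),
    #           pl.col("value").struct.field("int64_value").cast(pl.Float64),
    #       ).alias("value")
    #   )   # -> value column becomes [42.5, 7.0]
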
    def _process_dataframe(
        self,
        df: Union[pd.DataFrame, "pl.DataFrame"],
        engine: Literal["pandas", "polars"],
        schema: Optional[Any] = None,
        coerce: Literal["raise", "error"] = "raise",
        any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
    ) -> Union[pd.DataFrame, "pl.DataFrame"]:
        """
        Process DataFrame: decode struct values, flatten structs and
        reconstruct ObjectIds.

        Args:
            df: DataFrame to process
            engine: "pandas" or "polars"
            schema: Schema for ObjectId reconstruction
            coerce: Error handling mode ("raise" or "error")
            any_type_strategy: How to decode Any() structs in Polars
                (float/string/keep_struct)

        Returns:
            Processed DataFrame
        """
        if engine == "pandas":
            # First, decode Any-typed struct columns back to actual values
            if schema is not None:
                try:
                    df = self._decode_struct_values(df, schema)  # type: ignore[arg-type]
                except (AttributeError, KeyError, ValueError, TypeError) as e:
                    if coerce == "error":
                        logger.error("Error decoding struct values: %s", e)
                    else:
                        raise

            # Flatten struct columns (e.g., metadata -> metadata.sensor_id)
            df = self._flatten_struct_columns(df)  # type: ignore[arg-type]

            # Reconstruct ObjectIds from strings
            if schema is not None:
                try:
                    df = self._reconstruct_objectids(df, schema)
                except (AttributeError, KeyError, ValueError, TypeError) as e:
                    if coerce == "error":
                        logger.error("Error reconstructing ObjectIds: %s", e)
                    else:
                        raise

            return df
        elif engine == "polars":
            # Polars: decode Any-typed struct columns and keep dotted column names
            if schema is not None:
                try:
                    df = self._decode_struct_values_polars(
                        df, schema, any_type_strategy
                    )  # type: ignore[arg-type]
                except (AttributeError, KeyError, ValueError, TypeError) as e:
                    if coerce == "error":
                        logger.error("Error decoding struct values (polars): %s", e)
                    else:
                        raise
            return df

    def to_dataframe(
        self,
        engine: str = "pandas",
        schema: Optional[Any] = None,
        time_field: Optional[str] = None,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        coerce: Literal["raise", "error"] = "raise",
        any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
    ) -> Union[pd.DataFrame, "pl.DataFrame"]:
        """
        Load all parquet files into a DataFrame.

        Args:
            engine: "pandas" or "polars"
            schema: Schema for ObjectId reconstruction and struct flattening (required)
            time_field: Name of time field for date filtering (from schema.time_field)
            start_date: Filter data from this date (inclusive, tz-aware datetime)
            end_date: Filter data until this date (exclusive, tz-aware datetime)
            coerce: Error handling mode:
                - "raise": Raise exceptions on schema validation errors (default)
                - "error": Log errors and store None for invalid values
            any_type_strategy: How to decode Types.Any() struct columns in Polars:
                - "float": Coalesce to Float64, prioritize numeric (default)
                - "string": Convert everything to string (lossless)
                - "keep_struct": Keep raw struct, don't decode

        Returns:
            DataFrame with all documents (structs flattened, ObjectIds reconstructed)

        Example:
            >>> df = reader.to_dataframe(
            ...     schema=schema,
            ...     time_field="timestamp",
            ...     start_date=datetime(2024, 6, 1, tzinfo=timezone.utc),
            ...     end_date=datetime(2024, 6, 15, tzinfo=timezone.utc),
            ... )
        """
        # Build PyArrow filter for date range (predicate pushdown)
        # We'll determine the correct timestamp type from the first parquet file
        filters = None
        if time_field and (start_date or end_date) and self.parquet_files:
            # Get the timestamp type from the parquet schema
            first_file_schema = pq.read_schema(self.parquet_files[0])
            field_idx = first_file_schema.get_field_index(time_field)
            if field_idx >= 0:
                ts_type = first_file_schema.field(field_idx).type
            else:
                # Fallback to ms if field not found
                ts_type = pa.timestamp("ms")

            filter_conditions = []
            if start_date:
                # Convert datetime to match parquet column type
                start_ts = pa.scalar(start_date, type=ts_type)
                filter_conditions.append((time_field, ">=", start_ts))
            if end_date:
                end_ts = pa.scalar(end_date, type=ts_type)
                filter_conditions.append((time_field, "<", end_ts))
            if filter_conditions:
                filters = filter_conditions

        if engine == "polars":
            # Return empty DataFrame if no parquet files (query returned no results)
            if not self.parquet_files:
                return pl.DataFrame()

            # Use scan_parquet for lazy evaluation with predicate pushdown
            # This only reads the row groups that match the filter conditions
            lf = pl.scan_parquet(self.parquet_files)

            # Apply date filter with predicate pushdown (reads only matching data)
            # Convert datetime to match Parquet column dtype (tz-aware or naive)
            if time_field and (start_date or end_date):
                # Get timestamp type from parquet to handle tz correctly
                first_file_schema = pq.read_schema(self.parquet_files[0])
                field_idx = first_file_schema.get_field_index(time_field)
                ts_type = (
                    first_file_schema.field(field_idx).type
                    if field_idx >= 0
                    else pa.timestamp("ms")
                )

                if start_date:
                    start_converted = _convert_datetime_for_filter(start_date, ts_type)
                    lf = lf.filter(pl.col(time_field) >= start_converted)
                if end_date:
                    end_converted = _convert_datetime_for_filter(end_date, ts_type)
                    lf = lf.filter(pl.col(time_field) < end_converted)

            # Collect executes the query with predicate pushdown
            df = lf.collect()

            return self._process_dataframe(
                df, engine, schema, coerce, any_type_strategy
            )

        elif engine == "pandas":
            # Return empty DataFrame if no parquet files (query returned no results)
            if not self.parquet_files:
                return pd.DataFrame()

            # Read all files with optional filter (predicate pushdown)
            # Use PyArrow to read, then convert to pandas - this allows
            # struct columns to stay in Arrow format for fast Rust decoding
            tables = []
            for parquet_file in self.parquet_files:
                try:
                    # Use PyArrow filters for efficient predicate pushdown
                    table = pq.read_table(parquet_file, filters=filters)
                    tables.append(table)
                except Exception as e:
                    if coerce == "error":
                        logger.error(f"Error reading {parquet_file}: {e}")
                        continue
                    raise

            if not tables:
                return pd.DataFrame()

            # Concatenate Arrow tables
            combined_table = pa.concat_tables(tables)

            # FAST PATH: Decode Any-typed struct columns directly in Arrow
            # This gives us 44x speedup because Rust reads Arrow memory directly
            # without Python iteration over dicts
            any_columns_decoded = {}
            columns_to_drop = []
            if schema and hasattr(schema, "fields"):
                from xlr8.rust_backend import decode_any_struct_arrow

                for field_name, field_type in schema.fields.items():
                    if (
                        self._is_any_type(field_type)
                        and field_name in combined_table.column_names
                    ):
                        col = combined_table.column(field_name)
                        if pa.types.is_struct(col.type):
                            # Decode in Rust - returns Python list of mixed types
                            combined = col.combine_chunks()
                            decoded_values = decode_any_struct_arrow(combined)
                            any_columns_decoded[field_name] = decoded_values
                            # Mark for removal to avoid slow dict conversion
                            # in to_pandas()
                            columns_to_drop.append(field_name)

            # Drop decoded struct columns before pandas conversion
            # to avoid dict overhead
            if columns_to_drop:
                combined_table = combined_table.drop(columns_to_drop)

            # Convert to pandas (non-Any columns go through normal path)
            df = combined_table.to_pandas()

            # Add back Any columns with decoded values
            # (bypassing struct->dict->decode path)
            for field_name, decoded_values in any_columns_decoded.items():
                df[field_name] = decoded_values

            return self._process_dataframe(df, engine, schema, coerce)

        else:
            raise ValueError(f"Unknown engine: {engine}. Use 'pandas' or 'polars'")

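    # Illustrative note (added for this review; not part of the published
    # module): the pandas path above expresses the date window as PyArrow
    # DNF-style filters so that row groups outside the range are skipped, e.g.
    #
    #   filters = [
    #       ("timestamp", ">=", pa.scalar(start_date, type=ts_type)),
    #       ("timestamp", "<", pa.scalar(end_date, type=ts_type)),
    #   ]
    #   pq.read_table(path, filters=filters)
    #
    # while the Polars path gets the same effect from scan_parquet() plus
    # .filter() predicates that are pushed down before collect().
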
    def iter_dataframe_batches(
        self,
        batch_size: int = 10000,
        schema: Optional[Any] = None,
        time_field: Optional[str] = None,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        coerce: Literal["raise", "error"] = "raise",
    ) -> Generator[pd.DataFrame, None, None]:
        """
        Yield DataFrames in batches without loading all data into memory.

        This is memory-efficient: only batch_size rows are in memory at a time.
        Uses PyArrow's batch iteration for efficient streaming.

        Use this when NO sorting is needed. For sorted batches, use
        iter_globally_sorted_batches().

        Args:
            batch_size: Number of rows per batch (default: 10,000)
            schema: Schema for struct decoding and ObjectId reconstruction
            time_field: Name of time field for date filtering
            start_date: Filter data from this date (inclusive, tz-aware)
            end_date: Filter data until this date (exclusive, tz-aware)
            coerce: Error handling mode ("raise" or "error")

        Yields:
            pd.DataFrame: Batches of processed rows

        Example:
            >>> for batch_df in reader.iter_dataframe_batches(batch_size=5000):
            ...     process(batch_df)
        """
        import pyarrow.parquet as pq

        batch_count = 0
        total_rows = 0

        # Pre-compute converted datetimes for filtering (tz-aware or naive)
        start_converted = None
        end_converted = None
        if time_field and (start_date or end_date) and self.parquet_files:
            first_file_schema = pq.read_schema(self.parquet_files[0])
            field_idx = first_file_schema.get_field_index(time_field)
            ts_type = (
                first_file_schema.field(field_idx).type
                if field_idx >= 0
                else pa.timestamp("ms")
            )
            if start_date:
                start_converted = _convert_datetime_for_filter(start_date, ts_type)
            if end_date:
                end_converted = _convert_datetime_for_filter(end_date, ts_type)

        for parquet_file in self.parquet_files:
            try:
                # Open parquet file for batch iteration
                parquet_file_obj = pq.ParquetFile(parquet_file)

                for batch in parquet_file_obj.iter_batches(batch_size=batch_size):
                    # Convert Arrow batch to pandas
                    batch_df = batch.to_pandas()

                    # Apply date filter if specified
                    if time_field and (start_converted or end_converted):
                        if time_field in batch_df.columns:
                            if start_converted:
                                batch_df = batch_df[
                                    batch_df[time_field] >= start_converted
                                ]
                            if end_converted:
                                batch_df = batch_df[
                                    batch_df[time_field] < end_converted
                                ]

                    if len(batch_df) == 0:
                        continue

                    # Process the batch (decode structs, flatten, reconstruct ObjectIds)
                    processed_df = self._process_dataframe(
                        batch_df, "pandas", schema, coerce
                    )

                    batch_count += 1
                    total_rows += len(processed_df)

                    yield processed_df

            except Exception as e:
                if coerce == "error":
                    logger.error(f"Error reading batch from {parquet_file}: {e}")
                    continue
                raise

        logger.debug(f"Yielded {batch_count} batches, {total_rows} total rows")

    def get_globally_sorted_dataframe(
        self,
        sort_spec: List[Tuple[str, int]],
        schema: Optional[Any] = None,
        time_field: Optional[str] = None,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        coerce: Literal["raise", "error"] = "raise",
        memory_limit_mb: Optional[int] = None,
        threads: Optional[int] = None,
    ) -> pd.DataFrame:
        """
        Return entire globally sorted DataFrame using DuckDB K-way merge.

        More efficient than iter_globally_sorted_batches() when you want
        the full result, as it avoids batch iteration overhead and just
        fetches all rows at once. Use iter_globally_sorted_batches() when
        streaming is required.

        Args:
            sort_spec: Sort specification as [(field, direction), ...]
            schema: Schema for ObjectId reconstruction and advanced sorting
            time_field: Field for date filtering
            start_date: Filter data from this date (inclusive, tz-aware)
            end_date: Filter data until this date (exclusive, tz-aware)
            coerce: Error handling mode
            memory_limit_mb: DuckDB memory limit
            threads: DuckDB thread count

        Returns:
            pd.DataFrame: Complete sorted DataFrame
        """
        if not self.parquet_files:
            return pd.DataFrame()

        # Expand parent fields to children in schema definition order
        sort_spec = self._expand_parent_sort_fields(sort_spec, schema)

        # Get list of parquet files
        file_paths = [str(f) for f in self.parquet_files]

        logger.debug(
            f"DuckDB K-way merge (full): {len(file_paths)} files, sort_spec={sort_spec}"
        )

        try:
            # Create DuckDB connection
            conn = duckdb.connect(":memory:")

            # Configure DuckDB to use allocated resources
            if memory_limit_mb:
                conn.execute(f"SET memory_limit = '{memory_limit_mb}MB'")
                logger.info(f"DuckDB memory_limit set to {memory_limit_mb} MB")

            if threads:
                conn.execute(f"SET threads = {threads}")
                logger.info(f"DuckDB threads set to {threads}")

            # Build ORDER BY with MongoDB type ordering
            # (same logic as iter_globally_sorted_batches)
            order_clauses = []
            for field_name, direction in sort_spec:
                dir_sql = "ASC" if direction == 1 else "DESC"
                if schema and schema.has_field(field_name):
                    field_type = schema.get_field_type(field_name)
                else:
                    field_type = None
                is_any = self._is_any_type(field_type) if field_type else True

                if is_any:
                    # Complete MongoDB type ordering for Any() fields
                    type_clause = f"""CASE
                        WHEN "{field_name}" IS NULL OR "{field_name}".null_value IS TRUE
                            THEN 0
                        WHEN "{field_name}".float_value IS NOT NULL
                            OR "{field_name}".int32_value IS NOT NULL
                            OR "{field_name}".int64_value IS NOT NULL
                            OR "{field_name}".decimal128_value IS NOT NULL
                            THEN 1
                        WHEN "{field_name}".string_value IS NOT NULL THEN 2
                        WHEN "{field_name}".document_value IS NOT NULL THEN 3
                        WHEN "{field_name}".array_value IS NOT NULL THEN 4
                        WHEN "{field_name}".binary_value IS NOT NULL THEN 5
                        WHEN "{field_name}".objectid_value IS NOT NULL THEN 6
                        WHEN "{field_name}".bool_value IS NOT NULL THEN 7
                        WHEN "{field_name}".datetime_value IS NOT NULL THEN 8
                        WHEN "{field_name}".regex_value IS NOT NULL THEN 9
                        ELSE 10
                    END {dir_sql}"""

                    # Value comparisons for each type
                    num_clause = (
                        f'COALESCE("{field_name}".float_value, '
                        f'CAST("{field_name}".int32_value AS DOUBLE), '
                        f'CAST("{field_name}".int64_value AS DOUBLE)) {dir_sql}'
                    )
                    str_clause = f'"{field_name}".string_value {dir_sql}'
                    doc_clause = f'"{field_name}".document_value {dir_sql}'
                    arr_clause = f'"{field_name}".array_value {dir_sql}'
                    bin_clause = f'"{field_name}".binary_value {dir_sql}'
                    oid_clause = f'"{field_name}".objectid_value {dir_sql}'
                    bool_clause = f'"{field_name}".bool_value {dir_sql}'
                    date_clause = f'"{field_name}".datetime_value {dir_sql}'
                    regex_clause = f'"{field_name}".regex_value {dir_sql}'

                    order_clauses.extend(
                        [
                            type_clause,
                            num_clause,
                            str_clause,
                            doc_clause,
                            arr_clause,
                            bin_clause,
                            oid_clause,
                            bool_clause,
                            date_clause,
                            regex_clause,
                        ]
                    )
                else:
                    # Simple field - use direct comparison
                    order_clauses.append(f'"{field_name}" {dir_sql}')

            order_by = ", ".join(order_clauses)
            files = ", ".join([f"'{f}'" for f in file_paths])
            query = f"SELECT * FROM read_parquet([{files}]) ORDER BY {order_by}"

            logging.debug(f"[DuckDB] K-way merge (full): {len(file_paths)} files")

            # Fetch entire result at once using df()
            df = conn.execute(query).df()

            # Ensure time field is UTC
            if time_field and time_field in df.columns:
                if pd.api.types.is_datetime64_any_dtype(df[time_field]):
                    if df[time_field].dt.tz is not None:
                        df[time_field] = df[time_field].dt.tz_convert("UTC")
                    else:
                        df[time_field] = df[time_field].dt.tz_localize("UTC")

            # Apply date filtering if needed
            # Convert datetimes to match the column's timezone state
            if time_field and (start_date or end_date) and time_field in df.columns:
                # After the above, time_field is always tz-aware (UTC)
                # So we need tz-aware comparisons
                from datetime import timezone

                if start_date:
                    start_cmp = (
                        start_date
                        if start_date.tzinfo
                        else start_date.replace(tzinfo=timezone.utc)
                    )
                    df = df[df[time_field] >= start_cmp]
                if end_date:
                    end_cmp = (
                        end_date
                        if end_date.tzinfo
                        else end_date.replace(tzinfo=timezone.utc)
                    )
                    df = df[df[time_field] < end_cmp]

            # Process the DataFrame (decode structs, reconstruct ObjectIds)
            df = self._process_dataframe(df, "pandas", schema, coerce)

            conn.close()
            logging.debug(f"[DuckDB] K-way merge complete: {len(df):,} rows")
            logger.debug(f"DuckDB K-way merge complete: {len(df):,} rows")

            return df

        except Exception as e:
            logger.error(f"DuckDB K-way merge failed: {e}")
            raise

    def _expand_parent_sort_fields(
        self, sort_spec: List[Tuple[str, int]], schema: Optional[Any]
    ) -> List[Tuple[str, int]]:
        """
        Expand parent field sorts to their child fields in schema definition order.

        When user sorts by a parent field like "metadata" but the schema has
        flattened fields like "metadata.device_id", expand to all children.

        Args:
            sort_spec: Original [(field, direction), ...]
            schema: XLR8 schema with field definitions

        Returns:
            Expanded sort spec with parent fields replaced by children

        Raises:
            ValueError: If field not found and no children exist
        """
        if schema is None:
            return sort_spec

        expanded = []
        # Schema.fields preserves insertion order (Python 3.7+)
        all_fields = list(schema.fields.keys())

        for field_name, direction in sort_spec:
            if schema.has_field(field_name):
                # Field exists directly in schema
                expanded.append((field_name, direction))
            else:
                # Look for child fields with this prefix (in schema order)
                prefix = f"{field_name}."
                children = [f for f in all_fields if f.startswith(prefix)]

                if children:
                    logger.info(
                        f"Sort field '{field_name}' expanded to children "
                        f"(schema order): {children}"
                    )
                    for child in children:
                        expanded.append((child, direction))
                else:
                    raise ValueError(
                        f"Sort field '{field_name}' not found in schema "
                        f"and has no child fields. "
                        f"Available fields: {sorted(all_fields)[:10]}"
                        + ("..." if len(all_fields) > 10 else "")
                    )

        return expanded

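    # Illustrative note (added for this review; not part of the published
    # module): with a schema whose flattened fields are, say,
    # ["timestamp", "metadata.device_id", "metadata.sensor_id", "value"],
    # a parent-field sort expands in schema definition order:
    #
    #   _expand_parent_sort_fields([("metadata", -1)], schema)
    #   -> [("metadata.device_id", -1), ("metadata.sensor_id", -1)]
    #
    # whereas a field that neither exists nor has children raises ValueError.
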
    def iter_globally_sorted_batches(
        self,
        sort_field: Optional[str] = None,
        ascending: bool = True,
        batch_size: int = DEFAULT_BATCH_SIZE,
        schema: Optional[Any] = None,
        time_field: Optional[str] = None,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        coerce: Literal["raise", "error"] = "raise",
        sort_spec: Optional[List[Tuple[str, int]]] = None,
        # DuckDB configuration
        memory_limit_mb: Optional[int] = None,
        threads: Optional[int] = None,
    ) -> Generator[pd.DataFrame, None, None]:
        """
        Yield globally sorted batches using DuckDB K-way merge.

        This method reads all Parquet files in the cache directory and
        yields batches in globally sorted order. Uses DuckDB's K-way merge
        with MongoDB BSON type ordering to match MongoDB sort semantics.

        Supports advanced sorting:
        - Parent fields (e.g., "metadata" expands to all child fields)
        - Types.Any() with full MongoDB BSON type ordering (Objects, Arrays, Binary)

        RAM Usage:
            O(K × batch_size) where K = number of files.
            Already handled by flush_ram_limit_mb.

        Args:
            sort_field: Field to sort by (use sort_spec for multi-field sorting).
            ascending: Sort direction (use sort_spec for mixed directions).
            batch_size: Number of rows per yielded DataFrame (default: 10,000)
            schema: Schema for ObjectId reconstruction and advanced sorting
            time_field: Field for date filtering (usually same as sort_field)
            start_date: Filter data from this date (inclusive, tz-aware)
            end_date: Filter data until this date (exclusive, tz-aware)
            coerce: Error handling mode ("raise" or "error")
            sort_spec: Sort specification as [(field, direction), ...] where
                direction is 1 (ASC) or -1 (DESC). Preferred over sort_field.

        Yields:
            pd.DataFrame: Batches in globally sorted order

        Example:
            >>> reader = ParquetReader(".cache/abc123def")
            >>> # Simple sort
            >>> for batch in reader.iter_globally_sorted_batches(
            ...     sort_spec=[("timestamp", 1)],
            ...     schema=schema,
            ...     batch_size=10_000
            ... ):
            ...     process(batch)
            >>>
            >>> # Advanced: parent field + Any type
            >>> for batch in reader.iter_globally_sorted_batches(
            ...     sort_spec=[("metadata", -1), ("value", 1)],
            ...     schema=schema,
            ... ):
            ...     process(batch)
        """

        if not self.parquet_files:
            return

        # Handle backwards compatibility
        if sort_spec is None and sort_field is not None:
            direction = 1 if ascending else -1
            sort_spec = [(sort_field, direction)]

        if sort_spec is None:
            raise ValueError("sort_spec or sort_field is required")

        # Expand parent fields to children in schema definition order
        sort_spec = self._expand_parent_sort_fields(sort_spec, schema)

        # Get list of parquet files
        file_paths = [str(f) for f in self.parquet_files]

        logger.debug(
            f"DuckDB K-way merge: {len(file_paths)} files, sort_spec={sort_spec}"
        )

        try:
            # Create DuckDB connection
            conn = duckdb.connect(":memory:")

            # Configure DuckDB to use allocated resources
            if memory_limit_mb:
                conn.execute(f"SET memory_limit = '{memory_limit_mb}MB'")
                logger.info(f"DuckDB memory_limit set to {memory_limit_mb} MB")

            if threads:
                conn.execute(f"SET threads = {threads}")
                logger.info(f"DuckDB threads set to {threads}")

            # Query DuckDB settings to verify
            memory_result = conn.execute(
                "SELECT current_setting('memory_limit')"
            ).fetchone()
            actual_memory = memory_result[0] if memory_result else "unknown"
            threads_result = conn.execute(
                "SELECT current_setting('threads')"
            ).fetchone()
            actual_threads = threads_result[0] if threads_result else "unknown"
            logger.debug(
                f"DuckDB configured: memory={actual_memory}, threads={actual_threads}"
            )

            # Build ORDER BY with MongoDB type ordering
            order_clauses = []
            for field_name, direction in sort_spec:
                dir_sql = "ASC" if direction == 1 else "DESC"
                # Check if field exists in schema before getting type
                if schema and schema.has_field(field_name):
                    field_type = schema.get_field_type(field_name)
                else:
                    field_type = None
                is_any = self._is_any_type(field_type) if field_type else True

                if is_any:
                    # Complete MongoDB type ordering for Any() fields:
                    # Reference: https://www.mongodb.com/docs/manual/reference/bson-type-comparison-order/
                    # 1. MinKey (internal)
                    # 2. Null
                    # 3. Numbers (int, long, double, decimal)
                    # 4. Symbol, String
                    # 5. Object
                    # 6. Array
                    # 7. BinData
                    # 8. ObjectId
                    # 9. Boolean
                    # 10. Date
                    # 11. Timestamp
                    # 12. Regular Expression
                    # 13. MaxKey (internal)

                    # Type priority clause
                    type_clause = f"""CASE
                        WHEN "{field_name}" IS NULL OR "{field_name}".null_value IS TRUE
                            THEN 0
                        WHEN "{field_name}".float_value IS NOT NULL
                            OR "{field_name}".int32_value IS NOT NULL
                            OR "{field_name}".int64_value IS NOT NULL
                            OR "{field_name}".decimal128_value IS NOT NULL
                            THEN 1
                        WHEN "{field_name}".string_value IS NOT NULL THEN 2
                        WHEN "{field_name}".document_value IS NOT NULL THEN 3
                        WHEN "{field_name}".array_value IS NOT NULL THEN 4
                        WHEN "{field_name}".binary_value IS NOT NULL THEN 5
                        WHEN "{field_name}".objectid_value IS NOT NULL THEN 6
                        WHEN "{field_name}".bool_value IS NOT NULL THEN 7
                        WHEN "{field_name}".datetime_value IS NOT NULL THEN 8
                        WHEN "{field_name}".regex_value IS NOT NULL THEN 9
                        ELSE 10
                    END {dir_sql}"""

                    # Value comparisons for each type
                    num_clause = (
                        f'COALESCE("{field_name}".float_value, '
                        f'CAST("{field_name}".int32_value AS DOUBLE), '
                        f'CAST("{field_name}".int64_value AS DOUBLE)) {dir_sql}'
                    )
                    str_clause = f'"{field_name}".string_value {dir_sql}'
                    # JSON strings compare lexicographically
                    doc_clause = f'"{field_name}".document_value {dir_sql}'
                    # JSON arrays compare lexicographically
                    arr_clause = f'"{field_name}".array_value {dir_sql}'
                    bin_clause = f'"{field_name}".binary_value {dir_sql}'
                    oid_clause = f'"{field_name}".objectid_value {dir_sql}'
                    bool_clause = f'"{field_name}".bool_value {dir_sql}'
                    date_clause = f'"{field_name}".datetime_value {dir_sql}'
                    regex_clause = f'"{field_name}".regex_value {dir_sql}'

                    order_clauses.extend(
                        [
                            type_clause,
                            num_clause,
                            str_clause,
                            doc_clause,
                            arr_clause,
                            bin_clause,
                            oid_clause,
                            bool_clause,
                            date_clause,
                            regex_clause,
                        ]
                    )
                else:
                    # Simple field - use direct comparison
                    order_clauses.append(f'"{field_name}" {dir_sql}')

            order_by = ", ".join(order_clauses)
            files = ", ".join([f"'{f}'" for f in file_paths])
            query = f"SELECT * FROM read_parquet([{files}]) ORDER BY {order_by}"

            result = conn.execute(query)

            # Use fetchmany() cursor API - this ACTUALLY streams incrementally
            # without loading all data into memory (unlike fetch_df_chunk)
            # NOTE: DuckDB's k-way merge uses internal buffering
            # separate from batch_size.
            # batch_size only controls how much we pull at once,
            # not DuckDB's merge buffer.
            batch_count = 0
            total_rows = 0
            column_names = [desc[0] for desc in result.description]

            logging.debug(
                f"[DuckDB] K-way merge started: {len(file_paths)} files, "
                f"batch_size={batch_size:,}"
            )

            while True:
                # Fetch batch as list of tuples
                rows = result.fetchmany(batch_size)
                if not rows:
                    break

                batch_count += 1
                total_rows += len(rows)

                # Convert to DataFrame
                batch_df = pd.DataFrame(rows, columns=column_names)
                logger.debug(
                    f"Streamed batch {batch_count}: {len(batch_df)} rows "
                    f"from DuckDB K-way merge"
                )

                # Ensure time field is UTC (DuckDB might return naive)
                if time_field and time_field in batch_df.columns:
                    if pd.api.types.is_datetime64_any_dtype(batch_df[time_field]):
                        if batch_df[time_field].dt.tz is not None:
                            batch_df[time_field] = batch_df[time_field].dt.tz_convert(
                                "UTC"
                            )
                        else:
                            batch_df[time_field] = batch_df[time_field].dt.tz_localize(
                                "UTC"
                            )

                # Apply date filtering if needed
                # After UTC conversion above, time_field is tz-aware
                if time_field and (start_date or end_date):
                    from datetime import timezone

                    if start_date:
                        start_cmp = (
                            start_date
                            if start_date.tzinfo
                            else start_date.replace(tzinfo=timezone.utc)
                        )
                        batch_df = batch_df[batch_df[time_field] >= start_cmp]
                    if end_date:
                        end_cmp = (
                            end_date
                            if end_date.tzinfo
                            else end_date.replace(tzinfo=timezone.utc)
                        )
                        batch_df = batch_df[batch_df[time_field] < end_cmp]
                if len(batch_df) == 0:
                    continue

                # Process the batch (decode structs, reconstruct ObjectIds)
                processed_df = self._process_dataframe(
                    batch_df, "pandas", schema, coerce
                )
                yield processed_df

            conn.close()
            logger.debug("DuckDB K-way merge complete")

        except Exception as e:
            if coerce == "error":
                logger.error(f"Error in globally sorted streaming: {e}")
                return
            raise

    def get_statistics(self) -> Dict[str, Any]:
        """
        Get statistics about cached data.

        Returns:
            Dict with file count, total rows, size, schema info
        """
        total_rows = 0
        total_size = 0
        schema = None

        for parquet_file in self.parquet_files:
            # File size
            total_size += parquet_file.stat().st_size

            # Read metadata
            parquet_meta = pq.read_metadata(parquet_file)
            total_rows += parquet_meta.num_rows

            # Get schema from first file
            if schema is None:
                schema = parquet_meta.schema.to_arrow_schema()

        return {
            "file_count": len(self.parquet_files),
            "total_rows": total_rows,
            "total_size_mb": round(total_size / (1024 * 1024), 2),
            "schema_fields": [field.name for field in schema] if schema else [],
            "cache_dir": str(self.cache_dir),
        }

    def __repr__(self) -> str:
        stats = self.get_statistics()
        return (
            f"ParquetReader(files={stats['file_count']}, "
            f"rows={stats['total_rows']:,}, "
            f"size={stats['total_size_mb']:.1f}MB)"
        )

    def __len__(self) -> int:
        """Return total number of rows across all files."""
        return self.get_statistics()["total_rows"]
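

# Illustrative end-to-end usage (added for this review; not part of the
# published module). The cache path and schema object are assumptions here;
# in the real pipeline they come from the xlr8 executor and schema modules.
def _demo_reader_usage(cache_dir: str, schema: Any) -> None:
    reader = ParquetReader(cache_dir)
    logger.info("cache stats: %s", reader.get_statistics())

    # Load everything at once (pandas), with structs flattened and
    # ObjectIds reconstructed according to the schema.
    df = reader.to_dataframe(engine="pandas", schema=schema)
    logger.info("loaded %d rows", len(df))

    # Or stream globally sorted batches via the DuckDB K-way merge.
    for batch in reader.iter_globally_sorted_batches(
        sort_spec=[("timestamp", 1)], schema=schema, batch_size=10_000
    ):
        logger.debug("batch of %d rows", len(batch))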