xlr8-0.1.7b3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlr8/__init__.py +113 -0
- xlr8/_xlr8_rust.cpython-311-x86_64-linux-gnu.so +0 -0
- xlr8/_xlr8_rust.pyi +71 -0
- xlr8/analysis/__init__.py +58 -0
- xlr8/analysis/brackets.py +1201 -0
- xlr8/analysis/chunker.py +118 -0
- xlr8/analysis/inspector.py +1889 -0
- xlr8/collection/__init__.py +6 -0
- xlr8/collection/cursor.py +2161 -0
- xlr8/collection/cursor.pyi +179 -0
- xlr8/collection/wrapper.py +400 -0
- xlr8/collection/wrapper.pyi +420 -0
- xlr8/constants.py +24 -0
- xlr8/execution/__init__.py +43 -0
- xlr8/execution/callback.py +792 -0
- xlr8/execution/executor.py +500 -0
- xlr8/execution/planner.py +377 -0
- xlr8/py.typed +1 -0
- xlr8/rust_backend.py +40 -0
- xlr8/rust_backend.pyi +71 -0
- xlr8/schema/__init__.py +42 -0
- xlr8/schema/encoder.py +235 -0
- xlr8/schema/schema.py +265 -0
- xlr8/schema/types.py +239 -0
- xlr8/storage/__init__.py +17 -0
- xlr8/storage/cache.py +228 -0
- xlr8/storage/reader.py +1369 -0
- xlr8-0.1.7b3.dist-info/METADATA +176 -0
- xlr8-0.1.7b3.dist-info/RECORD +31 -0
- xlr8-0.1.7b3.dist-info/WHEEL +5 -0
- xlr8-0.1.7b3.dist-info/licenses/LICENSE +201 -0
xlr8/execution/callback.py
@@ -0,0 +1,792 @@
"""
Partition-based callback streaming for data lake population among other use cases.

================================================================================
ARCHITECTURE - STREAM TO CALLBACK WITH PARTITIONING
================================================================================

This module implements a two-phase approach:

PHASE 1: Download to Cache (reuses existing Rust backend)
────────────────────────────────────────────────────────────────────────────────
    MongoDB --> Rust Workers --> Parquet Cache (on disk)

    Uses execute_parallel_stream_to_cache() - battle-tested, memory-safe.

PHASE 2: Partition + Parallel Callbacks
────────────────────────────────────────────────────────────────────────────────
1. Build partition plan using DuckDB:
   - Discover unique (time_bucket, partition_key) combinations
   - Create work items for each partition

2. Execute callbacks in parallel (ThreadPoolExecutor):
   - Each worker: DuckDB query -> PyArrow Table -> decode -> callback()
   - DuckDB releases GIL -> true parallelism
   - User callbacks can use non-picklable objects (boto3, etc.)

EDGE CASES HANDLED:
────────────────────────────────────────────────────────────────────────────────
- NULL values in partition_by fields -> grouped as one partition
- Empty partitions (no data in time bucket) -> skipped
- Parent fields (e.g., "metadata") -> expanded to child fields
- Types.Any() fields -> decoded based on any_type_strategy
- ObjectIds -> converted to strings (same as to_polars)
- Large partitions -> DuckDB streams internally, memory-safe
- Timezone handling -> all datetimes normalized to UTC

================================================================================
"""

import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, cast

import polars as pl
import pyarrow as pa

from xlr8.schema.types import Any as AnyType
from xlr8.schema.types import ObjectId as ObjectIdType

logger = logging.getLogger(__name__)


@dataclass
class PartitionWorkItem:
    """A single partition to process."""

    index: int
    total: int
    time_start: datetime
    time_end: datetime
    partition_values: Optional[Dict[str, Any]]  # None if no partition_by
    partition_fields: Optional[List[str]]  # Fields used for partitioning


def _expand_parent_fields(
    fields: List[str],
    schema: Any,
) -> List[str]:
    """
    Expand parent fields to their children in schema definition order.

    When user specifies a parent field like "metadata" but the schema has
    flattened fields like "metadata.device_id", expand to all children.

    Args:
        fields: Original field list
        schema: XLR8 schema with field definitions

    Returns:
        Expanded field list with parent fields replaced by children

    Raises:
        ValueError: If field not found and no children exist
    """
    if schema is None:
        return fields

    all_schema_fields = list(schema.fields.keys())
    expanded = []

    for field_name in fields:
        if schema.has_field(field_name):
            # Field exists directly in schema
            expanded.append(field_name)
        else:
            # Look for child fields with this prefix (in schema order)
            prefix = f"{field_name}."
            children = [f for f in all_schema_fields if f.startswith(prefix)]

            if children:
                logger.info(
                    "Partition field '%s' expanded to children: %s",
                    field_name,
                    children,
                )
                expanded.extend(children)
            else:
                raise ValueError(
                    (
                        f"Partition field '{field_name}' not found in schema. "
                        "No child fields. "
                        f"Available fields: {sorted(all_schema_fields)[:10]}"
                        + ("..." if len(all_schema_fields) > 10 else "")
                    )
                )

    return expanded
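
# --- Editorial note (not part of the packaged module) ---
# Illustrative behaviour, assuming a hypothetical schema whose flattened
# fields include "metadata.device_id", "metadata.sensor_id" and "value":
#
#     _expand_parent_fields(["metadata", "value"], schema)
#     # -> ["metadata.device_id", "metadata.sensor_id", "value"]
#
# The parent "metadata" expands to its children in schema order, while
# "value" already exists in the schema and passes through unchanged.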


def _timedelta_to_duckdb_interval(td: timedelta) -> str:
    """
    Convert Python timedelta to DuckDB interval string.

    Examples:
        timedelta(days=7) -> "7 days"
        timedelta(hours=16) -> "16 hours"
        timedelta(minutes=30) -> "30 minutes"
    """
    total_seconds = int(td.total_seconds())

    if total_seconds >= 86400 and total_seconds % 86400 == 0:
        days = total_seconds // 86400
        return f"{days} day" if days == 1 else f"{days} days"
    elif total_seconds >= 3600 and total_seconds % 3600 == 0:
        hours = total_seconds // 3600
        return f"{hours} hour" if hours == 1 else f"{hours} hours"
    elif total_seconds >= 60 and total_seconds % 60 == 0:
        minutes = total_seconds // 60
        return f"{minutes} minute" if minutes == 1 else f"{minutes} minutes"
    else:
        return f"{total_seconds} seconds"
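
# --- Editorial note (not part of the packaged module) ---
# The conversion picks the coarsest unit that divides the duration evenly,
# so mixed durations fall through to a finer unit rather than being rounded:
#
#     _timedelta_to_duckdb_interval(timedelta(days=1, hours=12))  # -> "36 hours"
#     _timedelta_to_duckdb_interval(timedelta(seconds=90))        # -> "90 seconds"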


def _build_partition_plan(
    cache_dir: str,
    time_field: str,
    partition_time_delta: timedelta,
    partition_by: Optional[List[str]],
    memory_limit_mb: int,
    threads: int,
) -> List[PartitionWorkItem]:
    """
    Build partition plan by discovering unique partitions in the cached data.

    Uses DuckDB to efficiently scan all parquet files and find unique
    (time_bucket, partition_key) combinations.

    Natural Time Boundaries:
    - First partition's start time is floored to the start of the day (00:00:00)
    - End time of last partition is kept as actual max + partition_time_delta
    - This creates clean, predictable partition boundaries for data lakes

    Args:
        cache_dir: Path to cache directory with parquet files
        time_field: Name of the timestamp field
        partition_time_delta: Time bucket size (e.g., 7 days)
        partition_by: List of fields to partition by (e.g., ["metadata.instrument"])
        memory_limit_mb: DuckDB memory limit
        threads: DuckDB thread count

    Returns:
        List of PartitionWorkItem to process
    """
    import duckdb

    cache_path = Path(cache_dir)
    parquet_files = list(cache_path.glob("*.parquet"))

    if not parquet_files:
        logger.warning("No parquet files found in %s", cache_dir)
        return []

    file_paths = [str(f) for f in parquet_files]
    files_list = ", ".join([f"'{f}'" for f in file_paths])

    # STEP 1: Get global min timestamp to establish natural day boundary
    global_min_query = f"""
        SELECT MIN("{time_field}") AS global_min_time
        FROM read_parquet([{files_list}])
    """

    try:
        conn = duckdb.connect(":memory:")

        # Configure DuckDB
        if memory_limit_mb:
            conn.execute(f"SET memory_limit = '{memory_limit_mb}MB'")
        if threads:
            conn.execute(f"SET threads = {threads}")

        # Get global min and floor to start of day
        global_result = cast(
            Optional[Tuple[Any, ...]],
            conn.execute(global_min_query).fetchone(),
        )
        if global_result is None or global_result[0] is None:
            logger.warning("No data found in parquet files")
            conn.close()
            return []

        global_min_time = global_result[0]

        # Ensure timezone aware
        if hasattr(global_min_time, "tzinfo") and global_min_time.tzinfo is None:
            global_min_time = global_min_time.replace(tzinfo=timezone.utc)

        # Floor to start of day (zero hours, mins, seconds, microseconds)
        floored_start = global_min_time.replace(
            hour=0,
            minute=0,
            second=0,
            microsecond=0,
        )

        logger.info(
            "Natural time boundary: floored %s -> %s",
            global_min_time.isoformat(),
            floored_start.isoformat(),
        )

        # Convert timedelta to DuckDB interval
        interval = _timedelta_to_duckdb_interval(partition_time_delta)

        # STEP 2: Build partition query using floored start as origin
        # DuckDB's time_bucket can take an origin parameter
        origin_str = floored_start.strftime("%Y-%m-%d %H:%M:%S")

        # Build SELECT clause for partition keys
        select_parts = [
            f"time_bucket(INTERVAL '{interval}', \"{time_field}\", "
            f"TIMESTAMP '{origin_str}') AS time_bucket"
        ]
        group_parts = ["time_bucket"]

        if partition_by:
            for field in partition_by:
                # Quote field name properly for DuckDB (handles dots in names)
                select_parts.append(f'"{field}" AS "{field}"')
                group_parts.append(f'"{field}"')

        select_clause = ", ".join(select_parts)
        group_clause = ", ".join(group_parts)

        # Build query to discover partitions
        query = f"""
            SELECT
                {select_clause},
                MIN("{time_field}") AS actual_min_time,
                MAX("{time_field}") AS actual_max_time,
                COUNT(*) AS row_count
            FROM read_parquet([{files_list}])
            GROUP BY {group_clause}
            ORDER BY time_bucket
        """

        result = conn.execute(query).fetchall()
        columns = [desc[0] for desc in conn.execute(query).description]  # type: ignore[union-attr]
        conn.close()

        # Build work items from results
        work_items = []
        total = len(result)

        for idx, row in enumerate(result):
            row_dict = dict(zip(columns, row))

            # Extract time bucket
            time_bucket = row_dict["time_bucket"]

            # Calculate time_start and time_end
            # time_bucket is the start of the bucket (aligned to floored origin)
            if isinstance(time_bucket, datetime):
                time_start = time_bucket
                if time_start.tzinfo is None:
                    time_start = time_start.replace(tzinfo=timezone.utc)
                time_end = time_start + partition_time_delta
            else:
                # Fallback: use actual min/max from data
                time_start = row_dict["actual_min_time"]
                time_end = row_dict["actual_max_time"]
                if time_start.tzinfo is None:
                    time_start = time_start.replace(tzinfo=timezone.utc)
                if time_end.tzinfo is None:
                    time_end = time_end.replace(tzinfo=timezone.utc)

            # Extract partition values
            partition_values = None
            if partition_by:
                partition_values = {}
                for field in partition_by:
                    partition_values[field] = row_dict.get(field)

            work_items.append(
                PartitionWorkItem(
                    index=idx,
                    total=total,
                    time_start=time_start,
                    time_end=time_end,
                    partition_values=partition_values,
                    partition_fields=partition_by,
                )
            )

        return work_items

    except (duckdb.Error, KeyError, AttributeError, TypeError, ValueError) as e:
        logger.error("Failed to build partition plan: %s", e)
        raise
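
# --- Editorial note (not part of the packaged module) ---
# For a sense of the discovery query this produces: with a 7-day bucket,
# time_field "timestamp", partition_by ["metadata.sensor_id"], and a
# hypothetical cache file, the generated SQL has the shape
#
#     SELECT
#         time_bucket(INTERVAL '7 days', "timestamp",
#                     TIMESTAMP '2024-01-01 00:00:00') AS time_bucket,
#         "metadata.sensor_id" AS "metadata.sensor_id",
#         MIN("timestamp") AS actual_min_time,
#         MAX("timestamp") AS actual_max_time,
#         COUNT(*) AS row_count
#     FROM read_parquet(['/path/to/cache/part0.parquet'])
#     GROUP BY time_bucket, "metadata.sensor_id"
#     ORDER BY time_bucket
#
# where the TIMESTAMP origin is the floored start of the earliest day of data.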


def _build_partition_query(
    cache_dir: str,
    time_field: str,
    work_item: PartitionWorkItem,
    sort_ascending: bool = True,
) -> str:
    """
    Build DuckDB query to fetch data for a single partition.

    Args:
        cache_dir: Path to cache directory
        time_field: Timestamp field name
        work_item: Partition work item with time bounds and partition values
        sort_ascending: Sort direction for time field

    Returns:
        DuckDB SQL query string
    """
    cache_path = Path(cache_dir)
    parquet_files = list(cache_path.glob("*.parquet"))
    files_list = ", ".join([f"'{str(f)}'" for f in parquet_files])

    # Build WHERE clauses
    where_clauses = []

    # Time bounds - use proper timestamp formatting
    time_start_iso = work_item.time_start.isoformat()
    time_end_iso = work_item.time_end.isoformat()
    where_clauses.append(f"\"{time_field}\" >= '{time_start_iso}'::TIMESTAMPTZ")
    where_clauses.append(f"\"{time_field}\" < '{time_end_iso}'::TIMESTAMPTZ")

    # Partition values
    if work_item.partition_values:
        for field, value in work_item.partition_values.items():
            if value is None:
                where_clauses.append(f'"{field}" IS NULL')
            elif isinstance(value, str):
                # Escape single quotes in string values
                escaped = value.replace("'", "''")
                where_clauses.append(f"\"{field}\" = '{escaped}'")
            elif isinstance(value, bool):
                where_clauses.append(f'"{field}" = {str(value).upper()}')
            elif isinstance(value, (int, float)):
                where_clauses.append(f'"{field}" = {value}')
            else:
                # Convert to string for complex types
                escaped = str(value).replace("'", "''")
                where_clauses.append(f"\"{field}\" = '{escaped}'")

    where_clause = " AND ".join(where_clauses)
    order_dir = "ASC" if sort_ascending else "DESC"

    return f"""
        SELECT *
        FROM read_parquet([{files_list}])
        WHERE {where_clause}
        ORDER BY "{time_field}" {order_dir}
    """


def _decode_struct_values_polars(
    df: pl.DataFrame,
    schema: Any,
    any_type_strategy: Literal["float", "string", "keep_struct"] = "float",
) -> pl.DataFrame:
    """
    Decode struct-encoded Any-typed columns back to actual values (Polars).

    This is copied from reader.py to avoid circular imports and reuse
    battle-tested Polars decode logic. DuckDB -> Polars -> decode -> Arrow.

    Args:
        df: Polars DataFrame from DuckDB
        schema: Schema with field type info
        any_type_strategy: How to decode:
            - "float": Coalesce to Float64, prioritize numeric (default)
            - "string": Convert everything to string (lossless)
            - "keep_struct": Keep raw struct, don't decode
    """
    if not hasattr(schema, "fields"):
        return df

    # Find Any-typed fields in schema
    for field_name, field_type in schema.fields.items():
        # Check if it's an Any type
        is_any = isinstance(field_type, AnyType) or (
            isinstance(field_type, type) and issubclass(field_type, AnyType)
        )

        if is_any and field_name in df.columns:
            # Check if column is a struct
            col_dtype = df.schema[field_name]
            if str(col_dtype).startswith("Struct"):
                # Strategy: keep_struct - don't decode at all
                if any_type_strategy == "keep_struct":
                    continue

                try:
                    # Get field names from the struct
                    struct_fields = cast(Any, getattr(col_dtype, "fields", []))
                    field_names = (
                        [f.name for f in struct_fields] if struct_fields else []
                    )

                    if any_type_strategy == "string":
                        # Convert ALL value types to string
                        coalesce_exprs = []

                        # String first (already string)
                        if "string_value" in field_names:
                            coalesce_exprs.append(
                                pl.col(field_name).struct.field("string_value")
                            )

                        # Float to string
                        if "float_value" in field_names:
                            coalesce_exprs.append(
                                pl.col(field_name)
                                .struct.field("float_value")
                                .cast(pl.Utf8)
                            )

                        # Int to string
                        for int_name in ["int64_value", "int32_value"]:
                            if int_name in field_names:
                                coalesce_exprs.append(
                                    pl.col(field_name)
                                    .struct.field(int_name)
                                    .cast(pl.Utf8)
                                )

                        # Bool to string
                        if "bool_value" in field_names:
                            coalesce_exprs.append(
                                pl.col(field_name)
                                .struct.field("bool_value")
                                .cast(pl.Utf8)
                            )

                        # ObjectId, decimal, etc. (already strings)
                        for str_field in [
                            "objectid_value",
                            "decimal128_value",
                            "regex_value",
                            "binary_value",
                            "document_value",
                            "array_value",
                        ]:
                            if str_field in field_names:
                                coalesce_exprs.append(
                                    pl.col(field_name).struct.field(str_field)
                                )

                        if coalesce_exprs:
                            df = df.with_columns(
                                pl.coalesce(coalesce_exprs).alias(field_name)
                            )

                    else:  # "float" strategy (default)
                        # Coalesce to Float64, prioritize numeric
                        coalesce_exprs = []

                        # Try float first (highest precision)
                        if "float_value" in field_names:
                            coalesce_exprs.append(
                                pl.col(field_name).struct.field("float_value")
                            )

                        # Try various int types, cast to float
                        for int_name in ["int64_value", "int32_value"]:
                            if int_name in field_names:
                                coalesce_exprs.append(
                                    pl.col(field_name)
                                    .struct.field(int_name)
                                    .cast(pl.Float64)
                                )

                        # Try bool (as 0.0/1.0)
                        if "bool_value" in field_names:
                            coalesce_exprs.append(
                                pl.col(field_name)
                                .struct.field("bool_value")
                                .cast(pl.Float64)
                            )

                        if coalesce_exprs:
                            if len(coalesce_exprs) == 1:
                                df = df.with_columns(
                                    coalesce_exprs[0].alias(field_name)
                                )
                            else:
                                df = df.with_columns(
                                    pl.coalesce(coalesce_exprs).alias(field_name)
                                )
                        else:
                            logger.warning(
                                "Could not decode struct column '%s': "
                                "no numeric fields in %s",
                                field_name,
                                field_names,
                            )
                except (KeyError, AttributeError, TypeError, ValueError) as e:
                    logger.warning("Error decoding struct '%s': %s", field_name, e)

    return df
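
# --- Editorial note (not part of the packaged module) ---
# Decoding is a coalesce over the struct's sub-fields. Under the default
# "float" strategy only float_value, int64_value/int32_value and bool_value
# are considered, so a row whose original value was a string decodes to null;
# under "string" every branch is cast to Utf8, which is lossless but stringly
# typed. For example, a struct row {"float_value": null, "int64_value": 42}
# decodes to 42.0 with "float" and to "42" with "string".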


def _convert_objectids_to_strings_polars(
    df: pl.DataFrame,
    schema: Any,
) -> pl.DataFrame:
    """
    Convert ObjectId columns to strings in Polars (same as reader.py behavior).

    ObjectIds are stored as 24-char hex strings in Parquet. This ensures they
    stay as strings in the final output.
    """
    if not hasattr(schema, "fields"):
        return df

    # Find ObjectId columns
    objectid_columns = []
    for field_name, field_type in schema.fields.items():
        is_oid = isinstance(field_type, ObjectIdType) or (
            isinstance(field_type, type) and issubclass(field_type, ObjectIdType)
        )
        if is_oid and field_name in df.columns:
            objectid_columns.append(field_name)

    if not objectid_columns:
        return df

    # Convert to string in Polars
    for col_name in objectid_columns:
        df = df.with_columns(pl.col(col_name).cast(pl.Utf8))

    return df


def _execute_partition_callback(
    work_item: PartitionWorkItem,
    cache_dir: str,
    callback: Callable[[pa.Table, Dict[str, Any]], None],
    schema: Any,
    time_field: str,
    any_type_strategy: Literal["float", "string", "keep_struct"],
    sort_ascending: bool,
    memory_limit_mb: int,
    threads: int = 1,
) -> Dict[str, Any]:
    """
    Execute callback for a single partition (runs in thread).

    This function:
    1. Builds DuckDB query for the partition
    2. Fetches data as PyArrow Table
    3. Decodes Any() struct columns
    4. Converts ObjectIds to strings
    5. Calls user callback

    Args:
        work_item: Partition to process
        cache_dir: Path to cache directory
        callback: User callback function
        schema: XLR8 schema
        time_field: Timestamp field name
        any_type_strategy: How to decode Any() columns
        sort_ascending: Sort direction
        memory_limit_mb: DuckDB memory limit
        threads: DuckDB thread count (per worker, usually 1)

    Returns:
        Dict with rows processed and partition info
    """
    import duckdb

    try:
        # Build query
        query = _build_partition_query(
            cache_dir=cache_dir,
            time_field=time_field,
            work_item=work_item,
            sort_ascending=sort_ascending,
        )

        # Execute query
        conn = duckdb.connect(":memory:")

        # Configure DuckDB for this worker
        # Use per-worker memory limit (divide total by num threads calling this)
        if memory_limit_mb:
            conn.execute(f"SET memory_limit = '{memory_limit_mb}MB'")

        # ThreadPoolExecutor provides parallelism; set DuckDB threads per worker here.
        conn.execute(f"SET threads = {threads}")

        # Fetch as Arrow Table (DuckDB native support) and convert to Polars
        arrow_tmp = conn.execute(query).fetch_arrow_table()
        polars_df = cast(pl.DataFrame, pl.from_arrow(arrow_tmp))
        conn.close()

        if len(polars_df) == 0:
            # Empty partition - skip callback
            return {
                "rows": 0,
                "partition_index": work_item.index,
                "skipped": True,
            }

        # Decode Any() struct columns using Polars (reuses reader.py logic)
        polars_df = _decode_struct_values_polars(polars_df, schema, any_type_strategy)

        # Convert ObjectIds to strings (Polars)
        polars_df = _convert_objectids_to_strings_polars(polars_df, schema)

        # Convert to Arrow for callback (zero-copy via Arrow C Data Interface)
        arrow_table = polars_df.to_arrow()

        # Build metadata for callback
        metadata = {
            "time_start": work_item.time_start,
            "time_end": work_item.time_end,
            "partition_values": work_item.partition_values or {},
            "row_count": arrow_table.num_rows,
            "partition_index": work_item.index,
            "total_partitions": work_item.total,
        }

        # Call user callback
        callback(arrow_table, metadata)

        return {
            "rows": arrow_table.num_rows,
            "partition_index": work_item.index,
            "skipped": False,
        }

    except Exception as e:  # noqa: BLE001
        logger.error("Partition %d failed: %s", work_item.index, e)
        raise
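
# --- Editorial note (not part of the packaged module) ---
# The callback receives a pyarrow.Table plus the metadata dict built above
# (time_start, time_end, partition_values, row_count, partition_index,
# total_partitions). A minimal hypothetical sink, writing each partition to a
# local file in place of an object-store upload, could look like:
#
#     import pyarrow.parquet as pq
#
#     def write_partition(table: pa.Table, meta: Dict[str, Any]) -> None:
#         name = f"{meta['time_start']:%Y%m%d}-{meta['partition_index']:05d}.parquet"
#         pq.write_table(table, name)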


def execute_partitioned_callback(
    cache_dir: str,
    schema: Any,
    callback: Callable[[pa.Table, Dict[str, Any]], None],
    partition_time_delta: timedelta,
    partition_by: Optional[List[str]],
    any_type_strategy: Literal["float", "string", "keep_struct"],
    max_workers: int,
    sort_ascending: bool,
    memory_limit_mb: int,
) -> Dict[str, Any]:
    """
    Orchestrate parallel callback execution for partitioned data.

    This is the main entry point for Phase 2 (after cache is populated).

    Args:
        cache_dir: Path to cache directory with parquet files
        schema: XLR8 schema
        callback: User callback function(table, metadata)
        partition_time_delta: Time bucket size
        partition_by: Fields to partition by (optional)
        any_type_strategy: How to decode Any() columns
        max_workers: Number of parallel callback threads
        sort_ascending: Sort direction for time field
        memory_limit_mb: Total memory limit for DuckDB operations

    Returns:
        Dict with total_partitions, total_rows, skipped_partitions, duration_s
    """
    import time

    start_time = time.time()

    time_field = schema.time_field

    # Expand parent fields to children
    # (e.g., "metadata" -> ["metadata.device_id", "metadata.sensor_id"])
    if partition_by:
        partition_by = _expand_parent_fields(partition_by, schema)

    # Calculate per-worker memory limit
    # Each worker gets an equal share
    worker_memory_mb = max(64, memory_limit_mb // max_workers)

    logging.debug("\n[Partition] Building partition plan...")
    logging.debug(f" - Time bucket: {partition_time_delta}")
    logging.debug(f" - Partition by: {partition_by or 'None (time only)'}")

    # Build partition plan
    work_items = _build_partition_plan(
        cache_dir=cache_dir,
        time_field=time_field,
        partition_time_delta=partition_time_delta,
        partition_by=partition_by,
        memory_limit_mb=worker_memory_mb,
        threads=1,  # Single thread for planning
    )

    if not work_items:
        logging.debug("[Partition] No partitions found!")
        return {
            "total_partitions": 0,
            "total_rows": 0,
            "skipped_partitions": 0,
            "duration_s": time.time() - start_time,
        }

    logging.debug(f"[Partition] Found {len(work_items)} partitions")
    logging.debug(
        f"[Partition] Executing callbacks with {max_workers} workers "
        f"(memory per worker: {worker_memory_mb}MB)"
    )

    # Execute callbacks in parallel
    results = []
    skipped = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(
                _execute_partition_callback,
                work_item=item,
                cache_dir=cache_dir,
                callback=callback,
                schema=schema,
                time_field=time_field,
                any_type_strategy=any_type_strategy,
                sort_ascending=sort_ascending,
                memory_limit_mb=worker_memory_mb,
                threads=1,  # Each worker uses 1 DuckDB thread
            ): item
            for item in work_items
        }

        for future in as_completed(futures):
            work_item = futures[future]
            try:
                result = future.result()
                results.append(result)

                if result.get("skipped"):
                    skipped += 1

            except Exception as e:  # noqa: BLE001
                logger.error("Partition %d failed: %s", work_item.index, e)
                raise RuntimeError(
                    f"Callback failed for partition {work_item.index} "
                    f"(time: {work_item.time_start} to {work_item.time_end}, "
                    f"values: {work_item.partition_values}): {e}"
                ) from e

    duration = time.time() - start_time
    total_rows = sum(r.get("rows", 0) for r in results)

    logging.debug("\n[Partition] Complete:")
    logging.debug(f" - Total partitions: {len(work_items)}")
    logging.debug(f" - Skipped (empty): {skipped}")
    logging.debug(f" - Total rows: {total_rows:,}")
    logging.debug(f" - Duration: {duration:.2f}s")

    return {
        "total_partitions": len(work_items),
        "total_rows": total_rows,
        "skipped_partitions": skipped,
        "duration_s": duration,
    }
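
Illustrative usage sketch (not from the package): assuming the Parquet cache has
already been populated in Phase 1 and `schema` is an already-constructed XLR8
schema object, Phase 2 could be driven roughly as follows; the paths and field
names are hypothetical.

    import pyarrow.parquet as pq
    from datetime import timedelta

    def sink(table, meta):
        # One file per (time bucket, partition value) combination.
        name = f"lake-{meta['time_start']:%Y%m%d}-{meta['partition_index']:05d}.parquet"
        pq.write_table(table, name)

    stats = execute_partitioned_callback(
        cache_dir="/tmp/xlr8_cache",            # hypothetical cache location
        schema=schema,
        callback=sink,
        partition_time_delta=timedelta(days=7),
        partition_by=["metadata.sensor_id"],    # hypothetical flattened field
        any_type_strategy="float",
        max_workers=4,
        sort_ascending=True,
        memory_limit_mb=2048,
    )
    print(stats["total_partitions"], stats["total_rows"], stats["duration_s"])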