xlr8 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlr8/__init__.py +109 -0
- xlr8/_xlr8_rust.pyi +71 -0
- xlr8/analysis/__init__.py +58 -0
- xlr8/analysis/brackets.py +1201 -0
- xlr8/analysis/chunker.py +118 -0
- xlr8/analysis/inspector.py +1889 -0
- xlr8/collection/__init__.py +6 -0
- xlr8/collection/cursor.py +2145 -0
- xlr8/collection/cursor.pyi +173 -0
- xlr8/collection/wrapper.py +661 -0
- xlr8/collection/wrapper.pyi +218 -0
- xlr8/constants.py +24 -0
- xlr8/execution/__init__.py +43 -0
- xlr8/execution/callback.py +792 -0
- xlr8/execution/executor.py +500 -0
- xlr8/execution/planner.py +377 -0
- xlr8/py.typed +1 -0
- xlr8/rust_backend.py +42 -0
- xlr8/rust_backend.pyi +71 -0
- xlr8/schema/__init__.py +42 -0
- xlr8/schema/encoder.py +235 -0
- xlr8/schema/schema.py +265 -0
- xlr8/schema/types.py +239 -0
- xlr8/storage/__init__.py +17 -0
- xlr8/storage/cache.py +228 -0
- xlr8/storage/reader.py +1276 -0
- xlr8-0.1.2.dist-info/METADATA +177 -0
- xlr8-0.1.2.dist-info/RECORD +30 -0
- xlr8-0.1.2.dist-info/WHEEL +4 -0
- xlr8-0.1.2.dist-info/licenses/LICENSE +201 -0
xlr8/analysis/inspector.py

@@ -0,0 +1,1889 @@
"""
MongoDB Query Validator for XLR8 Parallel Execution.

XLR8 accelerates MongoDB queries by splitting them into smaller time-based
chunks that can be fetched in parallel. This module validates whether a query
is safe to split. It does NOT perform the actual splitting; that is handled by
brackets.py and chunker.py.

================================================================================
HOW XLR8 PARALLELIZES QUERIES
================================================================================

Simple example - fetch 1 year of sensor data:

    # Original MongoDB query (fetches 365 days serially)
    db.sensors.find({
        "sensor_id": "temp_001",
        "timestamp": {"$gte": jan_1, "$lt": jan_1_next_year}
    })

    # XLR8 automatically splits this into N parallel chunks
    # fetched simultaneously using Rust workers

The process has two phases:

PHASE 1: Split $or branches into independent brackets (brackets.py)
    Query with $or:
        {"$or": [
            {"region": "US", "timestamp": {"$gte": t1, "$lt": t2}},
            {"region": "EU", "timestamp": {"$gte": t1, "$lt": t2}}
        ]}

    Becomes 2 brackets:
        Bracket 1: {"region": "US", "timestamp": {...}}
        Bracket 2: {"region": "EU", "timestamp": {...}}

PHASE 2: Split each bracket's time range into smaller chunks (chunker.py)
    Each bracket is split into N chunks (the user sets the chunking
    granularity, e.g. timedelta(hours=16)) that are fetched in parallel.
    Results are written to separate Parquet files, then merged.

================================================================================
WHAT MAKES A QUERY SAFE TO PARALLELIZE?
================================================================================

A query is safe for parallel execution if it meets ALL these requirements:

1. TIME BOUNDS - Query must have a complete time range
   SAFE:   {"timestamp": {"$gte": t1, "$lt": t2}}
   UNSAFE: {"timestamp": {"$gte": t1}}   (unbounded upper)
   UNSAFE: {}                            (no time reference at all)

2. DOCUMENT-LOCAL OPERATORS - Each document evaluated independently
   SAFE FOR PARALLEL:  {"value": {"$gt": 100}}   (compare field to constant)
   SINGLE-WORKER ONLY: {"$near": {"$geometry": ...}}
                       (needs all docs to sort by distance)

   Why not parallel? If we split by time, $near would return "nearest in each
   chunk", not "nearest overall", giving wrong results. But it works fine with
   a single worker.

3. NO TIME FIELD NEGATION - Cannot parallelize $ne/$nin/$not on the time field
   SAFE FOR PARALLEL:  {"status": {"$nin": ["deleted", "draft"]}}
   SINGLE-WORKER ONLY: {"timestamp": {"$nin": [specific_date]}}

   Why not parallel? Negating time creates unbounded ranges. Saying "not this
   date" means you need ALL other dates, which breaks the ability to split by
   time. But it works fine with single-worker execution.

4. SIMPLE $or STRUCTURE - Nested $or is too complex to parallelize
   SAFE FOR PARALLEL:  {"$or": [{"a": 1}, {"b": 2}]}
   SINGLE-WORKER ONLY: {"$or": [{"$or": [{...}]}, {...}]}

   Why not parallel? Nested $or creates complex overlaps that cannot be safely
   split into independent brackets. But it works fine with single-worker
   execution.

5. NO $natural SORT - Insertion order is incompatible with time chunking
   SAFE FOR PARALLEL:  .sort([("timestamp", 1)])
   SINGLE-WORKER ONLY: .sort([("$natural", 1)])

   Why not parallel? $natural returns documents in insertion order. When we
   split by time, each chunk is sorted by insertion within that chunk, not
   globally. But it works fine with single-worker execution.

================================================================================
THREE EXECUTION MODES
================================================================================

Every query is classified into one of three modes:

┌──────────────────────────────────────────────────────────────────────────┐
│ PARALLEL - Safe for parallel time-chunked execution                      │
│   - Complete time bounds: {"timestamp": {"$gte": t1, "$lt": t2}}         │
│   - Document-local operators only ($gt, $in, $exists, etc.)              │
│   - No time field negation                                               │
│   - Simple $or structure (depth <= 1)                                    │
│   - No $natural sort                                                     │
│   - All operators recognized and safe                                    │
│                                                                          │
│   -> Parallel execution with Rust workers and Parquet caching            │
└──────────────────────────────────────────────────────────────────────────┘

┌──────────────────────────────────────────────────────────────────────────┐
│ SINGLE - Valid query, cannot parallelize safely                          │
│   - Operators requiring full dataset ($text, $near, $expr, geospatial)   │
│   - Nested $or (depth > 1) - overlap handling too complex                │
│   - Time field negation (timestamp: {$nin: [...]})                       │
│   - $natural sort (requires insertion order)                             │
│   - Unbounded/partial time ranges                                        │
│   - No time field reference                                              │
│   - Unknown operators (not yet classified)                               │
│                                                                          │
│   -> Single-worker execution                                             │
└──────────────────────────────────────────────────────────────────────────┘

┌──────────────────────────────────────────────────────────────────────────┐
│ REJECT - Invalid query (MongoDB would also reject or return wrong data)  │
│   (X) Empty $or: {"$or": []}  (invalid MongoDB syntax)                   │
│   (X) Contradictory bounds: $gte: t2, $lt: t1 where t2 > t1              │
│                                                                          │
│   -> Error raised with clear explanation                                 │
│   -> User must fix the query                                             │
└──────────────────────────────────────────────────────────────────────────┘

================================================================================
OPERATOR CLASSIFICATION
================================================================================

ALWAYS_ALLOWED (23 operators) - Document-local evaluation
    These are safe because they evaluate each document independently without
    needing other documents.

    Comparison: $eq, $ne, $gt, $gte, $lt, $lte, $in, $nin
    Element:    $exists, $type
    Array:      $all, $elemMatch, $size
    Bitwise:    $bitsAllClear, $bitsAllSet, $bitsAnyClear, $bitsAnySet
    Evaluation: $regex, $mod, $jsonSchema
    Logical:    $and
    Metadata:   $comment, $options

    Edge case: When used in $or branches, brackets.py performs additional
    overlap checks to prevent duplicate results. For example:
        {"$or": [{"x": {"$in": [1,2,3]}}, {"x": {"$in": [3,4,5]}}]}
    The value 3 appears in both branches, so this needs special handling.

CONDITIONAL (3 operators) - Safe under specific conditions
    $or  -> Allowed at depth 1 only (no nested $or)
    $nor -> Allowed if it does NOT reference the time field
    $not -> Allowed if NOT applied to the time field

    Examples:
        SAFE:   {"$or": [{"region": "US"}, {"region": "EU"}]}
        UNSAFE: {"$or": [{"$or": [{...}]}, {...}]}

        SAFE:   {"$nor": [{"status": "deleted"}], "timestamp": {...}}
        UNSAFE: {"$nor": [{"timestamp": {"$lt": t1}}]}

NEVER_ALLOWED (17 operators) - Require full dataset (triggers SINGLE mode)
    Geospatial: $near, $nearSphere, $geoWithin, $geoIntersects, $geometry,
                $box, $polygon, $center, $centerSphere, $maxDistance,
                $minDistance
    Text:       $text
    Dynamic:    $expr, $where
    Atlas:      $search, $vectorSearch
    Legacy:     $uniqueDocs

    Why parallelization is disabled:
    - $near/$nearSphere: Sort ALL docs by distance. If we split by time,
      we'd get "nearest in chunk" not "nearest overall"
    - $text: Uses corpus-wide IDF scores. Splitting changes term frequencies
    - $expr/$where: Cannot statically analyze. May have arbitrary logic
    - $search/$vectorSearch: Atlas-specific, require special infrastructure

    These operators work fine with single-worker execution (no splitting).

UNKNOWN operators -> Also trigger SINGLE mode (conservative/experimental)
    If XLR8 encounters an operator not in the above lists, it conservatively
    falls back to single-worker execution rather than risk incorrect results.

================================================================================
API USAGE
================================================================================

    from xlr8.analysis import is_chunkable_query, ChunkabilityMode

    # Basic usage
    query = {
        "sensor_id": "temp_001",
        "timestamp": {"$gte": jan_1, "$lt": feb_1}
    }

    result = is_chunkable_query(query, "timestamp")

    if result.mode == ChunkabilityMode.PARALLEL:
        print(f"Can parallelize from {result.bounds[0]} to {result.bounds[1]}")
        # Proceed with parallel execution
    elif result.mode == ChunkabilityMode.SINGLE:
        print(f"Single-worker mode: {result.reason}")
        # Execute with one worker (still faster than PyMongo)
    else:  # REJECT
        print(f"Cannot execute: {result.reason}")
        # Raise error or fall back to PyMongo

    # Backwards-compatible boolean properties
    result.is_chunkable   # True for PARALLEL only
    result.is_executable  # True for PARALLEL or SINGLE

    # Can also unpack as tuple (backwards compatibility)
    mode, reason, (start, end) = is_chunkable_query(query, "timestamp")

Common reasons by mode:
    REJECT: "$or with empty array matches no documents"
            "invalid time range: lower bound >= upper bound
             (contradictory constraints)"

    SINGLE: "operator '$text' requires full dataset (single-worker execution)"
            "operator '$near' requires full dataset (single-worker execution)"
            "nested $or operators (depth > 1) require single-worker execution"
            "query contains negation operators ($ne/$nin) on time field"
            "$natural sort requires insertion order (single-worker execution)"
            "no time bounds found (requires single-worker execution)"
            "unbounded $or branch (requires single-worker execution)"
            "unknown operator '$futureOp' (experimental single-worker execution)"

================================================================================
"""

from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, NamedTuple, Optional, Tuple

__all__ = [
    # Classification sets
    "ALWAYS_ALLOWED",
    "CONDITIONAL",
    "NEVER_ALLOWED",
    # Validation
    "ValidationResult",
    "ChunkabilityMode",
    "ChunkabilityResult",
    "has_forbidden_ops",
    "has_unknown_operators",
    "check_conditional_operators",
    "validate_query_for_chunking",
    # Query analysis utilities
    "or_depth",
    "split_global_and",
    "normalize_datetime",
    "normalize_query",
    "extract_time_bounds_recursive",
    # Main entry point
    "is_chunkable_query",
    # Internal (exported for testing)
    "_or_depth",
    "_references_field",
]

# =============================================================================
# OPERATOR CLASSIFICATION
# =============================================================================

ALWAYS_ALLOWED: frozenset[str] = frozenset(
    {
        # -- Comparison -------------------------------------------------------
        # Compare field value against a constant. Always document-local.
        #
        # Example: Find all sensors with readings above threshold
        #   {"value": {"$gt": 100}, "timestamp": {"$gte": t1, "$lt": t2}}
        #
        "$eq",    # {"status": {"$eq": "active"}} - equals
        "$ne",    # {"status": {"$ne": "deleted"}} - not equals
        "$gt",    # {"value": {"$gt": 100}} - greater than
        "$gte",   # {"value": {"$gte": 100}} - greater or equal
        "$lt",    # {"value": {"$lt": 0}} - less than
        "$lte",   # {"value": {"$lte": 100}} - less or equal
        "$in",    # {"type": {"$in": ["A", "B"]}} - in set
        "$nin",   # {"type": {"$nin": ["X", "Y"]}} - not in set
        # -- Element ----------------------------------------------------------
        # Check field existence or BSON type. Document-local metadata checks.
        #
        # Example: Only include documents with validated readings
        #   {"validated_at": {"$exists": true}, "value": {"$type": "double"}}
        #
        "$exists",  # {"email": {"$exists": true}}
        "$type",    # {"value": {"$type": "double"}}
        # -- Array ------------------------------------------------------------
        # Evaluate array fields within a single document.
        #
        # Example: Find sensors with all required tags
        #   {"tags": {"$all": ["calibrated", "production"]}}
        #
        "$all",        # {"tags": {"$all": ["a", "b"]}}
        "$elemMatch",  # {"readings": {"$elemMatch": {"value": {"$gt": 100}}}}
        "$size",       # {"items": {"$size": 3}}
        # -- Bitwise ----------------------------------------------------------
        # Compare integer bits against a bitmask. Document-local.
        #
        # Example: Find flags with specific bits set
        #   {"flags": {"$bitsAllSet": [0, 2, 4]}}
        #
        "$bitsAllClear",
        "$bitsAllSet",
        "$bitsAnyClear",
        "$bitsAnySet",
        # -- Evaluation (safe) --------------------------------------------------
        # Pattern matching and validation that is document-local.
        #
        # Example: Match sensor names by pattern
        #   {"sensor_id": {"$regex": "^TEMP_", "$options": "i"}}
        #
        "$regex",       # {"name": {"$regex": "^sensor_"}}
        "$options",     # Modifier for $regex
        "$mod",         # {"value": {"$mod": [10, 0]}} - divisible by 10
        "$jsonSchema",  # {"$jsonSchema": {"required": ["name"]}}
        "$comment",     # {"$comment": "audit query"} - annotation only
        # -- Logical (safe) -----------------------------------------------------
        # $and is always safe: conjunctions preserve correctness.
        #
        # Example: Multiple conditions all must match
        #   {"$and": [{"value": {"$gt": 0}}, {"status": "active"}]}
        #
        "$and",
    }
)


CONDITIONAL: frozenset[str] = frozenset(
    {
        # -- $or ----------------------------------------------------------------
        # ALLOWED at depth 1 only. Top-level $or is decomposed into "brackets"
        # which are executed and cached independently.
        #
        # [OK] ALLOWED (depth 1):
        #   {"$or": [
        #       {"sensor_id": "A", "timestamp": {"$gte": t1, "$lt": t2}},
        #       {"sensor_id": "B", "timestamp": {"$gte": t1, "$lt": t2}}
        #   ]}
        #
        # [X] Triggers SINGLE mode (depth 2 - nested $or):
        #   {"$or": [{"$or": [{...}, {...}]}, {...}]}
        #
        "$or",
        # -- $nor ---------------------------------------------------------------
        # ALLOWED if not referencing time field. Negating time bounds creates
        # unpredictable behavior when chunking.
        #
        # [OK] ALLOWED (excludes status values):
        #   {"$nor": [{"status": "deleted"}, {"status": "draft"}],
        #    "timestamp": {"$gte": t1, "$lt": t2}}
        #
        # [X] Triggers SINGLE mode (negates time constraint):
        #   {"$nor": [{"timestamp": {"$lt": "2024-01-01"}}]}
        #
        "$nor",
        # -- $not ---------------------------------------------------------------
        # ALLOWED if not applied to time field. Same reasoning as $nor.
        #
        # [OK] ALLOWED (negates value constraint):
        #   {"value": {"$not": {"$lt": 0}}} - equivalent to value >= 0
        #
        # [X] Triggers SINGLE mode (negates time constraint):
        #   {"timestamp": {"$not": {"$lt": "2024-01-15"}}}
        #
        "$not",
    }
)


NEVER_ALLOWED: frozenset[str] = frozenset(
    {
        # -- Evaluation (unsafe) --------------------------------------------------
        # $expr and $where cannot be statically analyzed for safety.
        #
        # $expr can contain arbitrary aggregation expressions:
        #   {"$expr": {"$gt": ["$endTime", "$startTime"]}}
        # While this example IS document-local, we cannot prove safety for all
        # cases.
        #
        # $where executes JavaScript on the server:
        #   {"$where": "this.endTime > this.startTime"}
        # Cannot analyze, may have side effects.
        #
        "$expr",
        "$where",
        # -- Text Search ------------------------------------------------------------
        # $text uses text indexes and corpus-wide IDF scoring.
        # Splitting the corpus changes term frequencies and relevance scores.
        #
        #   {"$text": {"$search": "mongodb performance tuning"}}
        #
        "$text",
        # -- Atlas Search -----------------------------------------------------------
        # Atlas-specific full-text and vector search operators.
        #
        "$search",
        "$vectorSearch",
        # -- Geospatial -------------------------------------------------------------
        # Geospatial operators require special indexes and often involve
        # cross-document operations (sorting by distance, spatial joins).
        #
        # $near/$nearSphere return documents SORTED BY DISTANCE:
        #   {"location": {"$near": [lng, lat]}}
        # If we chunk by time, we get "nearest in chunk" not "nearest overall".
        #
        # $geoWithin/$geoIntersects require 2dsphere indexes:
        #   {"location": {"$geoWithin": {"$geometry": {...}}}}
        #
        "$near",
        "$nearSphere",
        "$geoWithin",
        "$geoIntersects",
        "$geometry",
        "$box",
        "$polygon",
        "$center",
        "$centerSphere",
        "$maxDistance",
        "$minDistance",
        "$uniqueDocs",
    }
)
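
# Illustrative (hedged) sketch of how these three sets partition the operator
# space; the sets are disjoint, and any "$"-prefixed key found in none of them
# counts as unknown and routes the query to single-worker execution.
#
#     >>> "$gte" in ALWAYS_ALLOWED
#     True
#     >>> "$or" in CONDITIONAL
#     True
#     >>> "$near" in NEVER_ALLOWED
#     True
#     >>> ALWAYS_ALLOWED & CONDITIONAL & NEVER_ALLOWED
#     frozenset()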

# =============================================================================
# VALIDATION RESULT
# =============================================================================


@dataclass(frozen=True, slots=True)
class ValidationResult:
    """Result of query validation for chunking."""

    is_valid: bool
    reason: str = ""
    forbidden_operator: str | None = None

    def __bool__(self) -> bool:
        return self.is_valid


class ChunkabilityMode(Enum):
    """Execution mode for MongoDB queries in XLR8.

    XLR8 classifies queries into three execution modes based on safety
    and parallelizability:

    - PARALLEL: Query can be safely executed with parallel time-chunked workers.
      Example: {"timestamp": {"$gte": t1, "$lt": t2}, "status": "active"}

    - SINGLE: Query is valid but cannot be safely parallelized. Executes with
      a single worker but still uses the Rust backend and Parquet caching.
      Example: {"timestamp": {"$gte": t1}}, sort=[("$natural", 1)]

    - REJECT: Invalid query syntax or contradictory constraints that MongoDB
      would also reject or return no results for (e.g., empty $or array,
      contradictory lo >= hi bounds). These queries should NOT be executed.
      Example: {"$or": []}
    """

    PARALLEL = "parallel"  # Safe for parallel time-chunked execution
    SINGLE = "single"      # Valid query, single-worker fallback
    REJECT = "reject"      # Invalid syntax or contradictory constraints


class ChunkabilityResult(NamedTuple):
    """Result of query chunkability analysis.

    Provides a structured result with execution mode, reason, and time bounds.

    Attributes:
        mode: Execution mode (PARALLEL/SINGLE/REJECT)
        reason: Empty string if PARALLEL, explanation otherwise
        bounds: Time bounds tuple (lo, hi) or (None, None)

    Examples:
        >>> result = ChunkabilityResult(
        ...     mode=ChunkabilityMode.PARALLEL,
        ...     reason="",
        ...     bounds=(datetime(2024, 1, 1), datetime(2024, 7, 1))
        ... )
        >>> result.mode == ChunkabilityMode.PARALLEL
        True

        >>> result = ChunkabilityResult(
        ...     mode=ChunkabilityMode.SINGLE,
        ...     reason="$natural sort requires insertion order",
        ...     bounds=(datetime(2024, 1, 1), datetime(2024, 7, 1))
        ... )
        >>> result.mode == ChunkabilityMode.SINGLE
        True

        >>> result = ChunkabilityResult(
        ...     mode=ChunkabilityMode.REJECT,
        ...     reason="empty $or array (invalid MongoDB syntax)",
        ...     bounds=(None, None)
        ... )
        >>> result.mode == ChunkabilityMode.REJECT
        True
    """

    mode: ChunkabilityMode
    reason: str
    bounds: Tuple[Optional[datetime], Optional[datetime]]


# =============================================================================
# CORE VALIDATION FUNCTIONS
# =============================================================================


def has_forbidden_ops(query: Any) -> Tuple[bool, Optional[str]]:
    """
    Check if query contains any NEVER_ALLOWED operator.

    These operators require the full dataset and cannot be parallelized.
    Triggers SINGLE mode (single-worker execution).

    Recursively walks the query tree looking for forbidden operator keys.
    Returns on the first forbidden operator found (fail-fast).

    Args:
        query: MongoDB query (dict, list, or primitive)

    Returns:
        Tuple of (has_forbidden, operator_name)

    Examples:
        >>> has_forbidden_ops({"status": "active"})
        (False, None)

        >>> has_forbidden_ops({"location": {"$near": [0, 0]}})
        (True, '$near')

        >>> has_forbidden_ops({"$and": [{"$text": {"$search": "test"}}]})
        (True, '$text')
    """
    if isinstance(query, dict):
        for key, value in query.items():
            if key in NEVER_ALLOWED:
                return True, key
            found, op = has_forbidden_ops(value)
            if found:
                return True, op
    elif isinstance(query, list):
        for item in query:
            found, op = has_forbidden_ops(item)
            if found:
                return True, op
    return False, None
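
# A hedged illustration of the fail-fast walk above: the first NEVER_ALLOWED
# key found anywhere in the tree short-circuits the scan, even when it is
# buried inside $and/$or nesting.
#
#     >>> has_forbidden_ops(
#     ...     {"$and": [{"a": 1}, {"$or": [{"loc": {"$geoWithin": {}}}]}]}
#     ... )
#     (True, '$geoWithin')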


def has_unknown_operators(query: Any) -> Tuple[bool, Optional[str]]:
    """
    Check if query contains operators not in our classification lists.

    This provides a conservative "fail-closed" approach for MongoDB operators
    that are not yet classified. Unknown operators trigger SINGLE mode execution
    (experimental/cautious path) rather than being silently allowed.

    Args:
        query: MongoDB query (dict, list, or primitive)

    Returns:
        Tuple of (has_unknown, operator_name)

    Examples:
        >>> has_unknown_operators({"status": "active"})
        (False, None)

        >>> has_unknown_operators({"$futureOp": {"$someLogic": "..."}})
        (True, '$futureOp')

        >>> has_unknown_operators({"value": {"$gt": 100}})
        (False, None)
    """
    KNOWN_OPS = ALWAYS_ALLOWED | CONDITIONAL | NEVER_ALLOWED

    if isinstance(query, dict):
        for key, value in query.items():
            # Check if key is an operator (starts with $) and not in known lists
            if key.startswith("$") and key not in KNOWN_OPS:
                return True, key
            # Recurse into value
            found, op = has_unknown_operators(value)
            if found:
                return True, op
    elif isinstance(query, list):
        for item in query:
            found, op = has_unknown_operators(item)
            if found:
                return True, op
    return False, None
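
# Illustrative (hedged) edge cases: plain field names never start with "$", so
# only operator positions are screened; classified operators pass untouched.
#
#     >>> has_unknown_operators({"ts": {"$fooBar": 1}})  # "$fooBar" is hypothetical
#     (True, '$fooBar')
#     >>> has_unknown_operators({"$comment": "metadata ops are classified"})
#     (False, None)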


def _references_field(obj: Any, field_name: str) -> bool:
    """Check if query fragment references a specific field name."""
    if isinstance(obj, dict):
        if field_name in obj:
            return True
        return any(_references_field(v, field_name) for v in obj.values())
    elif isinstance(obj, list):
        return any(_references_field(item, field_name) for item in obj)
    return False


def _or_depth(obj: Any, current: int = 0) -> int:
    """Calculate maximum nesting depth of $or operators."""
    if isinstance(obj, dict):
        depth = current + 1 if "$or" in obj else current
        child_depths = [
            _or_depth(v, current + 1) if k == "$or" else _or_depth(v, current)
            for k, v in obj.items()
        ]
        return max([depth] + child_depths) if child_depths else depth
    elif isinstance(obj, list):
        return max((_or_depth(item, current) for item in obj), default=current)
    return current
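
# Illustrative depth values (a sketch, assuming the helper above):
#
#     >>> _or_depth({"status": "active"})
#     0
#     >>> _or_depth({"$or": [{"a": 1}, {"b": 2}]})
#     1
#     >>> _or_depth({"$or": [{"$or": [{"a": 1}]}]})  # nested -> SINGLE mode
#     2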


def check_conditional_operators(
    query: Dict[str, Any], time_field: str
) -> ValidationResult:
    """
    Validate CONDITIONAL operators are used safely.

    Rules:
    - $or: max depth 1 (no nested $or)
    - $nor: must not reference time_field
    - $not: must not be applied to time_field

    Args:
        query: MongoDB query dict
        time_field: Name of time field (e.g., "timestamp")

    Returns:
        ValidationResult with is_valid and reason

    Examples:
        >>> check_conditional_operators(
        ...     {"$or": [{"a": 1}, {"b": 2}], "ts": {"$gte": t1}},
        ...     "ts"
        ... )
        ValidationResult(is_valid=True, reason='', forbidden_operator=None)

        >>> check_conditional_operators(
        ...     {"$or": [{"$or": [{...}]}, {...}]},
        ...     "ts"
        ... )
        ValidationResult(is_valid=False, reason='nested $or (depth 2 > 1)', forbidden_operator=None)

        >>> check_conditional_operators(
        ...     {"ts": {"$not": {"$lt": "2024-01-15"}}},
        ...     "ts"
        ... )
        ValidationResult(is_valid=False, reason="$not applied to time field 'ts'", forbidden_operator=None)
    """
    # Check $or depth
    depth = _or_depth(query)
    if depth > 1:
        return ValidationResult(False, f"nested $or (depth {depth} > 1)")

    # Check for empty $or array
    def check_empty_or(obj: Any) -> Optional[str]:
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key == "$or" and isinstance(value, list) and len(value) == 0:
                    return "$or with empty array matches no documents"
                error = check_empty_or(value)
                if error:
                    return error
        elif isinstance(obj, list):
            for item in obj:
                error = check_empty_or(item)
                if error:
                    return error
        return None

    error = check_empty_or(query)
    if error:
        return ValidationResult(False, error)

    # Check $nor doesn't reference time field
    def check_tree(obj: Any, parent_key: Optional[str] = None) -> Optional[str]:
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key == "$nor" and isinstance(value, list):
                    for clause in value:
                        if _references_field(clause, time_field):
                            return f"$nor references time field '{time_field}'"
                if key == "$not" and parent_key == time_field:
                    return f"$not applied to time field '{time_field}'"
                error = check_tree(value, key)
                if error:
                    return error
        elif isinstance(obj, list):
            for item in obj:
                error = check_tree(item, parent_key)
                if error:
                    return error
        return None

    error = check_tree(query)
    return ValidationResult(False, error) if error else ValidationResult(True)
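
# A hedged usage sketch: ValidationResult is truthy exactly when the query is
# valid, so the check composes naturally with boolean tests.
#
#     >>> ok = check_conditional_operators(
#     ...     {"$nor": [{"status": "deleted"}]}, "timestamp"
#     ... )
#     >>> bool(ok)
#     True
#     >>> bad = check_conditional_operators(
#     ...     {"timestamp": {"$not": {"$lt": 0}}}, "timestamp"
#     ... )
#     >>> bad.reason
#     "$not applied to time field 'timestamp'"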


def validate_query_for_chunking(
    query: Dict[str, Any], time_field: str
) -> Tuple[bool, str]:
    """
    Validate query operators are compatible with chunking.

    This validates operators only - does not check for time bounds.
    For full chunkability check including time bounds, use is_chunkable_query().

    Args:
        query: MongoDB find() filter
        time_field: Name of time field for chunking

    Returns:
        Tuple of (is_valid, reason)

    Examples:
        # Valid query with common operators
        >>> validate_query_for_chunking({
        ...     "account_id": ObjectId("..."),
        ...     "region_id": {"$in": [ObjectId("..."), ...]},
        ...     "timestamp": {"$gte": t1, "$lt": t2}
        ... }, "timestamp")
        (True, '')

        # $or with per-branch time ranges (typical XLR8 pattern)
        >>> validate_query_for_chunking({
        ...     "$or": [
        ...         {"sensor": "A", "timestamp": {"$gte": t1, "$lt": t2}},
        ...         {"sensor": "B", "timestamp": {"$gte": t3, "$lt": t4}}
        ...     ],
        ...     "account_id": ObjectId("...")
        ... }, "timestamp")
        (True, '')

        # Cannot chunk: contains $expr (requires full dataset)
        >>> validate_query_for_chunking({
        ...     "$expr": {"$gt": ["$endTime", "$startTime"]}
        ... }, "timestamp")
        (False, "operator '$expr' requires full dataset (cannot chunk)")

        # Cannot chunk: geospatial operator
        >>> validate_query_for_chunking({
        ...     "location": {"$near": {"$geometry": {...}}}
        ... }, "timestamp")
        (False, "operator '$near' requires full dataset (cannot chunk)")
    """
    # Check for operators requiring full dataset (cannot chunk/parallelize).
    # Recurses the query tree and returns on the first forbidden operator found.
    has_forbidden, op = has_forbidden_ops(query)
    if has_forbidden:
        return False, f"operator '{op}' requires full dataset (cannot chunk)"

    # Validate conditional operators
    result = check_conditional_operators(query, time_field)
    if not result:
        return False, result.reason

    return True, ""
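
# Illustrative (hedged): conditional-operator violations surface through the
# same (bool, reason) interface as forbidden operators.
#
#     >>> validate_query_for_chunking(
#     ...     {"$nor": [{"timestamp": {"$lt": 0}}]}, "timestamp"
#     ... )
#     (False, "$nor references time field 'timestamp'")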


# =============================================================================
# QUERY STRUCTURE ANALYSIS
# =============================================================================


def or_depth(obj: Any, depth: int = 0) -> int:
    """
    Calculate $or nesting depth (backwards-compatible API).

    Returns 0 for no $or, 1 for top-level $or, 2+ for nested.
    """
    if isinstance(obj, dict):
        local = 1 if "$or" in obj else 0
        return max(
            [depth + local]
            + [or_depth(v, depth + (1 if k == "$or" else 0)) for k, v in obj.items()]
        )
    if isinstance(obj, list):
        return max((or_depth(x, depth) for x in obj), default=depth)
    return depth


def split_global_and(
    query: Dict[str, Any],
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """
    Split query into global AND conditions and $or branches.

    Used by bracket extraction (brackets.py) to create parallel work units.
    Note: is_chunkable_query() uses normalize_query() for validation.

    Args:
        query: MongoDB query dict

    Returns:
        Tuple of (global_conditions, or_branches)
        or_branches is an empty list if no $or is present

    Examples:
        # Simple query without $or
        >>> split_global_and({"status": "active", "value": {"$gt": 0}})
        ({'status': 'active', 'value': {'$gt': 0}}, [])

        # Query with $or - separates global from branches
        >>> split_global_and({
        ...     "$or": [{"sensor": "A"}, {"sensor": "B"}],
        ...     "account_id": "123",
        ...     "timestamp": {"$gte": t1, "$lt": t2}
        ... })
        ({'account_id': '123', 'timestamp': {...}}, [{'sensor': 'A'}, {'sensor': 'B'}])

        # The global conditions apply to ALL branches:
        #   Bracket 1: {"account_id": "123", "timestamp": {...}, "sensor": "A"}
        #   Bracket 2: {"account_id": "123", "timestamp": {...}, "sensor": "B"}
    """
    q = dict(query)

    # Case 1: Direct top-level $or
    if "$or" in q:
        or_list = q.pop("$or")
        if not isinstance(or_list, list):
            return {}, []

        global_and: Dict[str, Any] = {}
        if "$and" in q and isinstance(q["$and"], list):
            for item in q.pop("$and"):
                if isinstance(item, dict):
                    global_and.update(item)
        global_and.update(q)
        return global_and, or_list

    # Case 2: $or inside $and
    if "$and" in q and isinstance(q["$and"], list):
        and_items = q.pop("$and")
        found_or: List[Dict[str, Any]] = []
        global_and: Dict[str, Any] = {}

        for item in and_items:
            if not isinstance(item, dict):
                return {}, []
            if "$or" in item:
                if found_or:
                    return {}, []  # Multiple $or not supported
                or_content = item["$or"]
                if not isinstance(or_content, list):
                    return {}, []
                found_or = or_content
            else:
                global_and.update(item)

        global_and.update(q)
        return global_and, found_or

    # Case 3: No $or
    return q, []
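
# Illustrative (hedged): the $or may also arrive wrapped in an explicit $and;
# the surrounding conjuncts become global conditions shared by every branch.
#
#     >>> split_global_and({
#     ...     "$and": [
#     ...         {"account": 1},
#     ...         {"$or": [{"sensor": "A"}, {"sensor": "B"}]},
#     ...     ]
#     ... })
#     ({'account': 1}, [{'sensor': 'A'}, {'sensor': 'B'}])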


# =============================================================================
# TIME BOUNDS EXTRACTION
# =============================================================================


def normalize_datetime(dt: Any) -> datetime | None:
    """
    Normalize to timezone-aware UTC datetime.

    Handles datetime objects and ISO format strings.
    Returns None if parsing fails.
    """
    if isinstance(dt, datetime):
        return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)

    if isinstance(dt, str):
        try:
            parsed = datetime.fromisoformat(dt.replace("Z", "+00:00"))
            return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)
        except (ValueError, AttributeError):
            return None
    return None
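
# Illustrative normalization (a sketch; naive datetimes are assumed UTC):
#
#     >>> normalize_datetime("2024-01-01T00:00:00Z")
#     datetime.datetime(2024, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
#     >>> normalize_datetime(datetime(2024, 1, 1)).tzinfo
#     datetime.timezone.utc
#     >>> normalize_datetime("not a date") is None
#     True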


# =============================================================================
# QUERY NORMALIZATION AND TIME BOUNDS EXTRACTION
# =============================================================================


def normalize_query(query: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, bool]]:
    """
    Normalize query structure for consistent analysis.

    Transformations:
    - Flatten nested $and operators
    - Detect complexity patterns (multiple $or, nested $or)

    Args:
        query: MongoDB find() filter

    Returns:
        Tuple of (normalized_query, complexity_flags)
        - normalized_query: Flattened query
        - complexity_flags: {multiple_or, nested_or, complex_negation}
    """

    def flatten_and_operators(obj: Any) -> Any:
        """Recursively flatten nested $and operators."""
        if not isinstance(obj, dict):
            return obj

        result = {}
        for key, value in obj.items():
            if key == "$and" and isinstance(value, list):
                # Flatten nested $and
                flattened = []
                for item in value:
                    if isinstance(item, dict) and len(item) == 1 and "$and" in item:
                        # Nested $and - merge up
                        flattened.extend(flatten_and_operators(item)["$and"])
                    else:
                        flattened.append(flatten_and_operators(item))
                result["$and"] = flattened
            elif isinstance(value, dict):
                result[key] = flatten_and_operators(value)
            elif isinstance(value, list):
                result[key] = [flatten_and_operators(item) for item in value]
            else:
                result[key] = value

        return result

    def count_or_operators(obj: Any, depth: int = 0) -> Tuple[int, int]:
        """
        Count $or operators and find max nesting depth.
        Returns (or_count, max_or_depth)
        """
        if not isinstance(obj, dict):
            return 0, depth

        or_count = 0
        max_depth = depth

        for key, value in obj.items():
            if key == "$or":
                or_count += 1
                current_depth = depth + 1
                max_depth = max(max_depth, current_depth)

                # Check for nested $or inside branches
                if isinstance(value, list):
                    for branch in value:
                        sub_count, sub_depth = count_or_operators(branch, current_depth)
                        or_count += sub_count
                        max_depth = max(max_depth, sub_depth)
            elif isinstance(value, dict):
                sub_count, sub_depth = count_or_operators(value, depth)
                or_count += sub_count
                max_depth = max(max_depth, sub_depth)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        sub_count, sub_depth = count_or_operators(item, depth)
                        or_count += sub_count
                        max_depth = max(max_depth, sub_depth)

        return or_count, max_depth

    # Step 1: Flatten nested $and
    normalized = flatten_and_operators(query)

    # Step 2: Detect $or complexity
    or_count, max_or_depth = count_or_operators(normalized)

    # Step 3: Build complexity flags
    flags = {
        "multiple_or": or_count > 1,
        "nested_or": max_or_depth > 1,
        "complex_negation": False,  # Checked later by check_negation_safety()
    }

    return normalized, flags
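
# Illustrative (hedged): nested $and lists are flattened in place, and $or
# complexity is reported through flags rather than raised as an error.
#
#     >>> q, flags = normalize_query({"$and": [{"$and": [{"a": 1}]}, {"b": 2}]})
#     >>> q
#     {'$and': [{'a': 1}, {'b': 2}]}
#     >>> flags["multiple_or"], flags["nested_or"]
#     (False, False)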


def extract_time_bounds_recursive(
    query: Dict[str, Any], time_field: str, context: str = "POSITIVE"
) -> Tuple[Optional[Tuple[datetime, datetime, bool, bool]], bool]:
    """
    Recursively extract time bounds from query tree.

    Handles nested structures, $and (intersection), $or (union).

    Args:
        query: Query dict
        time_field: Name of time field
        context: "POSITIVE" or "NEGATED" (inside $nor/$not)

    Returns:
        Tuple of (time_bounds, has_time_ref)
        - time_bounds: (lo, hi, hi_inclusive, lo_inclusive) or None
            - lo: Lower bound datetime
            - hi: Upper bound datetime
            - hi_inclusive: True if original query used $lte, False if $lt
            - lo_inclusive: True if original query used $gte, False if $gt
        - has_time_ref: True if query references time field anywhere
    """

    def extract_from_time_field(value: Any) -> Tuple[Optional[Tuple], bool]:
        """Extract bounds from time field value."""
        if context == "NEGATED":
            # Time field in negated context -> can't use
            return None, True

        if not isinstance(value, dict):
            # Direct equality: {"timestamp": t1}
            dt = normalize_datetime(value)
            # Equality is inclusive on both sides
            return ((dt, dt, True, True), True) if dt else (None, True)

        lo, hi, hi_inclusive, lo_inclusive = None, None, False, True

        for op, operand in value.items():
            if op == "$gte":
                new_lo = normalize_datetime(operand)
                # Take most restrictive lower bound
                if new_lo:
                    if lo is None or new_lo > lo:
                        lo = new_lo
                        lo_inclusive = True
                    elif new_lo == lo:
                        lo_inclusive = True  # Keep inclusive if same value
            elif op == "$gt":
                dt = normalize_datetime(operand)
                if dt:
                    # $gt is exclusive - track the actual value, not adjusted
                    if lo is None or dt > lo:
                        lo = dt
                        lo_inclusive = False
                    elif dt == lo:
                        # $gt is more restrictive than $gte at same value
                        lo_inclusive = False
            elif op == "$lt":
                new_hi = normalize_datetime(operand)
                # Take most restrictive upper bound
                if new_hi:
                    if hi is None or new_hi < hi:
                        hi = new_hi
                        hi_inclusive = False
                    elif new_hi == hi:
                        hi_inclusive = False  # $lt is more restrictive
            elif op == "$lte":
                dt = normalize_datetime(operand)
                if dt:
                    # $lte is inclusive - track the actual value
                    if hi is None or dt < hi:
                        hi = dt
                        hi_inclusive = True
                    elif dt == hi:
                        hi_inclusive = True  # Keep inclusive if same value
            elif op == "$eq":
                dt = normalize_datetime(operand)
                lo = hi = dt
                hi_inclusive = True  # Equality is inclusive
                lo_inclusive = True  # Equality is inclusive
            elif op == "$in":
                # Take envelope
                if isinstance(operand, list):
                    if not operand:
                        # Empty $in array matches no documents
                        return None, True
                    dates = [normalize_datetime(d) for d in operand]
                    dates = [d for d in dates if d is not None]
                    if dates:
                        lo = min(dates)
                        hi = max(dates)
                        lo_inclusive = True  # $in with dates is inclusive
                        hi_inclusive = True  # $in with dates is inclusive
            elif op in {"$ne", "$nin", "$not"}:
                # Negation on time field
                return None, True

        if lo is not None and hi is not None:
            # Validate bounds are sensible
            if lo > hi or (lo == hi and not (hi_inclusive and lo_inclusive)):
                # Contradictory bounds (e.g., $gte: 2024-02-01, $lt: 2024-01-01)
                return None, True
            return (lo, hi, hi_inclusive, lo_inclusive), True

        return None, True

    def intersect_bounds(b1: Tuple, b2: Tuple) -> Optional[Tuple]:
        """Intersect two bounds, taking most restrictive operators."""
        lo1, hi1, hi_inc1, lo_inc1 = b1
        lo2, hi2, hi_inc2, lo_inc2 = b2

        # Take max lower bound
        if lo1 > lo2:
            lo = lo1
            lo_inclusive = lo_inc1
        elif lo2 > lo1:
            lo = lo2
            lo_inclusive = lo_inc2
        else:  # lo1 == lo2
            lo = lo1
            lo_inclusive = lo_inc1 and lo_inc2  # Both must be inclusive

        # Take min upper bound
        if hi1 < hi2:
            hi = hi1
            hi_inclusive = hi_inc1
        elif hi2 < hi1:
            hi = hi2
            hi_inclusive = hi_inc2
        else:  # hi1 == hi2
            hi = hi1
            hi_inclusive = hi_inc1 and hi_inc2  # Both must be inclusive

        if lo > hi or (lo == hi and not (hi_inclusive and lo_inclusive)):
            return None  # Empty intersection

        return (lo, hi, hi_inclusive, lo_inclusive)

    # Check if this is time field directly
    if time_field in query:
        return extract_from_time_field(query[time_field])

    # Handle $and (intersection of bounds)
    if "$and" in query:
        all_bounds = []
        has_time_ref = False

        for item in query["$and"]:
            if isinstance(item, dict):
                bounds, has_ref = extract_time_bounds_recursive(
                    item, time_field, context
                )
                if has_ref:
                    has_time_ref = True
                if bounds:
                    all_bounds.append(bounds)

        if not all_bounds:
            return None, has_time_ref

        # Intersection
        merged = all_bounds[0]
        for bounds in all_bounds[1:]:
            merged = intersect_bounds(merged, bounds)
            if merged is None:
                return None, has_time_ref

        return merged, has_time_ref

    # Handle $or (union/envelope of bounds)
    if "$or" in query:
        all_bounds = []
        all_have_time_ref = []
        has_time_ref = False
        has_any_partial_or_missing = False

        for item in query["$or"]:
            if isinstance(item, dict):
                bounds, has_ref = extract_time_bounds_recursive(
                    item, time_field, context
                )
                all_have_time_ref.append(has_ref)

                if has_ref:
                    has_time_ref = True

                if bounds is None:
                    # Branch references time field but has partial/no bounds
                    has_any_partial_or_missing = True
                else:
                    all_bounds.append(bounds)
            else:
                # Branch doesn't reference time field at all
                has_any_partial_or_missing = True

        # CRITICAL: If ANY branch is unbounded, partial, or doesn't reference
        # time, we cannot safely extract bounds. Taking the envelope of only
        # bounded branches would cause data loss from unbounded/unreferenced
        # branches.
        if has_any_partial_or_missing:
            return None, has_time_ref

        if not all_bounds:
            return None, has_time_ref

        # All branches have full bounds - safe to take union (envelope)
        # For union: take min lo, max hi
        # For inclusivity: preserve inclusive if ANY branch uses it at the boundary
        min_lo = min(b[0] for b in all_bounds)
        max_hi = max(b[1] for b in all_bounds)

        # hi_inclusive is True if ANY branch with max_hi uses $lte
        hi_inclusive = any(b[2] for b in all_bounds if b[1] == max_hi)
        # lo_inclusive is True if ANY branch with min_lo uses $gte
        lo_inclusive = any(b[3] for b in all_bounds if b[0] == min_lo)

        return (min_lo, max_hi, hi_inclusive, lo_inclusive), has_time_ref

    # Handle $nor (negates context)
    if "$nor" in query:
        new_context = "NEGATED" if context == "POSITIVE" else "POSITIVE"
        has_time_ref = False

        for item in query["$nor"]:
            if isinstance(item, dict):
                _, has_ref = extract_time_bounds_recursive(
                    item, time_field, new_context
                )
                if has_ref:
                    has_time_ref = True

        # $nor with time ref is unsafe (inverted bounds)
        return None, has_time_ref

    # Check all nested dicts
    all_bounds = []
    has_time_ref = False

    for _, value in query.items():
        if isinstance(value, dict):
            bounds, has_ref = extract_time_bounds_recursive(value, time_field, context)
            if has_ref:
                has_time_ref = True
            if bounds:
                all_bounds.append(bounds)
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    bounds, has_ref = extract_time_bounds_recursive(
                        item, time_field, context
                    )
                    if has_ref:
                        has_time_ref = True
                    if bounds:
                        all_bounds.append(bounds)

    # Merge bounds (intersection)
    if not all_bounds:
        return None, has_time_ref

    merged = all_bounds[0]
    for bounds in all_bounds[1:]:
        merged = intersect_bounds(merged, bounds)
        if merged is None:
            return None, has_time_ref

    return merged, has_time_ref
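
# Illustrative (hedged): $and intersects bounds, and the returned 4-tuple also
# records which side is inclusive ($gte/$lte) versus exclusive ($gt/$lt).
#
#     >>> t1 = datetime(2024, 1, 1, tzinfo=timezone.utc)
#     >>> t2 = datetime(2024, 2, 1, tzinfo=timezone.utc)
#     >>> bounds, has_ref = extract_time_bounds_recursive(
#     ...     {"timestamp": {"$gte": t1, "$lt": t2}}, "timestamp"
#     ... )
#     >>> bounds == (t1, t2, False, True)  # (lo, hi, hi_inclusive, lo_inclusive)
#     True
#     >>> has_ref
#     True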


def check_negation_safety(query: Dict[str, Any], time_field: str) -> Tuple[bool, str]:
    """
    Check if negation operators safely avoid time field.

    Ensures $nor, $not, $ne, $nin don't reference the time field.

    Args:
        query: MongoDB find() filter
        time_field: Name of time field

    Returns:
        Tuple of (is_safe, rejection_reason)
    """

    def references_time_field(obj: Any, depth: int = 0) -> bool:
        """Check if query references time field at any nesting level."""
        if depth > 10:  # Prevent infinite recursion
            return False

        if not isinstance(obj, dict):
            return False

        if time_field in obj:
            return True

        for _key, value in obj.items():
            if isinstance(value, dict):
                if references_time_field(value, depth + 1):
                    return True
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        if references_time_field(item, depth + 1):
                            return True

        return False

    def find_time_negations(obj: Any) -> List[str]:
        """Find negation operators applied to time field."""
        if not isinstance(obj, dict):
            return []

        negations = []

        if time_field in obj:
            time_value = obj[time_field]
            if isinstance(time_value, dict):
                for op in ["$ne", "$nin", "$not"]:
                    if op in time_value:
                        negations.append(op)

        for _key, value in obj.items():
            if isinstance(value, dict):
                negations.extend(find_time_negations(value))
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        negations.extend(find_time_negations(item))

        return negations

    # Check $nor
    if "$nor" in query:
        for branch in query["$nor"]:
            if isinstance(branch, dict) and references_time_field(branch):
                return False, f"$nor references time field '{time_field}'"

    # Check $not, $ne, $nin on time field
    unsafe = find_time_negations(query)
    if unsafe:
        return False, f"Negation operators on time field: {', '.join(set(unsafe))}"

    return True, ""
|
|
1330
|
+
|
|
1331
|
+
|
|
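Two short examples of the behavior above (a hedged sketch; field names are
illustrative):

from datetime import datetime

# Negation on a non-time field is safe.
ok, why = check_negation_safety({"status": {"$ne": "deleted"}}, "timestamp")
# ok == True, why == ""

# Negation applied to the time field itself is flagged.
ok, why = check_negation_safety(
    {"timestamp": {"$nin": [datetime(2024, 1, 1)]}}, "timestamp"
)
# ok == False, why names the offending operator ($nin)
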
# =============================================================================
# MAIN ENTRY POINT
# =============================================================================

def is_chunkable_query(
    query: Dict[str, Any],
    time_field: str,
    sort_spec: Optional[List[Tuple[str, int]]] = None,
) -> ChunkabilityResult:
    """
    Determine execution mode for query (PARALLEL/SINGLE/REJECT).

    This is the MAIN DECISION POINT for query execution strategy. Every query
    must pass through this function before execution to ensure correctness.

    Analyzes query to determine if it can be safely parallelized (PARALLEL mode),
    requires single-worker execution (SINGLE mode), or would produce incorrect
    results (REJECT mode).

    Args:
        query: MongoDB find() filter dict
        time_field: Name of time field for chunking (e.g., "timestamp")
        sort_spec: Optional sort specification from cursor.sort() for
            $natural detection.
            Format: [("field", 1)] for ascending, [("field", -1)] for descending

    Returns:
        ChunkabilityResult (NamedTuple) with:
        - mode: ChunkabilityMode enum (PARALLEL/SINGLE/REJECT)
        - reason: str explaining the decision (empty for PARALLEL)
        - bounds: Tuple[Optional[datetime], Optional[datetime]] - extracted
          time range

        Can be unpacked as tuple (NamedTuple feature):
            mode, reason, bounds = result

    Execution Modes:
        PARALLEL: Safe for parallel time-chunked execution
        SINGLE: Valid query requiring single-worker fallback
        REJECT: Query would produce incorrect results

    Examples:
        # PARALLEL mode - standard query
        >>> result = is_chunkable_query({
        ...     "account_id": ObjectId("..."),
        ...     "timestamp": {"$gte": t1, "$lt": t2}
        ... }, "timestamp")
        >>> result.mode == ChunkabilityMode.PARALLEL
        True

        # SINGLE mode - $natural sort
        >>> result = is_chunkable_query({
        ...     "timestamp": {"$gte": t1, "$lt": t2}
        ... }, "timestamp", sort_spec=[("$natural", 1)])
        >>> result.mode == ChunkabilityMode.SINGLE
        True

        # SINGLE mode - unbounded $or
        >>> result = is_chunkable_query({
        ...     "$or": [
        ...         {"sensor": "A", "timestamp": {"$gte": t1, "$lt": t2}},
        ...         {"sensor": "B"}  # No time constraint
        ...     ]
        ... }, "timestamp")
        >>> result.mode == ChunkabilityMode.SINGLE
        True

        # SINGLE mode - nested $or (complex but executable)
        >>> result = is_chunkable_query({
        ...     "$or": [{"$or": [{"a": 1}]}]
        ... }, "timestamp")
        >>> result.mode == ChunkabilityMode.SINGLE
        True

        # SINGLE mode - operator requiring full dataset
        >>> result = is_chunkable_query({
        ...     "$text": {"$search": "test"}
        ... }, "timestamp")
        >>> result.mode == ChunkabilityMode.SINGLE
        True

        # REJECT mode - empty $or (invalid syntax)
        >>> result = is_chunkable_query({
        ...     "$or": []
        ... }, "timestamp")
        >>> result.mode == ChunkabilityMode.REJECT
        True
    """
    # =========================================================================
    # VALIDATION PIPELINE: 13 steps from most to least restrictive
    # =========================================================================
    # Step 3: Empty $or - REJECT (invalid MongoDB syntax)
    # Steps 2, 4, 4.5, 5, 6, 11: SINGLE tier (valid but not parallelizable)
    # Steps 7-9: Time reference checks - SINGLE tier (valid but not parallelizable)
    # Step 10: Contradictory bounds - REJECT (lo >= hi is impossible)
    # Step 12: Success - PARALLEL tier (safe for parallel execution)
    #
    # Philosophy: only REJECT truly invalid queries (empty $or, contradictory
    # bounds). Everything else degrades gracefully to SINGLE mode - if MongoDB
    # can execute it, so can we (single-worker) - and only a query that passes
    # every check is approved for PARALLEL (success path).
    # =========================================================================

    # Step 1: Normalize query structure
    normalized, complexity_flags = normalize_query(query)

    # Default bounds for cases where time_bounds is None
    defaults = (None, None, False, True)

    # Step 2: Check nested $or (SINGLE tier - complex but executable)
    if complexity_flags["nested_or"]:
        # Extract time bounds for single-worker execution
        time_bounds, _has_time_ref = extract_time_bounds_recursive(
            normalized, time_field
        )
        lo, hi, hi_inclusive, lo_inclusive = time_bounds or defaults

        return ChunkabilityResult(
            mode=ChunkabilityMode.SINGLE,
            reason="nested $or operators (depth > 1) require single-worker execution",
            bounds=(lo, hi),
        )

    # Step 3: Check for empty $or array (REJECT tier)
    if (
        "$or" in normalized
        and isinstance(normalized.get("$or"), list)
        and len(normalized["$or"]) == 0
    ):
        return ChunkabilityResult(
            mode=ChunkabilityMode.REJECT,
            reason="$or with empty array matches no documents",
            bounds=(None, None),
        )

    # Step 4: Check operators requiring full dataset (SINGLE tier)
    has_forbidden, op = has_forbidden_ops(normalized)
    if has_forbidden:
        # Extract time bounds for single-worker execution
        time_bounds, _has_time_ref = extract_time_bounds_recursive(
            normalized, time_field
        )
        lo, hi, hi_inclusive, lo_inclusive = time_bounds or defaults

        return ChunkabilityResult(
            mode=ChunkabilityMode.SINGLE,
            reason=f"operator '{op}' requires full dataset (single-worker execution)",
            bounds=(lo, hi),
        )

    # Step 4.5: Check for unknown operators (SINGLE tier - experimental)
    has_unknown, op = has_unknown_operators(normalized)
    if has_unknown:
        # Extract time bounds for single-worker execution
        time_bounds, _has_time_ref = extract_time_bounds_recursive(
            normalized, time_field
        )
        lo, hi, hi_inclusive, lo_inclusive = time_bounds or defaults

        return ChunkabilityResult(
            mode=ChunkabilityMode.SINGLE,
            reason=f"unknown operator '{op}' (experimental single-worker execution)",
            bounds=(lo, hi),
        )

    # Step 5: Check conditional operators (SINGLE tier)
    result = check_conditional_operators(normalized, time_field)
    if not result:
        # Extract time bounds for single-worker execution
        time_bounds, _has_time_ref = extract_time_bounds_recursive(
            normalized, time_field
        )
        lo, hi, hi_inclusive, lo_inclusive = time_bounds or defaults

        return ChunkabilityResult(
            mode=ChunkabilityMode.SINGLE, reason=result.reason, bounds=(lo, hi)
        )

    # Step 6: Check $natural sort (SINGLE tier - valid but not chunkable)
    if sort_spec and has_natural_sort(sort_spec):
        # Extract time bounds for single-worker execution
        time_bounds, _has_time_ref = extract_time_bounds_recursive(
            normalized, time_field
        )
        lo, hi, hi_inclusive, lo_inclusive = time_bounds or defaults

        return ChunkabilityResult(
            mode=ChunkabilityMode.SINGLE,
            reason="$natural sort requires insertion order (single-worker execution)",
            bounds=(lo, hi),
        )

    # Step 7: Extract time bounds
    time_bounds, has_time_ref = extract_time_bounds_recursive(normalized, time_field)

    # Step 8: Check time field reference (SINGLE tier - no time bounds)
    if not has_time_ref:
        return ChunkabilityResult(
            mode=ChunkabilityMode.SINGLE,
            reason="no time field reference found",
            bounds=(None, None),
        )

    # Step 9: Check time bounds validity (SINGLE tier - unbounded/partial)
    if time_bounds is None:
        # More specific error messages based on query structure
        if "$or" in normalized:
            reason = (
                "$or query has unbounded or partial time constraints "
                "in one or more branches"
            )
        elif "$ne" in str(normalized) or "$nin" in str(normalized):
            reason = "query contains negation operators ($ne/$nin) on time field"
        elif "$in" in str(normalized) and "[]" in str(normalized):
            reason = "query contains empty $in array on time field"
        else:
            reason = "no complete time range (invalid or contradictory bounds)"

        return ChunkabilityResult(
            mode=ChunkabilityMode.SINGLE, reason=reason, bounds=(None, None)
        )

    lo, hi, hi_inclusive, lo_inclusive = time_bounds

    # Step 10: Validate bounds are sensible (REJECT tier - contradictory)
    if lo > hi or (lo == hi and not (hi_inclusive and lo_inclusive)):
        return ChunkabilityResult(
            mode=ChunkabilityMode.REJECT,
            reason=(
                "invalid time range: lower bound >= upper bound "
                "(contradictory constraints)"
            ),
            bounds=(None, None),
        )

    # Step 11: Check negation safety (SINGLE tier - works but not parallelizable)
    is_safe, reason = check_negation_safety(normalized, time_field)
    if not is_safe:
        return ChunkabilityResult(
            mode=ChunkabilityMode.SINGLE, reason=reason, bounds=(lo, hi)
        )

    # Step 12: All checks passed - PARALLEL mode
    return ChunkabilityResult(
        mode=ChunkabilityMode.PARALLEL, reason="", bounds=(lo, hi)
    )

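A sketch of how a caller might dispatch on the result; run_parallel and
run_single below are hypothetical stand-ins for the real executor entry
points, not names from this package:

result = is_chunkable_query(query, "timestamp", sort_spec=[("timestamp", 1)])

if result.mode == ChunkabilityMode.PARALLEL:
    lo, hi = result.bounds
    run_parallel(query, lo, hi)  # hypothetical: chunk [lo, hi) across workers
elif result.mode == ChunkabilityMode.SINGLE:
    run_single(query)  # hypothetical one-worker fallback; result.reason says why
else:  # ChunkabilityMode.REJECT
    raise ValueError(f"query rejected: {result.reason}")
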
# =============================================================================
# SORT VALIDATION
# =============================================================================

def has_natural_sort(sort_spec: Optional[List[Tuple[str, int]]]) -> bool:
    """
    Check if sort specification uses $natural (insertion order).

    MongoDB's $natural sort returns documents in the order they were inserted
    into the collection (or reverse order with -1). This is incompatible with
    time-based chunking because insertion order is collection-wide and cannot
    be preserved when splitting queries by time ranges.

    CRITICAL: This validation prevents silent data corruption. If $natural sort
    were combined with chunking, documents would come back in time-chunk order
    (arbitrary within each chunk), not in true insertion order.

    Args:
        sort_spec: Sort specification from cursor.sort(), e.g., [("timestamp", 1)]
            or [("$natural", 1)] for insertion order. Can be None or empty.

    Returns:
        True if $natural sort is detected, False otherwise
        (including for malformed input)

    Examples:
        >>> has_natural_sort([("$natural", 1)])
        True

        >>> has_natural_sort([("$natural", -1)])
        True

        >>> has_natural_sort([("timestamp", 1)])
        False

        >>> has_natural_sort([("timestamp", 1), ("_id", 1)])
        False

        >>> has_natural_sort(None)
        False

        >>> has_natural_sort([])  # Empty list
        False
    """
    # DEFENSE: Handle None or empty sort_spec
    if not sort_spec:
        return False

    # DEFENSE: Validate sort_spec structure before iteration
    # Protects against malformed input that could cause exceptions
    if not isinstance(sort_spec, list):
        return False

    # Check each sort field for $natural
    # Using try-except for robustness in case of unexpected tuple structure
    for item in sort_spec:
        try:
            # Expected format: (field_name, direction)
            if isinstance(item, (tuple, list)) and len(item) >= 2:
                field = item[0]
                if field == "$natural":
                    return True
        except (TypeError, ValueError, IndexError):
            # Malformed item - skip it gracefully rather than crashing
            continue

    return False

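Beyond the doctests, the defensive branches above turn malformed input into
"no $natural found" rather than an exception (a small sketch):

assert has_natural_sort("not a list") is False        # non-list input
assert has_natural_sort([("timestamp",)]) is False    # too-short tuple skipped
assert has_natural_sort([None, ("$natural", 1)]) is True  # junk items ignored
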
def validate_sort_field(
    sort_spec: Optional[List[Tuple[str, int]]],
    schema: Any,
) -> ValidationResult:
    """
    Validate that sort fields are compatible with XLR8.

    Now supports:
    - Parent field sorting (e.g., "metadata" when schema has "metadata.region_id")
    - Types.Any() sorting with MongoDB-compatible type ordering

    Args:
        sort_spec: Sort specification from cursor, e.g.,
            [("timestamp", 1), ("value", -1)]
        schema: XLR8 Schema object with field type definitions

    Returns:
        ValidationResult with is_valid=True if sort is allowed.

    Example:
        >>> from xlr8.schema import Schema, Types
        >>> schema = Schema(
        ...     time_field="timestamp",
        ...     fields={
        ...         "timestamp": Types.Timestamp("ms"),
        ...         "metadata.account_id": Types.ObjectId(),
        ...         "value": Types.Any(),  # Now allowed!
        ...     }
        ... )
        >>> validate_sort_field([("timestamp", 1)], schema)
        ValidationResult(is_valid=True, reason='')

        >>> validate_sort_field([("metadata", 1)], schema)  # Parent field
        ValidationResult(is_valid=True, reason='')

        >>> validate_sort_field([("value", 1)], schema)  # Any type
        ValidationResult(is_valid=True, reason='')
    """
    if not sort_spec:
        return ValidationResult(True, "")

    # Check for $natural sort (insertion order)
    if has_natural_sort(sort_spec):
        return ValidationResult(
            False,
            "$natural sort (insertion order) is incompatible with time-based chunking. "
            "Use time field sorting instead: [('timestamp', 1)]",
        )

    if schema is None or not hasattr(schema, "fields"):
        # No schema to validate against - allow sort
        return ValidationResult(True, "")

    for field_name, _direction in sort_spec:
        # Check if field exists directly in schema
        if field_name in schema.fields:
            # Field exists - always valid now (Any() supported)
            continue

        # Check if it's a parent field (e.g., "metadata" for "metadata.region_id")
        is_parent = False
        for schema_field in schema.fields.keys():
            if schema_field.startswith(field_name + "."):
                is_parent = True
                break

        if is_parent:
            # Parent field sorting is valid
            continue

        # Field not found in schema - error
        available_fields = sorted(schema.fields.keys())[:10]
        return ValidationResult(
            False,
            f"Sort field '{field_name}' not found in schema. "
            f"Available fields: {available_fields}"
            + ("..." if len(schema.fields) > 10 else ""),
        )

    return ValidationResult(True, "")

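The failure path, for contrast (reusing the schema from the docstring example;
the reason text follows the code above):

result = validate_sort_field([("no_such_field", 1)], schema)
# result.is_valid == False
# result.reason starts with "Sort field 'no_such_field' not found in schema."
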
def get_sort_field_info(
    sort_spec: List[Tuple[str, int]],
    schema: Any,
) -> List[dict]:
    """
    Analyze sort fields and return metadata for DuckDB sorting.

    Returns a list of dicts with:
    - field: Original field name
    - direction: 1 (ASC) or -1 (DESC)
    - is_any: True if Types.Any()
    - is_list: True if Types.List() (requires DuckDB - pandas can't sort arrays)
    - is_parent: True if parent field (expand to children)
    - child_fields: List of child fields if is_parent
    """
    # Import here to avoid circular dependency (schema imports analysis)
    try:
        from xlr8.schema.types import Any as AnyType
        from xlr8.schema.types import List as ListType
    except ImportError:
        AnyType = None
        ListType = None

    result = []

    for field_name, direction in sort_spec:
        info = {
            "field": field_name,
            "direction": direction,
            "is_any": False,
            "is_list": False,
            "is_parent": False,
            "child_fields": [],
        }

        # Check if field is in schema
        if field_name in schema.fields:
            field_type = schema.fields[field_name]
            if AnyType and (
                isinstance(field_type, AnyType)
                or (isinstance(field_type, type) and issubclass(field_type, AnyType))
            ):
                info["is_any"] = True
            elif ListType and isinstance(field_type, ListType):
                info["is_list"] = True
        else:
            # Check for parent field
            children = []
            for schema_field in schema.fields.keys():
                if schema_field.startswith(field_name + "."):
                    children.append(schema_field)
            if children:
                info["is_parent"] = True
                info["child_fields"] = sorted(children)  # Consistent order

        result.append(info)

    return result

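With the docstring schema from validate_sort_field above, a parent-field sort
would come back roughly as follows (illustrative sketch):

infos = get_sort_field_info([("metadata", -1)], schema)
# [{"field": "metadata", "direction": -1, "is_any": False, "is_list": False,
#   "is_parent": True, "child_fields": ["metadata.account_id"]}]
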
def generate_sort_sql(
    sort_spec: List[Tuple[str, int]],
    schema: Any,
) -> str:
    """
    Generate DuckDB ORDER BY clause for advanced sorting.

    Handles:
    - Simple fields: ORDER BY "timestamp" ASC
    - Parent fields: ORDER BY "metadata.region_id" DESC, "metadata.source_id" DESC
    - Any() fields: Composite sort with type priority (MongoDB BSON order)

    MongoDB BSON type ordering:
    1. MinKey (internal)
    2. Null
    3. Numbers (int, float, decimal)
    4. String
    5. Object/Document
    6. Array
    7. Binary
    8. ObjectId
    9. Boolean
    10. Date
    11. Timestamp (internal)
    12. Regex
    13. MaxKey (internal)

    Returns:
        ORDER BY clause string (without "ORDER BY" prefix)
    """
    field_infos = get_sort_field_info(sort_spec, schema)
    order_parts = []

    for info in field_infos:
        order = "ASC" if info["direction"] == 1 else "DESC"

        if info["is_any"]:
            # Composite sort for Any() type - MongoDB BSON ordering
            field = info["field"]
            # Type priority (matching MongoDB BSON order). We use the struct
            # fields: null_value, float_value, int32_value, int64_value,
            # string_value, document_value, array_value, binary_value,
            # objectid_value, bool_value, datetime_value, regex_value,
            # decimal128_value
            type_priority = f"""
            CASE
                WHEN "{field}".null_value = true THEN 2
                WHEN "{field}".float_value IS NOT NULL THEN 3
                WHEN "{field}".int32_value IS NOT NULL THEN 3
                WHEN "{field}".int64_value IS NOT NULL THEN 3
                WHEN "{field}".string_value IS NOT NULL THEN 4
                WHEN "{field}".document_value IS NOT NULL THEN 5
                WHEN "{field}".array_value IS NOT NULL THEN 6
                WHEN "{field}".binary_value IS NOT NULL THEN 7
                WHEN "{field}".objectid_value IS NOT NULL THEN 8
                WHEN "{field}".bool_value IS NOT NULL THEN 9
                WHEN "{field}".datetime_value IS NOT NULL THEN 10
                WHEN "{field}".regex_value IS NOT NULL THEN 12
                WHEN "{field}".decimal128_value IS NOT NULL THEN 3
                ELSE 99
            END""".strip()

            # Numeric value (for numeric types)
            numeric_val = f"""COALESCE(
                "{field}".float_value,
                CAST("{field}".int64_value AS DOUBLE),
                CAST("{field}".int32_value AS DOUBLE),
                0
            )"""

            # String value (for string/objectid types)
            string_val = f"""COALESCE(
                "{field}".string_value,
                "{field}".objectid_value,
                "{field}".document_value,
                "{field}".array_value,
                ''
            )"""

            # Datetime value
            datetime_val = f'"{field}".datetime_value'

            # Bool value
            bool_val = f'CAST("{field}".bool_value AS INTEGER)'

            order_parts.append(f"({type_priority}) {order}")
            order_parts.append(f"({numeric_val}) {order}")
            order_parts.append(f"({string_val}) {order}")
            order_parts.append(f"({datetime_val}) {order}")
            order_parts.append(f"({bool_val}) {order}")

        elif info["is_parent"]:
            # Parent field - expand to all children
            for child in info["child_fields"]:
                order_parts.append(f'"{child}" {order}')
        else:
            # Simple field (local variable avoids nested same-type quotes
            # inside the f-string, which only Python 3.12+ accepts)
            simple_field = info["field"]
            order_parts.append(f'"{simple_field}" {order}')

    return ", ".join(order_parts)
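
For the same schema, a mixed spec yields a plain clause for the simple field
and an expansion for the parent field (hedged sketch; an Any() field would
instead emit the five composite parts built above):

sql = generate_sort_sql([("timestamp", 1), ("metadata", -1)], schema)
# '"timestamp" ASC, "metadata.account_id" DESC'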