xlr8 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlr8/__init__.py +109 -0
- xlr8/_xlr8_rust.pyi +71 -0
- xlr8/analysis/__init__.py +58 -0
- xlr8/analysis/brackets.py +1201 -0
- xlr8/analysis/chunker.py +118 -0
- xlr8/analysis/inspector.py +1889 -0
- xlr8/collection/__init__.py +6 -0
- xlr8/collection/cursor.py +2145 -0
- xlr8/collection/cursor.pyi +173 -0
- xlr8/collection/wrapper.py +661 -0
- xlr8/collection/wrapper.pyi +218 -0
- xlr8/constants.py +24 -0
- xlr8/execution/__init__.py +43 -0
- xlr8/execution/callback.py +792 -0
- xlr8/execution/executor.py +500 -0
- xlr8/execution/planner.py +377 -0
- xlr8/py.typed +1 -0
- xlr8/rust_backend.py +42 -0
- xlr8/rust_backend.pyi +71 -0
- xlr8/schema/__init__.py +42 -0
- xlr8/schema/encoder.py +235 -0
- xlr8/schema/schema.py +265 -0
- xlr8/schema/types.py +239 -0
- xlr8/storage/__init__.py +17 -0
- xlr8/storage/cache.py +228 -0
- xlr8/storage/reader.py +1276 -0
- xlr8-0.1.2.dist-info/METADATA +177 -0
- xlr8-0.1.2.dist-info/RECORD +30 -0
- xlr8-0.1.2.dist-info/WHEEL +4 -0
- xlr8-0.1.2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,1201 @@
|
|
|
1
|
+
"""Bracket-based query analysis for XLR8.
|
|
2
|
+
|
|
3
|
+
================================================================================
|
|
4
|
+
DATA FLOW - QUERY TO BRACKETS
|
|
5
|
+
================================================================================
|
|
6
|
+
|
|
7
|
+
This module transforms a MongoDB query into "Brackets" - the fundamental unit
|
|
8
|
+
of work for parallel execution.
|
|
9
|
+
|
|
10
|
+
WHAT IS A BRACKET?
|
|
11
|
+
--------------------------------------------------------------------------------
|
|
12
|
+
|
|
13
|
+
A Bracket = static_filter + TimeRange
|
|
14
|
+
|
|
15
|
+
It represents ONE chunk of work that can be executed independently:
|
|
16
|
+
- static_filter: Non-time conditions (e.g., {"region_id": "64a..."})
|
|
17
|
+
- timerange: Time bounds (lo, hi) that can be further chunked
|
|
18
|
+
|
|
19
|
+
EXAMPLE TRANSFORMATION:
|
|
20
|
+
--------------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
INPUT QUERY:
|
|
23
|
+
{
|
|
24
|
+
"$or": [
|
|
25
|
+
{"region_id": ObjectId("64a...")},
|
|
26
|
+
{"region_id": ObjectId("64b...")},
|
|
27
|
+
{"region_id": ObjectId("64c...")},
|
|
28
|
+
],
|
|
29
|
+
"account_id": ObjectId("123..."), # Global AND condition
|
|
30
|
+
"timestamp": {"$gte": datetime(2024,1,1), "$lt": datetime(2024,7,1)}
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
STEP 1: split_global_and() extracts:
|
|
34
|
+
global_and = {"account_id": ObjectId("123..."),
|
|
35
|
+
"timestamp": {"$gte": ..., "$lt": ...}}
|
|
36
|
+
or_list = [{"region_id": "64a..."},
|
|
37
|
+
{"region_id": "64b..."}, ...]
|
|
38
|
+
|
|
39
|
+
STEP 2: For each $or branch, merge with global_and:
|
|
40
|
+
Branch 1: {"account_id": "123...", "region_id": "64a...", "timestamp": {...}}
|
|
41
|
+
Branch 2: {"account_id": "123...", "region_id": "64b...", "timestamp": {...}}
|
|
42
|
+
...
|
|
43
|
+
|
|
44
|
+
STEP 3: Extract time bounds and create Brackets:
|
|
45
|
+
|
|
46
|
+
OUTPUT: List[Bracket]
|
|
47
|
+
|
|
48
|
+
Bracket(
|
|
49
|
+
static_filter={"account_id": "123...", "region_id": "64a..."},
|
|
50
|
+
timerange=TimeRange(lo=2024-01-01, hi=2024-07-01, is_full=True)
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
Bracket(
|
|
54
|
+
static_filter={"account_id": "123...", "region_id": "64b..."},
|
|
55
|
+
timerange=TimeRange(lo=2024-01-01, hi=2024-07-01, is_full=True)
|
|
56
|
+
)
|
|
57
|
+
...
|
|
58
|
+
|
|
59
|
+
NEXT STEP: Each bracket's timerange is chunked (14-day chunks) and queued
|
|
60
|
+
for parallel execution.
|
|
61
|
+
|
|
62
|
+
WHY BRACKETS?
|
|
63
|
+
--------------------------------------------------------------------------------
|
|
64
|
+
1. Parallelization: Each bracket can be fetched independently
|
|
65
|
+
2. Caching: Same static_filter can reuse cached data
|
|
66
|
+
3. Time chunking: TimeRange can be split into smaller chunks for workers
|
|
67
|
+
|
|
68
|
+
================================================================================
|
|
69
|
+
"""
|
|
70
|
+
|
|
71
|
+
import json
|
|
72
|
+
from copy import deepcopy
|
|
73
|
+
from dataclasses import dataclass
|
|
74
|
+
from datetime import datetime
|
|
75
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
76
|
+
|
|
77
|
+
from src.xlr8.analysis.inspector import (
|
|
78
|
+
ChunkabilityMode,
|
|
79
|
+
extract_time_bounds_recursive,
|
|
80
|
+
has_forbidden_ops,
|
|
81
|
+
is_chunkable_query,
|
|
82
|
+
normalize_query,
|
|
83
|
+
or_depth,
|
|
84
|
+
split_global_and,
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
__all__ = [
|
|
88
|
+
# Data structures
|
|
89
|
+
"Bracket",
|
|
90
|
+
"TimeRange",
|
|
91
|
+
# Main public function
|
|
92
|
+
"build_brackets_for_find",
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# =============================================================================
|
|
97
|
+
# OVERLAP DETECTION HELPERS
|
|
98
|
+
# =============================================================================
|
|
99
|
+
# These helpers detect when $or branches may have overlapping result sets,
|
|
100
|
+
# which would cause duplicates when executing brackets independently.
|
|
101
|
+
#
|
|
102
|
+
# NEGATION OPERATORS: $nin, $ne, $not, $nor in an $or branch can overlap with
|
|
103
|
+
# other branches that use positive filters on the same field.
|
|
104
|
+
#
|
|
105
|
+
# $in OVERLAP: Two branches with $in on the same field may share values.
|
|
106
|
+
#   Example: {"field": {"$in": [1,2,3]}} and {"field": {"$in": [3,4,5]}}
#   both match documents where field=3.
|
|
107
|
+
#
|
|
108
|
+
# INHERENTLY OVERLAPPING OPERATORS: Some operators can match the same document
|
|
109
|
+
# across different branches even with different values:
|
|
110
|
+
# - $all: {"tags": {"$all": ["a","b"]}} and {"tags": {"$all": ["b","c"]}}
|
|
111
|
+
# both match a document with tags: ["a","b","c"]
|
|
112
|
+
# - $elemMatch: array element matching can overlap
|
|
113
|
+
# - $regex: pattern matching can overlap
|
|
114
|
+
# - $mod: modulo conditions can overlap
|
|
115
|
+
# - Comparison operators ($gt, $lt, etc.): ranges can overlap
|
|
116
|
+
# =============================================================================
|
|
117
|
+
|
|
118
|
+
# Operators that create negation/exclusion filters.
# Defined exactly once: the earlier duplicate assignment and the dangling
# comment fragment that followed the set below were copy/paste leftovers.
NEGATION_OPERATORS: Set[str] = {"$nin", "$ne", "$not", "$nor"}

# Operators that can cause overlap between branches even with different values.
# These should trigger single-bracket execution when used on differentiating
# fields.
OVERLAP_PRONE_OPERATORS: Set[str] = {
    "$all",  # Array superset matching
    "$elemMatch",  # Array element matching
    "$regex",  # Pattern matching
    "$mod",  # Modulo matching
    "$gt",  # Greater than - ranges can overlap
    "$gte",  # Greater than or equal
    "$lt",  # Less than - ranges can overlap
    "$lte",  # Less than or equal
    "$bitsAllSet",  # Bitwise operations can overlap
    "$bitsAnySet",
    "$bitsAllClear",
    "$bitsAnyClear",
}
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@dataclass
class TimeRange:
    """
    Interval on the time field covered by a bracket.

    Either bound may be None when that side of the interval is open.

    Attributes:
        lo: Start of the interval (None = no lower bound)
        hi: End of the interval (None = no upper bound)
        is_full: Flag set by the creator when both lo and hi are specified
        hi_inclusive: True -> upper bound rendered as $lte,
            False -> $lt (default)
        lo_inclusive: True -> lower bound rendered as $gte (default),
            False -> $gt

    Example:
        TimeRange(
            lo=datetime(2024, 1, 1, tzinfo=UTC),
            hi=datetime(2024, 7, 1, tzinfo=UTC),
            is_full=True,
            hi_inclusive=False,  # $lt
            lo_inclusive=True,   # $gte
        )
    """

    lo: Optional[datetime]
    hi: Optional[datetime]
    is_full: bool
    # Upper bound is exclusive ($lt) unless told otherwise; kept as the
    # default for backward compatibility.
    hi_inclusive: bool = False
    # Lower bound is inclusive ($gte) unless told otherwise; kept as the
    # default for backward compatibility.
    lo_inclusive: bool = True
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@dataclass
class Bracket:
    """
    One independently executable slice of a query.

    A bracket pairs the non-time conditions of a query with the time
    window they apply to.

    Example:
        Bracket(
            static_filter={"account_id": ObjectId("123..."),
                           "region_id": ObjectId("64a...")},
            timerange=TimeRange(lo=2024-01-01, hi=2024-07-01, is_full=True)
        )

    which corresponds to the MongoDB query:
        {
            "account_id": ObjectId("123..."),
            "region_id": ObjectId("64a..."),
            "timestamp": {"$gte": 2024-01-01, "$lt": 2024-07-01}
        }
    """

    # Conditions on every field except the time field.
    static_filter: Dict[str, Any]
    # Time window the bracket spans.
    timerange: TimeRange
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# =============================================================================
|
|
198
|
+
# OVERLAP DETECTION HELPERS - IMPLEMENTATION
|
|
199
|
+
# =============================================================================
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _has_negation_operators(query: Dict[str, Any]) -> bool:
|
|
203
|
+
"""
|
|
204
|
+
Check if query contains any negation operators.
|
|
205
|
+
|
|
206
|
+
Negation operators ($nin, $ne, $not, $nor) in an $or branch create
|
|
207
|
+
potential overlap with other branches, leading to duplicate results.
|
|
208
|
+
|
|
209
|
+
Args:
|
|
210
|
+
query: A query dict (typically an $or branch)
|
|
211
|
+
|
|
212
|
+
Returns:
|
|
213
|
+
True if any negation operator is found at any nesting level
|
|
214
|
+
|
|
215
|
+
Examples:
|
|
216
|
+
>>> _has_negation_operators({"field": {"$in": [1,2,3]}})
|
|
217
|
+
False
|
|
218
|
+
>>> _has_negation_operators({"field": {"$nin": [1,2,3]}})
|
|
219
|
+
True
|
|
220
|
+
>>> _has_negation_operators({"$and": [{"field": {"$ne": 5}}]})
|
|
221
|
+
True
|
|
222
|
+
"""
|
|
223
|
+
|
|
224
|
+
def _check(obj: Any) -> bool:
|
|
225
|
+
if isinstance(obj, dict):
|
|
226
|
+
for key, value in obj.items():
|
|
227
|
+
if key in NEGATION_OPERATORS:
|
|
228
|
+
return True
|
|
229
|
+
if _check(value):
|
|
230
|
+
return True
|
|
231
|
+
elif isinstance(obj, list):
|
|
232
|
+
for item in obj:
|
|
233
|
+
if _check(item):
|
|
234
|
+
return True
|
|
235
|
+
return False
|
|
236
|
+
|
|
237
|
+
return _check(query)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _has_overlap_prone_operators(
|
|
241
|
+
query: Dict[str, Any], time_field: str
|
|
242
|
+
) -> Tuple[bool, Optional[str]]:
|
|
243
|
+
"""
|
|
244
|
+
Check if query contains operators that can cause overlap between branches.
|
|
245
|
+
|
|
246
|
+
These operators can match the same document even with different values:
|
|
247
|
+
- $all: array superset matching
|
|
248
|
+
- $elemMatch: array element matching
|
|
249
|
+
- $regex: pattern matching
|
|
250
|
+
- $mod: modulo matching
|
|
251
|
+
- Comparison operators ($gt, $lt, etc.): ranges can overlap
|
|
252
|
+
|
|
253
|
+
NOTE: Comparison operators on the TIME FIELD are allowed (that's how we chunk).
|
|
254
|
+
Only comparison operators on OTHER fields trigger this check.
|
|
255
|
+
|
|
256
|
+
Args:
|
|
257
|
+
query: A query dict (typically an $or branch)
|
|
258
|
+
time_field: The time field name (excluded from comparison operator check)
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
Tuple of (has_overlap_prone, operator_name)
|
|
262
|
+
|
|
263
|
+
Examples:
|
|
264
|
+
>>> _has_overlap_prone_operators({"tags": {"$all": ["a", "b"]}}, "ts")
|
|
265
|
+
(True, '$all')
|
|
266
|
+
>>> _has_overlap_prone_operators({"name": {"$regex": "^John"}}, "ts")
|
|
267
|
+
(True, '$regex')
|
|
268
|
+
>>> _has_overlap_prone_operators({"ts": {"$gte": t1, "$lt": t2}}, "ts")
|
|
269
|
+
(False, None) # Time field comparison is OK
|
|
270
|
+
>>> _has_overlap_prone_operators({"value": {"$gt": 10}}, "ts")
|
|
271
|
+
(True, '$gt') # Non-time field comparison is problematic
|
|
272
|
+
"""
|
|
273
|
+
# Operators that are always problematic (not context-dependent)
|
|
274
|
+
always_problematic = {
|
|
275
|
+
"$all",
|
|
276
|
+
"$elemMatch",
|
|
277
|
+
"$regex",
|
|
278
|
+
"$mod",
|
|
279
|
+
"$bitsAllSet",
|
|
280
|
+
"$bitsAnySet",
|
|
281
|
+
"$bitsAllClear",
|
|
282
|
+
"$bitsAnyClear",
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
# Comparison operators - only problematic on non-time fields
|
|
286
|
+
comparison_ops = {"$gt", "$gte", "$lt", "$lte"}
|
|
287
|
+
|
|
288
|
+
def _check(obj: Any, current_field: Optional[str] = None) -> Optional[str]:
|
|
289
|
+
if isinstance(obj, dict):
|
|
290
|
+
for key, value in obj.items():
|
|
291
|
+
# Track current field for comparison operator check
|
|
292
|
+
field = key if not key.startswith("$") else current_field
|
|
293
|
+
|
|
294
|
+
if key in always_problematic:
|
|
295
|
+
return key
|
|
296
|
+
|
|
297
|
+
# Comparison operators are only problematic on non-time fields
|
|
298
|
+
if key in comparison_ops and current_field != time_field:
|
|
299
|
+
return key
|
|
300
|
+
|
|
301
|
+
result = _check(value, field)
|
|
302
|
+
if result:
|
|
303
|
+
return result
|
|
304
|
+
elif isinstance(obj, list):
|
|
305
|
+
for item in obj:
|
|
306
|
+
result = _check(item, current_field)
|
|
307
|
+
if result:
|
|
308
|
+
return result
|
|
309
|
+
return None
|
|
310
|
+
|
|
311
|
+
op = _check(query)
|
|
312
|
+
return (True, op) if op else (False, None)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _extract_in_values(query: Dict[str, Any], field: str) -> Optional[Set[Any]]:
|
|
316
|
+
"""
|
|
317
|
+
Extract $in values for a specific field from query.
|
|
318
|
+
|
|
319
|
+
Args:
|
|
320
|
+
query: Query dict to search
|
|
321
|
+
field: Field name to look for $in on
|
|
322
|
+
|
|
323
|
+
Returns:
|
|
324
|
+
Set of values if $in found, None if field uses different operator or not present
|
|
325
|
+
|
|
326
|
+
Examples:
|
|
327
|
+
>>> _extract_in_values({"field": {"$in": [1, 2, 3]}}, "field")
|
|
328
|
+
{1, 2, 3}
|
|
329
|
+
>>> _extract_in_values({"field": 5}, "field") # Equality, not $in
|
|
330
|
+
None
|
|
331
|
+
>>> _extract_in_values({"other": {"$in": [1]}}, "field") # Different field
|
|
332
|
+
None
|
|
333
|
+
"""
|
|
334
|
+
if field not in query:
|
|
335
|
+
return None
|
|
336
|
+
|
|
337
|
+
val = query[field]
|
|
338
|
+
if isinstance(val, dict) and "$in" in val:
|
|
339
|
+
in_vals = val["$in"]
|
|
340
|
+
if isinstance(in_vals, list):
|
|
341
|
+
# Convert to set of hashable representations
|
|
342
|
+
result = set()
|
|
343
|
+
for v in in_vals:
|
|
344
|
+
try:
|
|
345
|
+
result.add(v)
|
|
346
|
+
except TypeError:
|
|
347
|
+
# Unhashable value - convert to string
|
|
348
|
+
result.add(str(v))
|
|
349
|
+
return result
|
|
350
|
+
|
|
351
|
+
return None
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _find_in_fields(query: Dict[str, Any]) -> Dict[str, Set[Any]]:
|
|
355
|
+
"""
|
|
356
|
+
Find all fields that use $in operator and their values.
|
|
357
|
+
|
|
358
|
+
Only looks at top-level fields (not nested in $and, etc.)
|
|
359
|
+
|
|
360
|
+
Args:
|
|
361
|
+
query: Query dict (typically an $or branch)
|
|
362
|
+
|
|
363
|
+
Returns:
|
|
364
|
+
Dict mapping field name to set of $in values
|
|
365
|
+
|
|
366
|
+
Examples:
|
|
367
|
+
>>> _find_in_fields({"a": {"$in": [1,2]}, "b": {"$in": [3,4]}})
|
|
368
|
+
{"a": {1, 2}, "b": {3, 4}}
|
|
369
|
+
>>> _find_in_fields({"a": 5, "b": {"$gt": 10}})
|
|
370
|
+
{}
|
|
371
|
+
"""
|
|
372
|
+
result: Dict[str, Set[Any]] = {}
|
|
373
|
+
|
|
374
|
+
for field, value in query.items():
|
|
375
|
+
if field.startswith("$"):
|
|
376
|
+
continue # Skip operators
|
|
377
|
+
if isinstance(value, dict) and "$in" in value:
|
|
378
|
+
in_vals = value["$in"]
|
|
379
|
+
if isinstance(in_vals, list):
|
|
380
|
+
try:
|
|
381
|
+
result[field] = set(in_vals)
|
|
382
|
+
except TypeError:
|
|
383
|
+
# Contains unhashable - convert to strings
|
|
384
|
+
result[field] = {str(v) for v in in_vals}
|
|
385
|
+
|
|
386
|
+
return result
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _get_non_time_fields(branch: Dict[str, Any], time_field: str) -> Set[str]:
|
|
390
|
+
"""Get all top-level field names except the time field and operators."""
|
|
391
|
+
return {k for k in branch.keys() if not k.startswith("$") and k != time_field}
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _check_or_branch_safety(
|
|
395
|
+
branches: List[Dict[str, Any]], global_and: Dict[str, Any], time_field: str
|
|
396
|
+
) -> Tuple[bool, str, Optional[List[Dict[str, Any]]]]:
|
|
397
|
+
"""
|
|
398
|
+
Analyze $or branches for safety (no overlapping result sets).
|
|
399
|
+
|
|
400
|
+
This function implements the safe algorithm for detecting when $or
|
|
401
|
+
branches can be executed independently as brackets vs when they must
|
|
402
|
+
be executed as a single query to avoid duplicates.
|
|
403
|
+
|
|
404
|
+
SAFETY RULES:
|
|
405
|
+
1. If ANY branch has negation operators -> UNSAFE (cannot transform)
|
|
406
|
+
2. If branches have different field sets -> UNSAFE (cannot determine overlap)
|
|
407
|
+
3. If exactly ONE $in field differs -> TRANSFORM (subtract overlapping values)
|
|
408
|
+
4. If multiple $in fields differ -> UNSAFE (explosion of combinations)
|
|
409
|
+
5. If same $in fields with disjoint values -> SAFE
|
|
410
|
+
6. If same equality values -> SAFE (same static_filter, handled by grouping)
|
|
411
|
+
|
|
412
|
+
Args:
|
|
413
|
+
branches: List of $or branch dicts
|
|
414
|
+
global_and: Global conditions applied to all branches
|
|
415
|
+
time_field: Time field name (excluded from field comparison)
|
|
416
|
+
|
|
417
|
+
Returns:
|
|
418
|
+
Tuple of (is_safe, reason, transformed_branches)
|
|
419
|
+
- is_safe: True if brackets can be executed independently
|
|
420
|
+
- reason: Description of why unsafe (empty if safe)
|
|
421
|
+
- transformed_branches: Modified branches if transformation applied,
|
|
422
|
+
None otherwise
|
|
423
|
+
"""
|
|
424
|
+
if len(branches) <= 1:
|
|
425
|
+
return True, "", None # Single branch is always safe
|
|
426
|
+
|
|
427
|
+
# Rule 1a: Check for negation operators in any branch
|
|
428
|
+
for i, branch in enumerate(branches):
|
|
429
|
+
if _has_negation_operators(branch):
|
|
430
|
+
return (
|
|
431
|
+
False,
|
|
432
|
+
f"branch {i} contains negation operator ($nin/$ne/$not/$nor)",
|
|
433
|
+
None,
|
|
434
|
+
)
|
|
435
|
+
|
|
436
|
+
# Rule 1b: Check for overlap-prone operators in any branch
|
|
437
|
+
# These operators can match the same document across branches even with
|
|
438
|
+
# different values
|
|
439
|
+
for i, branch in enumerate(branches):
|
|
440
|
+
has_overlap_op, op = _has_overlap_prone_operators(branch, time_field)
|
|
441
|
+
if has_overlap_op:
|
|
442
|
+
return False, f"branch {i} contains overlap-prone operator ({op})", None
|
|
443
|
+
|
|
444
|
+
# Merge each branch with global_and for analysis
|
|
445
|
+
effective_branches = []
|
|
446
|
+
for br in branches:
|
|
447
|
+
eff = {**global_and, **br}
|
|
448
|
+
# Remove time field for field comparison
|
|
449
|
+
if time_field in eff:
|
|
450
|
+
eff_copy = dict(eff)
|
|
451
|
+
eff_copy.pop(time_field)
|
|
452
|
+
effective_branches.append(eff_copy)
|
|
453
|
+
else:
|
|
454
|
+
effective_branches.append(eff)
|
|
455
|
+
|
|
456
|
+
# Rule 2: Check if all branches have the same field set
|
|
457
|
+
field_sets = [_get_non_time_fields(eb, time_field) for eb in effective_branches]
|
|
458
|
+
first_fields = field_sets[0]
|
|
459
|
+
for i, fs in enumerate(field_sets[1:], 1):
|
|
460
|
+
if fs != first_fields:
|
|
461
|
+
return False, f"branch {i} has different field set than branch 0", None
|
|
462
|
+
|
|
463
|
+
# All branches have same fields - now check for $in overlap
|
|
464
|
+
# Find all $in fields in each branch
|
|
465
|
+
all_in_fields: List[Dict[str, Set[Any]]] = [
|
|
466
|
+
_find_in_fields(eb) for eb in effective_branches
|
|
467
|
+
]
|
|
468
|
+
|
|
469
|
+
# Collect all $in field names across all branches
|
|
470
|
+
in_field_names: Set[str] = set()
|
|
471
|
+
for in_dict in all_in_fields:
|
|
472
|
+
in_field_names.update(in_dict.keys())
|
|
473
|
+
|
|
474
|
+
if not in_field_names:
|
|
475
|
+
# No $in fields - check for equality overlap
|
|
476
|
+
# Branches with identical static_filters will be grouped/merged by
|
|
477
|
+
# the main algorithm. Different equality values are always disjoint (safe)
|
|
478
|
+
return True, "", None
|
|
479
|
+
|
|
480
|
+
# For each $in field, check if all branches use $in on it
|
|
481
|
+
# and identify overlapping values
|
|
482
|
+
fields_with_overlap: Dict[str, List[Tuple[int, int, Set[Any]]]] = {}
|
|
483
|
+
|
|
484
|
+
for field in in_field_names:
|
|
485
|
+
# Get $in values for this field from each branch
|
|
486
|
+
branch_values: List[Optional[Set[Any]]] = []
|
|
487
|
+
for in_dict in all_in_fields:
|
|
488
|
+
branch_values.append(in_dict.get(field))
|
|
489
|
+
|
|
490
|
+
# Check for overlap between any pair of branches
|
|
491
|
+
overlaps: List[Tuple[int, int, Set[Any]]] = []
|
|
492
|
+
for i in range(len(branches)):
|
|
493
|
+
vals_i = branch_values[i]
|
|
494
|
+
if vals_i is None:
|
|
495
|
+
# This branch doesn't use $in on this field - could be equality
|
|
496
|
+
# This creates potential overlap issues
|
|
497
|
+
continue
|
|
498
|
+
for j in range(i + 1, len(branches)):
|
|
499
|
+
vals_j = branch_values[j]
|
|
500
|
+
if vals_j is None:
|
|
501
|
+
continue
|
|
502
|
+
common = vals_i & vals_j
|
|
503
|
+
if common:
|
|
504
|
+
overlaps.append((i, j, common))
|
|
505
|
+
|
|
506
|
+
if overlaps:
|
|
507
|
+
fields_with_overlap[field] = overlaps
|
|
508
|
+
|
|
509
|
+
if not fields_with_overlap:
|
|
510
|
+
# No overlapping $in values - safe!
|
|
511
|
+
return True, "", None
|
|
512
|
+
|
|
513
|
+
# Rule 3 & 4: Handle overlapping $in values
|
|
514
|
+
# IMPORTANT: Transformation is ONLY safe when all branches have the SAME
|
|
515
|
+
# time bounds! If time bounds differ, we cannot subtract $in values because:
|
|
516
|
+
# - Branch A (IDs 1,2,3) with time [t1, t2]
|
|
517
|
+
# - Branch B (IDs 2,3,4) with time [t0, t3] (wider)
|
|
518
|
+
# If we remove 2,3 from Branch B, documents with IDs 2,3 in [t0,t1) and (t2,t3]
|
|
519
|
+
# would be LOST - not covered by either branch!
|
|
520
|
+
#
|
|
521
|
+
# So if overlapping $in values exist AND time ranges differ -> fall back
|
|
522
|
+
# to single bracket
|
|
523
|
+
|
|
524
|
+
# Extract time bounds from each branch to check if they're identical
|
|
525
|
+
time_bounds = []
|
|
526
|
+
for br in branches:
|
|
527
|
+
combined = {**global_and, **br}
|
|
528
|
+
bounds, _ = extract_time_bounds_recursive(combined, time_field)
|
|
529
|
+
if bounds is None:
|
|
530
|
+
lo, hi = None, None
|
|
531
|
+
else:
|
|
532
|
+
lo, hi, hi_inclusive, lo_inclusive = bounds
|
|
533
|
+
time_bounds.append((lo, hi))
|
|
534
|
+
|
|
535
|
+
# Check if all time bounds are identical
|
|
536
|
+
first_bounds = time_bounds[0]
|
|
537
|
+
all_same_time = all(bounds == first_bounds for bounds in time_bounds)
|
|
538
|
+
|
|
539
|
+
if not all_same_time:
|
|
540
|
+
# Overlapping $in with different time ranges - CANNOT safely transform
|
|
541
|
+
return (
|
|
542
|
+
False,
|
|
543
|
+
(
|
|
544
|
+
f"overlapping $in on '{list(fields_with_overlap.keys())[0]}' "
|
|
545
|
+
"with different time ranges"
|
|
546
|
+
),
|
|
547
|
+
None,
|
|
548
|
+
)
|
|
549
|
+
|
|
550
|
+
if len(fields_with_overlap) > 1:
|
|
551
|
+
# Multiple $in fields have overlap - too complex to transform
|
|
552
|
+
return (
|
|
553
|
+
False,
|
|
554
|
+
f"multiple $in fields have overlap: {list(fields_with_overlap.keys())}",
|
|
555
|
+
None,
|
|
556
|
+
)
|
|
557
|
+
|
|
558
|
+
# Exactly one $in field has overlap AND same time ranges - we can transform
|
|
559
|
+
field = list(fields_with_overlap.keys())[0]
|
|
560
|
+
overlaps = fields_with_overlap[field]
|
|
561
|
+
|
|
562
|
+
# Transform: For each pair with overlap, subtract overlapping values from one branch
|
|
563
|
+
# Strategy: Build a "seen" set and subtract from later branches
|
|
564
|
+
transformed = [deepcopy(br) for br in branches]
|
|
565
|
+
seen_values: Set[Any] = set()
|
|
566
|
+
|
|
567
|
+
for i, branch in enumerate(transformed):
|
|
568
|
+
# Get current $in values for this branch (merged with global)
|
|
569
|
+
eff = {**global_and, **branch}
|
|
570
|
+
in_vals = _extract_in_values(eff, field)
|
|
571
|
+
|
|
572
|
+
if in_vals is None:
|
|
573
|
+
# Branch uses equality on this field - add to seen
|
|
574
|
+
if field in eff and not isinstance(eff.get(field), dict):
|
|
575
|
+
try:
|
|
576
|
+
seen_values.add(eff[field])
|
|
577
|
+
except TypeError:
|
|
578
|
+
seen_values.add(str(eff[field]))
|
|
579
|
+
continue
|
|
580
|
+
|
|
581
|
+
# Subtract already-seen values
|
|
582
|
+
remaining = in_vals - seen_values
|
|
583
|
+
|
|
584
|
+
if not remaining:
|
|
585
|
+
# All values already covered - mark branch for removal
|
|
586
|
+
transformed[i] = None # type: ignore
|
|
587
|
+
elif remaining != in_vals:
|
|
588
|
+
# Some values removed - update the $in
|
|
589
|
+
if (
|
|
590
|
+
field in branch
|
|
591
|
+
and isinstance(branch.get(field), dict)
|
|
592
|
+
and "$in" in branch[field]
|
|
593
|
+
):
|
|
594
|
+
branch[field]["$in"] = list(remaining)
|
|
595
|
+
elif field in global_and:
|
|
596
|
+
# Field is in global_and - need to override in branch
|
|
597
|
+
branch[field] = {"$in": list(remaining)}
|
|
598
|
+
|
|
599
|
+
# Add all original values to seen (they're now covered by this bracket)
|
|
600
|
+
seen_values.update(in_vals)
|
|
601
|
+
|
|
602
|
+
# Filter out None branches (fully covered)
|
|
603
|
+
transformed = [b for b in transformed if b is not None]
|
|
604
|
+
|
|
605
|
+
if not transformed:
|
|
606
|
+
# Edge case: all branches were fully covered (shouldn't happen normally)
|
|
607
|
+
return True, "", None
|
|
608
|
+
|
|
609
|
+
return True, "", transformed
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
# ============================================================================
|
|
613
|
+
# MAIN INTERFACE / ENTRY POINT
|
|
614
|
+
# ============================================================================
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def _json_key(d: Dict[str, Any]) -> str:
|
|
618
|
+
"""Create a deterministic JSON key for deduplication."""
|
|
619
|
+
return json.dumps(d, sort_keys=True, default=str)
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
def _merge_full_ranges(ranges: List[TimeRange]) -> List[TimeRange]:
    """Merge overlapping or adjacent time ranges into consolidated spans.

    Only ranges with is_full set and both bounds present take part; all
    others are ignored. Ranges are sorted by start time and folded left to
    right into new TimeRange objects (the inputs are not mutated).

    Inclusivity handling:
    - Two ranges that merely touch (end of one equals start of next) are
      merged only when that shared instant belongs to at least one of them.
      [a, b) + (b, c] stay separate: merging would add the instant b, which
      neither range requested.
    - When ranges share the same start, the merged start is inclusive if
      any of them is inclusive; likewise for a shared end.

    Args:
        ranges: Time ranges to merge

    Returns:
        Merged ranges ordered by start time; [] when no full range is given
    """
    rs = [r for r in ranges if r.is_full and r.lo and r.hi]
    if not rs:
        return []

    rs.sort(key=lambda r: r.lo)  # type: ignore[arg-type]
    first = rs[0]
    out: List[TimeRange] = [
        TimeRange(first.lo, first.hi, True, first.hi_inclusive, first.lo_inclusive)
    ]
    for r in rs[1:]:
        last = out[-1]
        # Type assertions: we filtered for r.lo and r.hi being not None above
        assert r.lo is not None and r.hi is not None
        assert last.lo is not None and last.hi is not None

        # Touching ranges are contiguous only if the boundary instant is
        # contained in one of them.
        touches = r.lo == last.hi and (last.hi_inclusive or r.lo_inclusive)
        if r.lo < last.hi or touches:
            if r.lo == last.lo:
                # Same start: keep the start instant if either includes it
                last.lo_inclusive = last.lo_inclusive or r.lo_inclusive
            if r.hi > last.hi:
                last.hi = r.hi
                last.hi_inclusive = r.hi_inclusive
            elif r.hi == last.hi:
                last.hi_inclusive = last.hi_inclusive or r.hi_inclusive
        else:
            out.append(TimeRange(r.lo, r.hi, True, r.hi_inclusive, r.lo_inclusive))
    return out
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def _partial_covers_full(partial: TimeRange, full: TimeRange) -> bool:
|
|
655
|
+
"""Check if a partial time range completely covers a full time range.
|
|
656
|
+
|
|
657
|
+
A partial range covers a full range if:
|
|
658
|
+
- partial has only $gte (lo) and full.lo >= partial.lo
|
|
659
|
+
- partial has only $lt (hi) and full.hi <= partial.hi
|
|
660
|
+
|
|
661
|
+
Args:
|
|
662
|
+
partial: TimeRange with is_full=False (missing lo or hi)
|
|
663
|
+
full: TimeRange with is_full=True
|
|
664
|
+
|
|
665
|
+
Returns:
|
|
666
|
+
True if partial completely covers full, False otherwise
|
|
667
|
+
"""
|
|
668
|
+
if full.lo is None or full.hi is None:
|
|
669
|
+
return False
|
|
670
|
+
|
|
671
|
+
# Partial has only lower bound ($gte): covers if full starts at or after
|
|
672
|
+
if partial.lo is not None and partial.hi is None:
|
|
673
|
+
return full.lo >= partial.lo
|
|
674
|
+
|
|
675
|
+
# Partial has only upper bound ($lt): covers if full ends at or before
|
|
676
|
+
if partial.lo is None and partial.hi is not None:
|
|
677
|
+
return full.hi <= partial.hi
|
|
678
|
+
|
|
679
|
+
return False
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
def _merge_partial_ranges(partials: List[TimeRange]) -> List[TimeRange]:
    """Merge partial ranges where possible.

    Priority:
    - If ANY range is completely unbounded (no lo, no hi), it covers
      everything and a single unbounded range is returned
    - Lower-bound-only ranges: keep the smallest lo (covers most)
    - Upper-bound-only ranges: keep the largest hi (covers most)

    Inclusivity: when several ranges share the winning bound, the merged
    bound is inclusive if ANY of them is inclusive. Taking the flag of the
    first match (as before) could drop the boundary instant when an
    exclusive range happened to be listed first.

    Args:
        partials: Ranges missing lo, hi, or both. Ranges with both bounds
            set are ignored.

    Returns:
        [] for empty input; otherwise at most one lower-bound-only range
        followed by at most one upper-bound-only range, or a single
        unbounded range.

    Raises:
        AssertionError: if ``partials`` is non-empty but contains no
            partial range at all (kept from the original contract)
    """
    if not partials:
        return []

    # A completely unbounded range covers all other partials
    if any(r.lo is None and r.hi is None for r in partials):
        return [TimeRange(None, None, False, False, True)]

    gte_only = [r for r in partials if r.lo is not None and r.hi is None]
    lt_only = [r for r in partials if r.lo is None and r.hi is not None]

    merged: List[TimeRange] = []

    assert gte_only or lt_only, "No partial ranges to merge"

    if gte_only:
        # Smallest lower bound covers the most data
        min_lo = min(r.lo for r in gte_only if r.lo is not None)
        lo_inclusive = any(r.lo_inclusive for r in gte_only if r.lo == min_lo)
        merged.append(TimeRange(min_lo, None, False, False, lo_inclusive))

    if lt_only:
        # Largest upper bound covers the most data
        max_hi = max(r.hi for r in lt_only if r.hi is not None)
        hi_inclusive = any(r.hi_inclusive for r in lt_only if r.hi == max_hi)
        merged.append(TimeRange(None, max_hi, False, hi_inclusive, True))

    return merged
|
|
723
|
+
|
|
724
|
+
|
|
725
|
+
def build_brackets_for_find(
    query: Dict[str, Any],
    time_field: str,
    sort_spec: Optional[List[Tuple[str, int]]] = None,
) -> Tuple[
    bool, str, List[Bracket], Optional[Tuple[Optional[datetime], Optional[datetime]]]
]:
    """
    Build bracket list for a find() query based on its chunkability.

    This is the SINGLE ENTRY POINT for bracket creation. All queries flow through
    here to ensure consistent validation and bracket generation.

    IMPORTANT: Internally calls is_chunkable_query() to validate the query and
    determine execution mode (PARALLEL/SINGLE/REJECT). Cursor methods should NOT
    call is_chunkable_query() separately - this function handles all validation.

    Args:
        query: MongoDB find() filter dict
        time_field: Name of the timestamp field used for time-based chunking
            (e.g., "timestamp", "recordedAt", "createdAt")
        sort_spec: Optional MongoDB sort specification as list of
            (field, direction) tuples. Required for detecting $natural
            sort. Format: [("field", 1)] or [("field", -1)]
            Example: [("timestamp", 1)] or [("$natural", -1)]

    Returns:
        Tuple of (is_chunkable, reason, brackets, bounds):

        - is_chunkable: bool
            - True: Query is valid and executable (PARALLEL or SINGLE mode)
            - False: Invalid query syntax or contradictory constraints (REJECT
              mode), or one of the defensive re-checks below tripped

        - reason: str
            - Empty string "" for PARALLEL mode (successful parallelization)
            - "merged-branches:<why>" / "single-bracket:<why>" when an unsafe
              $or was collapsed into one bracket
            - Descriptive message for SINGLE mode
              (e.g., "$natural sort requires insertion order")
            - Error description for REJECT mode
              (e.g., "empty $or array (invalid MongoDB syntax)")

        - brackets: List[Bracket]
            - PARALLEL mode: Non-empty list of Bracket objects for parallel execution
            - SINGLE mode: Empty list [] (signals to use single worker)
            - REJECT mode: Empty list []

        - bounds: Tuple[Optional[datetime], Optional[datetime]]
            - Time range extracted from query (lo, hi)
            - (None, None) if no time bounds found or query rejected

    CRITICAL: Empty brackets list has TWO meanings:
        1. If is_chunkable=True + brackets=[]: SINGLE mode (valid, use single worker)
        2. If is_chunkable=False + brackets=[]: REJECT mode (invalid, don't execute)

    Callers MUST check is_chunkable first, then interpret empty brackets accordingly.

    Example:
        >>> query = {
        ...     "$or": [
        ...         {"region_id": ObjectId("64a...")},
        ...         {"region_id": ObjectId("64b...")},
        ...     ],
        ...     "account_id": ObjectId("123..."),
        ...     "timestamp": {"$gte": datetime(2024,1,1), "$lt": datetime(2024,7,1)}
        ... }
        >>> ok, reason, brackets, bounds = build_brackets_for_find(query, "timestamp")
        >>> # Returns:
        >>> # (True, "", [
        >>> #     Bracket(static_filter={"account_id": "123...",
        >>> #                            "region_id": "64a..."},
        >>> #             timerange=TimeRange(lo=2024-01-01, hi=2024-07-01,
        >>> #                                 is_full=True)),
        >>> #     Bracket(static_filter={"account_id": "123...",
        >>> #                            "region_id": "64b..."},
        >>> #             timerange=TimeRange(lo=2024-01-01, hi=2024-07-01,
        >>> #                                 is_full=True)),
        >>> # ], (datetime(2024,1,1), datetime(2024,7,1)))

    Rejection Cases (returns is_chunkable=False):
        - Empty $or array (invalid MongoDB syntax) -> REJECT
        - Contradictory time bounds (lo >= hi) -> REJECT

    Single-Worker Cases (returns is_chunkable=True, empty brackets):
        - $natural sort (insertion order) -> SINGLE
        - Forbidden operators ($expr, $text, $near, etc.) -> SINGLE
        - Nested $or (depth > 1) -> SINGLE
        - Time field negation ($ne/$nin/$not/$nor on time field) -> SINGLE
        - Unbounded $or branches -> SINGLE
        - No time field reference -> SINGLE

    Unsafe $or handling:
        When _check_or_branch_safety() reports that branches may overlap, the
        branches are collapsed into ONE bracket instead of being split:

        - If every branch has the same static filter and a fully bounded time
          range, and the ranges are contiguous, they are merged into a single
          clean range. Two ranges that merely touch at an instant X count as
          contiguous only if at least one of them includes X; "[a, X)" next
          to "(X, b]" leaves X uncovered and is treated as a gap.
        - Otherwise a single bracket carrying the original filter (minus the
          top-level time field) is emitted with the union of the branch
          ranges. A branch that lacks a lower (upper) bound makes the union
          open on that side, so the bracket never cuts off an open-ended
          branch.

    Implementation Note - Multiple Time Bounds Extraction:
        This function calls extract_time_bounds_recursive() multiple times in different
        code paths for different purposes:

        1. Via is_chunkable_query() - Validates overall query has time bounds
           Returns: result.bounds = union of all time ranges in query

        2. In _check_or_branch_safety() - Checks if $or branches have
           identical time bounds
           Purpose: Overlapping $in values can only be safely transformed
                    when all branches have the SAME time range. Different
                    ranges would cause data loss.
           Example: Branch A [Jan 1-15] with IDs {1,2,3} vs Branch B
                    [Jan 10-31] with IDs {2,3,4}. Cannot remove overlap {2,3}
                    because documents in [Jan 1-10) would be lost!

        3. In merge attempt (unsafe $or handling) - Extracts bounds from
           each branch
           Purpose: If branches have overlapping results (unsafe), check if
                    they can be merged into a single bracket. Only possible if
                    time ranges are contiguous with no gaps.
           Example: Branch A [Jan 1-15], Branch B [Jan 10-20]
                    -> Merged [Jan 1-20] ✓
                    Branch A [Jan 1-15], Branch B [Jan 20-31]
                    -> Cannot merge (gap!) ✗

        4. In final bracket creation - Sets TimeRange for each output
           bracket
           Purpose: Each bracket needs its specific time range for chunking.
           Example: {"sensor": "A", ts: [Jan 1-15]}
                    -> Bracket with TimeRange(Jan 1, Jan 15)
                    {"sensor": "B", ts: [Feb 1-28]}
                    -> Bracket with TimeRange(Feb 1, Feb 28)

        Why multiple calls are necessary:
        - is_chunkable_query() returns UNION of time bounds (overall range)
        - Each $or branch may have DIFFERENT time bounds (per-branch ranges)
        - Safety checks need to compare bounds across branches (identical?)
        - Merge logic needs to check contiguity (adjacent/overlapping?)
        - Final brackets need their specific ranges (individual TimeRange objects)

        This is NOT redundant - each extraction serves a different purpose in the
        validation -> optimization -> construction pipeline.
    """

    # PHASE 0: Validate query using is_chunkable_query.
    # This is the ONLY validation point - cursor methods don't need to call it
    # separately.
    result = is_chunkable_query(query, time_field, sort_spec)

    bounds = result.bounds

    # REJECT mode - invalid query syntax or contradictory constraints
    if result.mode == ChunkabilityMode.REJECT:
        return False, result.reason, [], (None, None)

    # SINGLE mode - valid query, but single-worker fallback needed.
    # is_chunkable=True means the query is VALID and executable; the empty
    # bracket list means "don't parallelize, use a single worker".
    if result.mode == ChunkabilityMode.SINGLE:
        return True, result.reason, [], bounds

    # =========================================================================
    # DEFENSE-IN-DEPTH: Redundant safety checks
    # =========================================================================
    # These checks duplicate validation already done in is_chunkable_query().
    # They're kept as a safety net in case:
    # 1. is_chunkable_query() has a bug and returns PARALLEL incorrectly
    # 2. Future code changes bypass is_chunkable_query() validation
    # 3. Query is mutated between validation and bracket building
    #
    # Better to catch issues twice than produce incorrect results. These
    # checks are fast and prevent data corruption.
    # =========================================================================
    has_forbidden, forbidden_op = has_forbidden_ops(query)
    if has_forbidden:
        return False, f"forbidden-operator: {forbidden_op}", [], (None, None)

    # PHASE 1: Normalize query (flatten nested $and, detect complexity)
    normalized, complexity_flags = normalize_query(query)

    # Use the normalized query for all subsequent operations
    global_and, or_list = split_global_and(normalized)

    # Nested $or or multiple $or cannot be split into independent brackets
    if complexity_flags["nested_or"]:
        return False, "nested-or-depth>1", [], (None, None)

    if or_depth(normalized) > 1:
        return False, "nested-or-depth>1", [], (None, None)

    if not or_list:
        # No $or: treat as a single branch represented by global_and
        branches: List[Dict[str, Any]] = [global_and]
    else:
        # =====================================================================
        # SAFETY CHECK: Detect overlapping $or branches
        # =====================================================================
        # Before splitting $or into independent brackets, we must verify that
        # branches don't have overlapping result sets. Overlap causes
        # duplicates.
        #
        # Cases that cause overlap:
        # - Negation operators ($nin, $ne, $not, $nor) in any branch
        # - Overlapping $in values across branches
        # - Different field sets (can't determine disjointness)
        #
        # If overlap is detected and cannot be transformed, we return a single
        # bracket covering the entire query.
        # =====================================================================
        is_safe, reason, transformed = _check_or_branch_safety(
            or_list, global_and, time_field
        )

        if not is_safe:
            # Per-branch view: static filter (time field removed) and bounds
            # as (lo, hi, hi_inclusive, lo_inclusive).
            static_filters = []
            time_bounds_list = []

            for branch in or_list:
                combined = {**global_and, **branch}
                # Named branch_bounds so the query-wide `bounds` captured
                # above is not clobbered.
                branch_bounds, _ = extract_time_bounds_recursive(
                    combined, time_field
                )
                if branch_bounds is None:
                    branch_bounds = (None, None, False, True)
                branch_lo, branch_hi, branch_hi_inc, branch_lo_inc = branch_bounds
                time_bounds_list.append(
                    (branch_lo, branch_hi, branch_hi_inc, branch_lo_inc)
                )

                static_wo_time = dict(combined)
                static_wo_time.pop(time_field, None)
                static_filters.append(static_wo_time)

            # OPTIMIZATION: If all branches have IDENTICAL static filters
            # (excluding time), all time ranges are FULL (both lo and hi), and
            # the ranges are contiguous, MERGE them into one clean bracket.
            #
            # Example (mergeable - overlapping):
            #   $or: [
            #     {filter_A, timestamp: {$gte: Jan 1, $lt: Jan 20}},
            #     {filter_A, timestamp: {$gte: Jan 15, $lt: Feb 1}},
            #   ]
            #   -> Merged: {filter_A, timestamp: {$gte: Jan 1, $lt: Feb 1}}
            #
            # Example (NOT mergeable - disjoint with gap):
            #   $or: [
            #     {filter_A, timestamp: {$gte: Jan 1, $lt: Jan 15}},
            #     {filter_A, timestamp: {$gte: Feb 1, $lt: Feb 15}},
            #   ]
            #   -> Gap from Jan 15 to Feb 1 would include unwanted data.
            #   -> Fall back to single bracket with full $or query.
            first_key = _json_key(static_filters[0])
            all_static_identical = all(
                _json_key(sf) == first_key for sf in static_filters[1:]
            )
            all_full = all(
                b_lo is not None and b_hi is not None
                for b_lo, b_hi, _hi_inc, _lo_inc in time_bounds_list
            )

            merged_bounds = None
            if all_static_identical and all_full:
                merged_bounds = _merge_contiguous_bounds(time_bounds_list)

            if merged_bounds is not None:
                (
                    merged_lo,
                    merged_hi,
                    merged_hi_inclusive,
                    merged_lo_inclusive,
                ) = merged_bounds
                return (
                    True,
                    f"merged-branches:{reason}",
                    [
                        Bracket(
                            static_filter=static_filters[0],
                            timerange=TimeRange(
                                merged_lo,
                                merged_hi,
                                True,
                                merged_hi_inclusive,
                                merged_lo_inclusive,
                            ),
                        )
                    ],
                    (merged_lo, merged_hi),
                )

            # Cannot merge - fall back to a single bracket that keeps the
            # original $or (and therefore its semantics). Its time range is
            # the union of all branch ranges.
            lo, hi, hi_inclusive, lo_inclusive = _union_branch_bounds(
                time_bounds_list
            )

            single_filter = dict(query)
            single_filter.pop(time_field, None)

            is_full = lo is not None and hi is not None
            return (
                True,
                f"single-bracket:{reason}",
                [
                    Bracket(
                        static_filter=single_filter,
                        timerange=TimeRange(
                            lo, hi, is_full, hi_inclusive, lo_inclusive
                        ),
                    )
                ],
                (lo, hi),
            )

        # Use transformed branches if available
        branches = transformed if transformed else or_list

    # PHASE 2: One preliminary bracket per branch (global AND + branch).
    prelim: List[Bracket] = []
    for br in branches:
        if not isinstance(br, dict):
            return False, "branch-not-dict", [], (None, None)

        eff: Dict[str, Any] = {}
        if global_and:
            eff.update(global_and)
        eff.update(br)

        br_bounds, _ = extract_time_bounds_recursive(eff, time_field)
        if br_bounds is None:
            lo, hi, hi_inclusive, lo_inclusive = None, None, False, True
        else:
            lo, hi, hi_inclusive, lo_inclusive = br_bounds
        is_full = lo is not None and hi is not None

        # Remove time field from static filter
        static_wo_time = dict(eff)
        static_wo_time.pop(time_field, None)

        if "$or" in static_wo_time:
            return False, "nested-or-in-branch", [], (None, None)

        prelim.append(
            Bracket(
                static_filter=static_wo_time,
                timerange=TimeRange(lo, hi, is_full, hi_inclusive, lo_inclusive),
            )
        )

    # PHASE 3: Group brackets that share a static filter so that their time
    # ranges can be merged.
    grouped: Dict[str, Dict[str, Any]] = {}
    for b in prelim:
        key = _json_key(b.static_filter)
        g = grouped.get(key)
        if g is None:
            g = {"static": b.static_filter, "full": [], "partial": []}
            grouped[key] = g
        (g["full"] if b.timerange.is_full else g["partial"]).append(b.timerange)

    out_brackets: List[Bracket] = []
    for g in grouped.values():
        static = g["static"]
        full_ranges = g["full"]
        partial_ranges = g["partial"]

        # Merge partial ranges first (keep most inclusive).
        # NOTE: _merge_partial_ranges handles unbounded (lo=None, hi=None) by
        # returning just the unbounded range, which covers everything.
        merged_partials = _merge_partial_ranges(partial_ranges)

        # A completely unbounded partial covers ALL other partials AND all
        # full ranges in this group, so it is the only bracket needed.
        has_unbounded = any(r.lo is None and r.hi is None for r in merged_partials)
        if has_unbounded:
            out_brackets.append(
                Bracket(
                    static_filter=static,
                    timerange=TimeRange(None, None, False, False, True),
                )
            )
            continue  # Skip all full and other partial for this static_filter

        # Drop full ranges already covered by a partial (the partial fetches
        # everything the full range would).
        remaining_fulls: List[TimeRange] = [
            fr
            for fr in full_ranges
            if not any(_partial_covers_full(pr, fr) for pr in merged_partials)
        ]

        # Merge remaining full ranges
        for r in _merge_full_ranges(remaining_fulls):
            out_brackets.append(Bracket(static_filter=static, timerange=r))

        # Add merged partial ranges (executed as single unchunked queries)
        for r in merged_partials:
            out_brackets.append(Bracket(static_filter=static, timerange=r))

    if not out_brackets:
        return False, "no-complete-time-range", [], (None, None)

    return True, "", out_brackets, bounds


def _merge_contiguous_bounds(
    ranges: List[Tuple[Optional[datetime], Optional[datetime], bool, bool]],
) -> Optional[Tuple[Optional[datetime], Optional[datetime], bool, bool]]:
    """Merge fully bounded (lo, hi, hi_inclusive, lo_inclusive) tuples.

    Returns the single covering tuple when the ranges overlap or touch without
    leaving any instant uncovered, or None when there is a gap. `ranges` must
    be non-empty and every tuple must have both lo and hi set.
    """
    ordered = sorted(ranges, key=lambda r: r[0])
    run_lo, run_hi, run_hi_inc, run_lo_inc = ordered[0]

    for lo, hi, hi_inc, lo_inc in ordered[1:]:
        if lo > run_hi:
            return None
        # Touching at a single instant is only gap-free if one side includes
        # it: "[a, X)" followed by "(X, b]" leaves X itself uncovered.
        if lo == run_hi and not (run_hi_inc or lo_inc):
            return None
        # Ranges sharing the smallest lo: inclusive wins, otherwise "$gt X"
        # sorted ahead of "$gte X" would drop documents exactly at X.
        if lo == run_lo:
            run_lo_inc = run_lo_inc or lo_inc
        if hi > run_hi:
            run_hi, run_hi_inc = hi, hi_inc
        elif hi == run_hi:
            run_hi_inc = run_hi_inc or hi_inc

    return run_lo, run_hi, run_hi_inc, run_lo_inc


def _union_branch_bounds(
    bounds_list: List[Tuple[Optional[datetime], Optional[datetime], bool, bool]],
) -> Tuple[Optional[datetime], Optional[datetime], bool, bool]:
    """Return the covering (lo, hi, hi_inclusive, lo_inclusive) of all branches.

    A branch without a lower (upper) bound makes the union unbounded on that
    side; an open side is reported as None with the default flag
    (lo_inclusive=True, hi_inclusive=False).
    """
    lo: Optional[datetime] = None
    hi: Optional[datetime] = None
    hi_inclusive, lo_inclusive = False, True
    open_lo = open_hi = False

    for b_lo, b_hi, b_hi_inc, b_lo_inc in bounds_list:
        if b_lo is None:
            open_lo = True
        elif lo is None or b_lo < lo:
            lo, lo_inclusive = b_lo, b_lo_inc
        elif b_lo == lo:
            lo_inclusive = lo_inclusive or b_lo_inc

        if b_hi is None:
            open_hi = True
        elif hi is None or b_hi > hi:
            hi, hi_inclusive = b_hi, b_hi_inc
        elif b_hi == hi:
            hi_inclusive = hi_inclusive or b_hi_inc

    if open_lo:
        lo, lo_inclusive = None, True
    if open_hi:
        hi, hi_inclusive = None, False

    return lo, hi, hi_inclusive, lo_inclusive
|