xlr8 0.1.7b3__cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlr8/__init__.py +113 -0
- xlr8/_xlr8_rust.cpython-311-x86_64-linux-gnu.so +0 -0
- xlr8/_xlr8_rust.pyi +71 -0
- xlr8/analysis/__init__.py +58 -0
- xlr8/analysis/brackets.py +1201 -0
- xlr8/analysis/chunker.py +118 -0
- xlr8/analysis/inspector.py +1889 -0
- xlr8/collection/__init__.py +6 -0
- xlr8/collection/cursor.py +2161 -0
- xlr8/collection/cursor.pyi +179 -0
- xlr8/collection/wrapper.py +400 -0
- xlr8/collection/wrapper.pyi +420 -0
- xlr8/constants.py +24 -0
- xlr8/execution/__init__.py +43 -0
- xlr8/execution/callback.py +792 -0
- xlr8/execution/executor.py +500 -0
- xlr8/execution/planner.py +377 -0
- xlr8/py.typed +1 -0
- xlr8/rust_backend.py +40 -0
- xlr8/rust_backend.pyi +71 -0
- xlr8/schema/__init__.py +42 -0
- xlr8/schema/encoder.py +235 -0
- xlr8/schema/schema.py +265 -0
- xlr8/schema/types.py +239 -0
- xlr8/storage/__init__.py +17 -0
- xlr8/storage/cache.py +228 -0
- xlr8/storage/reader.py +1369 -0
- xlr8-0.1.7b3.dist-info/METADATA +176 -0
- xlr8-0.1.7b3.dist-info/RECORD +31 -0
- xlr8-0.1.7b3.dist-info/WHEEL +5 -0
- xlr8-0.1.7b3.dist-info/licenses/LICENSE +201 -0
xlr8/analysis/chunker.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Time-range chunking utilities for XLR8.
|
|
3
|
+
|
|
4
|
+
This module splits time ranges into day-aligned chunks for parallel processing.
|
|
5
|
+
Each chunk becomes a work item that a worker can fetch independently.
|
|
6
|
+
|
|
7
|
+
WHY CHUNK BY TIME?
|
|
8
|
+
------------------
|
|
9
|
+
|
|
10
|
+
MongoDB time-series data is typically indexed by time. Chunking allows:
|
|
11
|
+
1. Parallel fetches - Multiple workers can fetch different time chunks
|
|
12
|
+
2. Incremental caching - Cache chunks separately, reuse when time range overlaps
|
|
13
|
+
3. Memory control - Each chunk fits in worker's RAM budget
|
|
14
|
+
|
|
15
|
+
CHUNKING ALGORITHM
|
|
16
|
+
------------------
|
|
17
|
+
|
|
18
|
+
INPUT:
|
|
19
|
+
start = datetime(2024, 1, 5, 12, 30) # Mid-day start
|
|
20
|
+
end = datetime(2024, 1, 15, 8, 0) # Mid-day end
|
|
21
|
+
chunk_days = 3
|
|
22
|
+
|
|
23
|
+
OUTPUT (day-aligned chunks):
|
|
24
|
+
|
|
25
|
+
Chunk 1: 2024-01-05 12:30 -> 2024-01-08 00:00 (partial first chunk)
|
|
26
|
+
Chunk 2: 2024-01-08 00:00 -> 2024-01-11 00:00 (full 3-day chunk)
|
|
27
|
+
Chunk 3: 2024-01-11 00:00 -> 2024-01-14 00:00 (full 3-day chunk)
|
|
28
|
+
Chunk 4: 2024-01-14 00:00 -> 2024-01-15 08:00 (partial last chunk)
|
|
29
|
+
|
|
30
|
+
Note: Chunk boundaries are anchored at midnight of the start day and spaced one step apart, so the first and last chunks are usually partial.
|
|
31
|
+
|
|
32
|
+
TYPICAL USAGE
|
|
33
|
+
-------------
|
|
34
|
+
|
|
35
|
+
6-month query with 14-day chunks:
|
|
36
|
+
start = 2024-01-01
|
|
37
|
+
end = 2024-07-01
|
|
38
|
+
chunk_days = 14 (default)
|
|
39
|
+
|
|
40
|
+
Result: ~13 chunks
|
|
41
|
+
Chunk 1: Jan 1-15
|
|
42
|
+
Chunk 2: Jan 15-29
|
|
43
|
+
Chunk 3: Jan 29 - Feb 12
|
|
44
|
+
...
|
|
45
|
+
Chunk 13: Jun 17 - Jul 1
|
|
46
|
+
|
|
47
|
+
With 10 workers, chunks are processed in parallel:
|
|
48
|
+
Workers 0-9 grab chunks 1-10 immediately
|
|
49
|
+
As workers finish, they grab chunks 11-13
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
from datetime import datetime, timedelta, timezone
|
|
53
|
+
from typing import List, Optional, Tuple
|
|
54
|
+
|
|
55
|
+
__all__ = [
|
|
56
|
+
"chunk_time_range",
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def chunk_time_range(
    start: datetime,
    end: datetime,
    chunk_size: Optional[timedelta] = None,
) -> List[Tuple[datetime, datetime]]:
    """
    Split a time range into consecutive, boundary-aligned chunks.

    Boundaries are placed at multiples of ``chunk_size`` counted from
    midnight UTC of the day on which ``start`` falls. The first chunk runs
    from ``start`` to the first boundary strictly after it, and the last
    chunk is truncated at ``end``, so both may be shorter than ``chunk_size``.

    Args:
        start: Start datetime (inclusive). Naive values are treated as UTC.
        end: End datetime (exclusive). Naive values are treated as UTC.
        chunk_size: Size of each chunk as timedelta (default: 1 day).
            Must be strictly positive.

    Returns:
        List of (chunk_start, chunk_end) tuples covering ``[start, end)``
        without gaps or overlap. Empty when ``start >= end``.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative and the range is
            non-empty (such a step could never reach ``end``).

    Examples:
        Day-level chunking:
        >>> start = datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
        >>> end = datetime(2024, 1, 5, 8, 0, 0, tzinfo=timezone.utc)
        >>> chunks = chunk_time_range(start, end, chunk_size=timedelta(days=1))

        Hour-level chunking (first boundary is 16:00, the next 8-hour mark):
        >>> chunks = chunk_time_range(start, end, chunk_size=timedelta(hours=8))
    """
    # Ensure timezone-aware: naive inputs are interpreted as UTC.
    if start.tzinfo is None:
        start = start.replace(tzinfo=timezone.utc)
    if end.tzinfo is None:
        end = end.replace(tzinfo=timezone.utc)

    if start >= end:
        return []

    # Determine step size (default to 1 day).
    step = chunk_size if chunk_size is not None else timedelta(days=1)
    if step <= timedelta(0):
        # A non-positive step never advances past `end`; fail fast instead
        # of looping forever.
        raise ValueError(f"chunk_size must be positive, got {step!r}")

    # Anchor on the UTC calendar day so the date fields and the tzinfo of
    # the anchor agree even when `start` carries a non-UTC offset.
    start_utc = start.astimezone(timezone.utc)
    day_start = datetime(
        start_utc.year, start_utc.month, start_utc.day, tzinfo=timezone.utc
    )

    # First boundary strictly AFTER start. Floor division counts the whole
    # steps already elapsed since midnight, so this also holds when the step
    # is shorter than the time-of-day offset of `start`.
    steps_elapsed = (start_utc - day_start) // step
    boundary = day_start + (steps_elapsed + 1) * step

    out: List[Tuple[datetime, datetime]] = []
    lo = start
    while lo < end:
        # Clamp the final chunk to the requested end.
        chunk_end = boundary if boundary < end else end
        out.append((lo, chunk_end))
        lo = boundary
        boundary = boundary + step

    return out
|