isage_middleware-0.2.4.3-cp311-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isage_middleware-0.2.4.3.dist-info/METADATA +266 -0
- isage_middleware-0.2.4.3.dist-info/RECORD +94 -0
- isage_middleware-0.2.4.3.dist-info/WHEEL +5 -0
- isage_middleware-0.2.4.3.dist-info/top_level.txt +1 -0
- sage/middleware/__init__.py +59 -0
- sage/middleware/_version.py +6 -0
- sage/middleware/components/__init__.py +30 -0
- sage/middleware/components/extensions_compat.py +141 -0
- sage/middleware/components/sage_db/__init__.py +116 -0
- sage/middleware/components/sage_db/backend.py +136 -0
- sage/middleware/components/sage_db/service.py +15 -0
- sage/middleware/components/sage_flow/__init__.py +76 -0
- sage/middleware/components/sage_flow/python/__init__.py +14 -0
- sage/middleware/components/sage_flow/python/micro_service/__init__.py +4 -0
- sage/middleware/components/sage_flow/python/micro_service/sage_flow_service.py +88 -0
- sage/middleware/components/sage_flow/python/sage_flow.py +30 -0
- sage/middleware/components/sage_flow/service.py +14 -0
- sage/middleware/components/sage_mem/__init__.py +83 -0
- sage/middleware/components/sage_sias/__init__.py +59 -0
- sage/middleware/components/sage_sias/continual_learner.py +184 -0
- sage/middleware/components/sage_sias/coreset_selector.py +302 -0
- sage/middleware/components/sage_sias/types.py +94 -0
- sage/middleware/components/sage_tsdb/__init__.py +81 -0
- sage/middleware/components/sage_tsdb/python/__init__.py +21 -0
- sage/middleware/components/sage_tsdb/python/_sage_tsdb.pyi +17 -0
- sage/middleware/components/sage_tsdb/python/algorithms/__init__.py +17 -0
- sage/middleware/components/sage_tsdb/python/algorithms/base.py +51 -0
- sage/middleware/components/sage_tsdb/python/algorithms/out_of_order_join.py +248 -0
- sage/middleware/components/sage_tsdb/python/algorithms/window_aggregator.py +296 -0
- sage/middleware/components/sage_tsdb/python/micro_service/__init__.py +7 -0
- sage/middleware/components/sage_tsdb/python/micro_service/sage_tsdb_service.py +365 -0
- sage/middleware/components/sage_tsdb/python/sage_tsdb.py +523 -0
- sage/middleware/components/sage_tsdb/service.py +17 -0
- sage/middleware/components/vector_stores/__init__.py +25 -0
- sage/middleware/components/vector_stores/chroma.py +483 -0
- sage/middleware/components/vector_stores/chroma_adapter.py +185 -0
- sage/middleware/components/vector_stores/milvus.py +677 -0
- sage/middleware/operators/__init__.py +56 -0
- sage/middleware/operators/agent/__init__.py +24 -0
- sage/middleware/operators/agent/planning/__init__.py +5 -0
- sage/middleware/operators/agent/planning/llm_adapter.py +41 -0
- sage/middleware/operators/agent/planning/planner_adapter.py +98 -0
- sage/middleware/operators/agent/planning/router.py +107 -0
- sage/middleware/operators/agent/runtime.py +296 -0
- sage/middleware/operators/agentic/__init__.py +41 -0
- sage/middleware/operators/agentic/config.py +254 -0
- sage/middleware/operators/agentic/planning_operator.py +125 -0
- sage/middleware/operators/agentic/refined_searcher.py +132 -0
- sage/middleware/operators/agentic/runtime.py +241 -0
- sage/middleware/operators/agentic/timing_operator.py +125 -0
- sage/middleware/operators/agentic/tool_selection_operator.py +127 -0
- sage/middleware/operators/context/__init__.py +17 -0
- sage/middleware/operators/context/critic_evaluation.py +16 -0
- sage/middleware/operators/context/model_context.py +565 -0
- sage/middleware/operators/context/quality_label.py +12 -0
- sage/middleware/operators/context/search_query_results.py +61 -0
- sage/middleware/operators/context/search_result.py +42 -0
- sage/middleware/operators/context/search_session.py +79 -0
- sage/middleware/operators/filters/__init__.py +26 -0
- sage/middleware/operators/filters/context_sink.py +387 -0
- sage/middleware/operators/filters/context_source.py +376 -0
- sage/middleware/operators/filters/evaluate_filter.py +83 -0
- sage/middleware/operators/filters/tool_filter.py +74 -0
- sage/middleware/operators/llm/__init__.py +18 -0
- sage/middleware/operators/llm/sagellm_generator.py +432 -0
- sage/middleware/operators/rag/__init__.py +147 -0
- sage/middleware/operators/rag/arxiv.py +331 -0
- sage/middleware/operators/rag/chunk.py +13 -0
- sage/middleware/operators/rag/document_loaders.py +23 -0
- sage/middleware/operators/rag/evaluate.py +658 -0
- sage/middleware/operators/rag/generator.py +340 -0
- sage/middleware/operators/rag/index_builder/__init__.py +48 -0
- sage/middleware/operators/rag/index_builder/builder.py +363 -0
- sage/middleware/operators/rag/index_builder/manifest.py +101 -0
- sage/middleware/operators/rag/index_builder/storage.py +131 -0
- sage/middleware/operators/rag/pipeline.py +46 -0
- sage/middleware/operators/rag/profiler.py +59 -0
- sage/middleware/operators/rag/promptor.py +400 -0
- sage/middleware/operators/rag/refiner.py +231 -0
- sage/middleware/operators/rag/reranker.py +364 -0
- sage/middleware/operators/rag/retriever.py +1308 -0
- sage/middleware/operators/rag/searcher.py +37 -0
- sage/middleware/operators/rag/types.py +28 -0
- sage/middleware/operators/rag/writer.py +80 -0
- sage/middleware/operators/tools/__init__.py +71 -0
- sage/middleware/operators/tools/arxiv_paper_searcher.py +175 -0
- sage/middleware/operators/tools/arxiv_searcher.py +102 -0
- sage/middleware/operators/tools/duckduckgo_searcher.py +105 -0
- sage/middleware/operators/tools/image_captioner.py +104 -0
- sage/middleware/operators/tools/nature_news_fetcher.py +224 -0
- sage/middleware/operators/tools/searcher_tool.py +514 -0
- sage/middleware/operators/tools/text_detector.py +185 -0
- sage/middleware/operators/tools/url_text_extractor.py +104 -0
- sage/middleware/py.typed +2 -0
sage/middleware/components/sage_tsdb/python/algorithms/__init__.py
@@ -0,0 +1,17 @@
"""
Algorithms for time series processing.

This module provides a pluggable algorithm interface for various
time series processing tasks including stream joins, aggregations,
and complex event processing.
"""

from .base import TimeSeriesAlgorithm
from .out_of_order_join import OutOfOrderStreamJoin
from .window_aggregator import WindowAggregator

__all__ = [
    "TimeSeriesAlgorithm",
    "OutOfOrderStreamJoin",
    "WindowAggregator",
]
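These three names are the entire public surface of the algorithms subpackage. A minimal import sketch, assuming the wheel above is installed; the module path follows the RECORD listing:

# Import path taken from the file listing in this diff; assumes the wheel is installed.
from sage.middleware.components.sage_tsdb.python.algorithms import (
    OutOfOrderStreamJoin,
    TimeSeriesAlgorithm,
    WindowAggregator,
)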
sage/middleware/components/sage_tsdb/python/algorithms/base.py
@@ -0,0 +1,51 @@
"""
Base algorithm interface for time series processing.
"""

from abc import ABC, abstractmethod
from typing import Any

from ..sage_tsdb import TimeSeriesData


class TimeSeriesAlgorithm(ABC):
    """
    Base class for time series processing algorithms.

    All algorithm implementations should inherit from this class and
    implement the process method.
    """

    def __init__(self, config: dict[str, Any] | None = None):
        """
        Initialize algorithm.

        Args:
            config: Algorithm-specific configuration
        """
        self.config = config or {}

    @abstractmethod
    def process(self, data: list[TimeSeriesData], **kwargs) -> Any:
        """
        Process time series data.

        Args:
            data: Input time series data points
            **kwargs: Additional algorithm-specific parameters

        Returns:
            Processed results (algorithm-specific format)
        """
        pass

    def reset(self):  # noqa: B027
        """Reset algorithm state (for stateful algorithms)"""
        pass

    def get_stats(self) -> dict[str, Any]:
        """Get algorithm statistics"""
        return {}


__all__ = ["TimeSeriesAlgorithm"]
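The plug-in contract is small: process is the only abstract method, while reset and get_stats are optional hooks with no-op defaults. A hypothetical subclass, to illustrate the shape of an implementation; the ThresholdFilter name and its threshold config key are invented for this sketch and are not part of the package:

from typing import Any

from sage.middleware.components.sage_tsdb.python.algorithms import TimeSeriesAlgorithm


class ThresholdFilter(TimeSeriesAlgorithm):
    """Illustrative only: drops points whose value is at or below a threshold."""

    def __init__(self, config: dict[str, Any] | None = None):
        super().__init__(config)
        # "threshold" is an invented config key for this example.
        self.threshold = self.config.get("threshold", 0.0)
        self.dropped = 0

    def process(self, data, **kwargs):
        # Keep points above the threshold; count the rest as dropped.
        kept = [point for point in data if point.value > self.threshold]
        self.dropped += len(data) - len(kept)
        return kept

    def get_stats(self) -> dict[str, Any]:
        return {"dropped": self.dropped}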
sage/middleware/components/sage_tsdb/python/algorithms/out_of_order_join.py
@@ -0,0 +1,248 @@
"""
Out-of-Order Stream Join Algorithm

This algorithm handles joining two time series streams that may arrive
out of order, using windowing and buffering strategies.
"""

from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

from ..sage_tsdb import TimeSeriesData
from .base import TimeSeriesAlgorithm


@dataclass
class JoinConfig:
    """Configuration for stream join"""

    window_size: int  # milliseconds
    max_delay: int  # maximum out-of-order delay (ms)
    join_key: str | None = None  # tag key for join condition
    join_predicate: Callable[[TimeSeriesData, TimeSeriesData], bool] | None = None


class StreamBuffer:
    """Buffer for managing out-of-order streams"""

    def __init__(self, max_delay: int):
        """
        Initialize stream buffer.

        Args:
            max_delay: Maximum allowed delay (ms)
        """
        self.max_delay = max_delay
        self.buffer: list[TimeSeriesData] = []
        self.watermark = 0  # Current watermark timestamp

    def add(self, data: TimeSeriesData):
        """Add data to buffer"""
        self.buffer.append(data)
        self._update_watermark()

    def add_batch(self, data_list: list[TimeSeriesData]):
        """Add multiple data points to buffer"""
        self.buffer.extend(data_list)
        self._update_watermark()

    def _update_watermark(self):
        """Update watermark based on latest data"""
        if self.buffer:
            # Sort buffer by timestamp
            self.buffer.sort(key=lambda x: x.timestamp)
            # Watermark is the latest timestamp minus max delay
            latest = self.buffer[-1].timestamp
            self.watermark = latest - self.max_delay

    def get_ready_data(self) -> list[TimeSeriesData]:
        """Get data that's ready for processing (before watermark)"""
        ready = [d for d in self.buffer if d.timestamp <= self.watermark]
        # Remove ready data from buffer
        self.buffer = [d for d in self.buffer if d.timestamp > self.watermark]
        return ready

    def size(self) -> int:
        """Get buffer size"""
        return len(self.buffer)


class OutOfOrderStreamJoin(TimeSeriesAlgorithm):
    """
    Out-of-Order Stream Join Algorithm.

    This algorithm joins two time series streams that may arrive out of order.
    It uses windowing and watermarking to handle late data while maintaining
    join correctness.

    Features:
    - Handles out-of-order data arrival
    - Window-based join semantics
    - Configurable watermarking for late data
    - Support for custom join predicates
    """

    def __init__(self, config: dict[str, Any] | None = None):
        """
        Initialize stream join algorithm.

        Args:
            config: Configuration dictionary with:
                - window_size: Join window size in milliseconds
                - max_delay: Maximum out-of-order delay in milliseconds
                - join_key: Optional tag key for equi-join
                - join_predicate: Optional custom join predicate function
        """
        super().__init__(config)

        self.window_size = self.config.get("window_size", 10000)  # 10 seconds
        self.max_delay = self.config.get("max_delay", 5000)  # 5 seconds
        self.join_key = self.config.get("join_key", None)
        self.join_predicate = self.config.get("join_predicate", None)

        # Buffers for two streams
        self.left_buffer = StreamBuffer(self.max_delay)
        self.right_buffer = StreamBuffer(self.max_delay)

        # Statistics
        self.stats = {
            "total_joined": 0,
            "late_arrivals": 0,
            "dropped_late": 0,
        }

    def add_left_stream(self, data: list[TimeSeriesData]):
        """Add data to left stream"""
        self.left_buffer.add_batch(data)

    def add_right_stream(self, data: list[TimeSeriesData]):
        """Add data to right stream"""
        self.right_buffer.add_batch(data)

    def process(
        self,
        data: list[TimeSeriesData] | None = None,
        left_stream: list[TimeSeriesData] | None = None,
        right_stream: list[TimeSeriesData] | None = None,
        **kwargs,
    ) -> list[tuple[TimeSeriesData, TimeSeriesData]]:
        """
        Process stream join.

        Args:
            data: Not used (for compatibility)
            left_stream: Data from left stream
            right_stream: Data from right stream
            **kwargs: Additional parameters

        Returns:
            List of joined data pairs
        """
        # Add data to buffers
        if left_stream:
            self.add_left_stream(left_stream)
        if right_stream:
            self.add_right_stream(right_stream)

        # Get ready data from both buffers
        left_ready = self.left_buffer.get_ready_data()
        right_ready = self.right_buffer.get_ready_data()

        # Perform join
        joined = self._join_data(left_ready, right_ready)

        # Update statistics
        self.stats["total_joined"] += len(joined)

        return joined

    def _join_data(
        self, left_data: list[TimeSeriesData], right_data: list[TimeSeriesData]
    ) -> list[tuple[TimeSeriesData, TimeSeriesData]]:
        """
        Join data from two streams.

        Args:
            left_data: Data from left stream
            right_data: Data from right stream

        Returns:
            List of joined pairs
        """
        joined = []

        # If join key is specified, use hash join
        if self.join_key:
            joined = self._hash_join(left_data, right_data)
        else:
            # Use nested loop join with window condition
            joined = self._nested_loop_join(left_data, right_data)

        return joined

    def _hash_join(
        self, left_data: list[TimeSeriesData], right_data: list[TimeSeriesData]
    ) -> list[tuple[TimeSeriesData, TimeSeriesData]]:
        """Hash join on specified key"""
        joined = []

        # Build hash table for right stream
        right_hash: dict[str, list[TimeSeriesData]] = defaultdict(list)
        for right in right_data:
            key_value = right.tags.get(self.join_key) if self.join_key else None
            if key_value:
                right_hash[key_value].append(right)

        # Probe with left stream
        for left in left_data:
            key_value = left.tags.get(self.join_key) if self.join_key else None
            if key_value and key_value in right_hash:
                for right in right_hash[key_value]:
                    # Check window condition
                    if abs(left.timestamp - right.timestamp) <= self.window_size:
                        # Check custom predicate if provided
                        if self.join_predicate is None or self.join_predicate(left, right):
                            joined.append((left, right))

        return joined

    def _nested_loop_join(
        self, left_data: list[TimeSeriesData], right_data: list[TimeSeriesData]
    ) -> list[tuple[TimeSeriesData, TimeSeriesData]]:
        """Nested loop join with window condition"""
        joined = []

        for left in left_data:
            for right in right_data:
                # Check window condition
                if abs(left.timestamp - right.timestamp) <= self.window_size:
                    # Check custom predicate if provided
                    if self.join_predicate is None or self.join_predicate(left, right):
                        joined.append((left, right))

        return joined

    def reset(self):
        """Reset algorithm state"""
        self.left_buffer = StreamBuffer(self.max_delay)
        self.right_buffer = StreamBuffer(self.max_delay)
        self.stats = {
            "total_joined": 0,
            "late_arrivals": 0,
            "dropped_late": 0,
        }

    def get_stats(self) -> dict[str, Any]:
        """Get join statistics"""
        return {
            **self.stats,
            "left_buffer_size": self.left_buffer.size(),
            "right_buffer_size": self.right_buffer.size(),
            "left_watermark": self.left_buffer.watermark,
            "right_watermark": self.right_buffer.watermark,
        }


__all__ = ["OutOfOrderStreamJoin", "JoinConfig", "StreamBuffer"]
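In use, the join is driven by repeated process calls: each call buffers the new batches, advances each stream's watermark to (latest timestamp - max_delay), joins only data that has fallen behind its watermark, and holds the rest for a later call. A hedged usage sketch; the TimeSeriesData import path is inferred from the relative import "from ..sage_tsdb import TimeSeriesData" above, and the constructor keywords follow the calls visible in window_aggregator.py below:

from sage.middleware.components.sage_tsdb.python.algorithms import OutOfOrderStreamJoin
# Inferred from the relative import in out_of_order_join.py; may be re-exported elsewhere.
from sage.middleware.components.sage_tsdb.python.sage_tsdb import TimeSeriesData

join = OutOfOrderStreamJoin(
    {
        "window_size": 2000,   # pair points at most 2 s apart
        "max_delay": 1000,     # tolerate up to 1 s of out-of-order arrival
        "join_key": "sensor",  # equi-join on the "sensor" tag
    }
)

left = [TimeSeriesData(timestamp=t, value=1.0, tags={"sensor": "a"}) for t in (0, 500, 3000)]
right = [TimeSeriesData(timestamp=t, value=2.0, tags={"sensor": "a"}) for t in (400, 2900, 3100)]

# Points newer than (latest - max_delay) stay buffered; the rest are joined now.
pairs = join.process(left_stream=left, right_stream=right)
print(len(pairs), join.get_stats())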
sage/middleware/components/sage_tsdb/python/algorithms/window_aggregator.py
@@ -0,0 +1,296 @@
"""
Window Aggregator Algorithm

Provides various windowing strategies for time series aggregation,
including tumbling, sliding, and session windows.
"""

from dataclasses import dataclass
from enum import Enum
from typing import Any

import numpy as np

from ..sage_tsdb import AggregationType, TimeSeriesData
from .base import TimeSeriesAlgorithm


class WindowType(Enum):
    """Window types for aggregation"""

    TUMBLING = "tumbling"  # Non-overlapping fixed-size windows
    SLIDING = "sliding"  # Overlapping fixed-size windows
    SESSION = "session"  # Dynamic windows based on inactivity gap


@dataclass
class WindowConfig:
    """Configuration for windowing"""

    window_type: WindowType
    window_size: int  # milliseconds
    slide_interval: int | None = None  # for sliding windows (ms)
    session_gap: int | None = None  # for session windows (ms)
    aggregation: AggregationType = AggregationType.AVG


class WindowAggregator(TimeSeriesAlgorithm):
    """
    Window-based aggregation algorithm.

    Supports multiple windowing strategies:
    - Tumbling windows: Non-overlapping fixed-size windows
    - Sliding windows: Overlapping windows with configurable slide interval
    - Session windows: Dynamic windows based on inactivity gaps

    Features:
    - Multiple aggregation functions (sum, avg, min, max, count, etc.)
    - Efficient incremental computation
    - Support for late data handling
    """

    def __init__(self, config: dict[str, Any] | None = None):
        """
        Initialize window aggregator.

        Args:
            config: Configuration dictionary with:
                - window_type: Type of window (tumbling/sliding/session)
                - window_size: Window size in milliseconds
                - slide_interval: Slide interval for sliding windows (ms)
                - session_gap: Inactivity gap for session windows (ms)
                - aggregation: Aggregation function to apply
        """
        super().__init__(config)

        window_type_str = self.config.get("window_type", "tumbling")
        self.window_type = WindowType(window_type_str)
        self.window_size = self.config.get("window_size", 60000)  # 1 minute
        self.slide_interval = self.config.get("slide_interval", self.window_size)
        self.session_gap = self.config.get("session_gap", 30000)  # 30 seconds

        agg_str = self.config.get("aggregation", "avg")
        if isinstance(agg_str, str):
            self.aggregation = AggregationType(agg_str)
        else:
            self.aggregation = agg_str

        # State for incremental processing
        self.windows: dict[int, list[TimeSeriesData]] = {}
        self.stats = {
            "windows_created": 0,
            "windows_completed": 0,
            "data_points_processed": 0,
        }

    def process(self, data: list[TimeSeriesData], **kwargs) -> list[TimeSeriesData]:
        """
        Process time series data with windowing.

        Args:
            data: Input time series data points
            **kwargs: Additional parameters

        Returns:
            Aggregated time series data (one point per window)
        """
        if not data:
            return []

        # Sort data by timestamp
        sorted_data = sorted(data, key=lambda x: x.timestamp)

        # Apply windowing based on type
        if self.window_type == WindowType.TUMBLING:
            return self._tumbling_window(sorted_data)
        elif self.window_type == WindowType.SLIDING:
            return self._sliding_window(sorted_data)
        elif self.window_type == WindowType.SESSION:
            return self._session_window(sorted_data)

        return []

    def _tumbling_window(self, data: list[TimeSeriesData]) -> list[TimeSeriesData]:
        """Process with tumbling windows"""
        if not data:
            return []

        results = []
        window_start = self._align_to_window(data[0].timestamp)
        window_data = []

        for point in data:
            window_key = self._get_window_key(point.timestamp, window_start)

            # Check if point belongs to current window
            if window_key == window_start:
                window_data.append(point)
            else:
                # Complete current window
                if window_data:
                    agg_point = self._aggregate_window(window_data, window_start)
                    results.append(agg_point)
                    self.stats["windows_completed"] += 1

                # Start new window(s)
                # Handle potential gaps
                while window_key > window_start:
                    window_start += self.window_size

                window_data = [point]
                self.stats["windows_created"] += 1

        # Complete last window
        if window_data:
            agg_point = self._aggregate_window(window_data, window_start)
            results.append(agg_point)
            self.stats["windows_completed"] += 1

        self.stats["data_points_processed"] += len(data)
        return results

    def _sliding_window(self, data: list[TimeSeriesData]) -> list[TimeSeriesData]:
        """Process with sliding windows"""
        if not data:
            return []

        results = []

        # Get first window start
        first_timestamp = data[0].timestamp
        window_start = self._align_to_window(first_timestamp)

        # Create windows until we've covered all data
        last_timestamp = data[-1].timestamp

        while window_start <= last_timestamp:
            window_end = window_start + self.window_size

            # Get data points in this window
            window_data = [point for point in data if window_start <= point.timestamp < window_end]

            if window_data:
                agg_point = self._aggregate_window(window_data, window_start)
                results.append(agg_point)
                self.stats["windows_completed"] += 1

            # Slide to next window
            window_start += self.slide_interval
            self.stats["windows_created"] += 1

        self.stats["data_points_processed"] += len(data)
        return results

    def _session_window(self, data: list[TimeSeriesData]) -> list[TimeSeriesData]:
        """Process with session windows"""
        if not data:
            return []

        results = []
        session_data = []
        last_timestamp = data[0].timestamp
        session_start = data[0].timestamp

        for point in data:
            # Check if point is within session gap
            if point.timestamp - last_timestamp <= self.session_gap:
                session_data.append(point)
            else:
                # Complete current session
                if session_data:
                    agg_point = self._aggregate_window(session_data, session_start)
                    results.append(agg_point)
                    self.stats["windows_completed"] += 1

                # Start new session
                session_data = [point]
                session_start = point.timestamp
                self.stats["windows_created"] += 1

            last_timestamp = point.timestamp

        # Complete last session
        if session_data:
            agg_point = self._aggregate_window(session_data, session_start)
            results.append(agg_point)
            self.stats["windows_completed"] += 1

        self.stats["data_points_processed"] += len(data)
        return results

    def _align_to_window(self, timestamp: int) -> int:
        """Align timestamp to window boundary"""
        return (timestamp // self.window_size) * self.window_size

    def _get_window_key(self, timestamp: int, reference: int) -> int:
        """Get window key for timestamp"""
        return self._align_to_window(timestamp)

    def _aggregate_window(
        self, data: list[TimeSeriesData], window_timestamp: int
    ) -> TimeSeriesData:
        """Aggregate data in a window"""
        if not data:
            return TimeSeriesData(timestamp=window_timestamp, value=0.0)

        # Extract values
        values = []
        for point in data:
            # Flatten arrays/lists, append scalars
            if isinstance(point.value, (list, np.ndarray)):
                # Use np.ravel to flatten, then convert to list and extend
                values.extend(np.ravel(point.value).tolist())
            else:
                values.append(point.value)

        # Apply aggregation
        if self.aggregation == AggregationType.SUM:
            agg_value = sum(values)
        elif self.aggregation == AggregationType.AVG:
            agg_value = sum(values) / len(values)
        elif self.aggregation == AggregationType.MIN:
            agg_value = min(values)
        elif self.aggregation == AggregationType.MAX:
            agg_value = max(values)
        elif self.aggregation == AggregationType.COUNT:
            agg_value = len(values)
        elif self.aggregation == AggregationType.FIRST:
            agg_value = values[0]
        elif self.aggregation == AggregationType.LAST:
            agg_value = values[-1]
        elif self.aggregation == AggregationType.STDDEV:
            agg_value = float(np.std(values))
        else:
            agg_value = sum(values) / len(values)

        # Merge tags from all data points
        merged_tags = {}
        for point in data:
            if point.tags:
                merged_tags.update(point.tags)

        return TimeSeriesData(
            timestamp=window_timestamp,
            value=agg_value,
            tags=merged_tags,
            fields={"window_size": len(data), "aggregation": self.aggregation.value},
        )

    def reset(self):
        """Reset algorithm state"""
        self.windows = {}
        self.stats = {
            "windows_created": 0,
            "windows_completed": 0,
            "data_points_processed": 0,
        }

    def get_stats(self) -> dict[str, Any]:
        """Get aggregator statistics"""
        return {
            **self.stats,
            "active_windows": len(self.windows),
        }


__all__ = ["WindowAggregator", "WindowType", "WindowConfig"]
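As a worked example of the tumbling path: _align_to_window floors each timestamp to a multiple of window_size, so with one-minute windows the points at t=1000 and t=30000 share the [0, 60000) window and a point at t=61000 opens the next one. A hedged sketch, reusing the TimeSeriesData import path assumed in the join example above:

from sage.middleware.components.sage_tsdb.python.algorithms import WindowAggregator
from sage.middleware.components.sage_tsdb.python.sage_tsdb import TimeSeriesData  # assumed path, as above

agg = WindowAggregator(
    {
        "window_type": "tumbling",
        "window_size": 60_000,  # one-minute windows
        "aggregation": "avg",   # "avg" is a valid AggregationType value, per the default above
    }
)

points = [
    TimeSeriesData(timestamp=t, value=v)
    for t, v in [(1_000, 10.0), (30_000, 20.0), (61_000, 5.0)]
]

# Expect two output points: avg 15.0 for the window starting at 0, avg 5.0 for 60000.
for out in agg.process(points):
    print(out.timestamp, out.value)
print(agg.get_stats())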