isage_middleware-0.2.4.3-cp311-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isage_middleware-0.2.4.3.dist-info/METADATA +266 -0
- isage_middleware-0.2.4.3.dist-info/RECORD +94 -0
- isage_middleware-0.2.4.3.dist-info/WHEEL +5 -0
- isage_middleware-0.2.4.3.dist-info/top_level.txt +1 -0
- sage/middleware/__init__.py +59 -0
- sage/middleware/_version.py +6 -0
- sage/middleware/components/__init__.py +30 -0
- sage/middleware/components/extensions_compat.py +141 -0
- sage/middleware/components/sage_db/__init__.py +116 -0
- sage/middleware/components/sage_db/backend.py +136 -0
- sage/middleware/components/sage_db/service.py +15 -0
- sage/middleware/components/sage_flow/__init__.py +76 -0
- sage/middleware/components/sage_flow/python/__init__.py +14 -0
- sage/middleware/components/sage_flow/python/micro_service/__init__.py +4 -0
- sage/middleware/components/sage_flow/python/micro_service/sage_flow_service.py +88 -0
- sage/middleware/components/sage_flow/python/sage_flow.py +30 -0
- sage/middleware/components/sage_flow/service.py +14 -0
- sage/middleware/components/sage_mem/__init__.py +83 -0
- sage/middleware/components/sage_sias/__init__.py +59 -0
- sage/middleware/components/sage_sias/continual_learner.py +184 -0
- sage/middleware/components/sage_sias/coreset_selector.py +302 -0
- sage/middleware/components/sage_sias/types.py +94 -0
- sage/middleware/components/sage_tsdb/__init__.py +81 -0
- sage/middleware/components/sage_tsdb/python/__init__.py +21 -0
- sage/middleware/components/sage_tsdb/python/_sage_tsdb.pyi +17 -0
- sage/middleware/components/sage_tsdb/python/algorithms/__init__.py +17 -0
- sage/middleware/components/sage_tsdb/python/algorithms/base.py +51 -0
- sage/middleware/components/sage_tsdb/python/algorithms/out_of_order_join.py +248 -0
- sage/middleware/components/sage_tsdb/python/algorithms/window_aggregator.py +296 -0
- sage/middleware/components/sage_tsdb/python/micro_service/__init__.py +7 -0
- sage/middleware/components/sage_tsdb/python/micro_service/sage_tsdb_service.py +365 -0
- sage/middleware/components/sage_tsdb/python/sage_tsdb.py +523 -0
- sage/middleware/components/sage_tsdb/service.py +17 -0
- sage/middleware/components/vector_stores/__init__.py +25 -0
- sage/middleware/components/vector_stores/chroma.py +483 -0
- sage/middleware/components/vector_stores/chroma_adapter.py +185 -0
- sage/middleware/components/vector_stores/milvus.py +677 -0
- sage/middleware/operators/__init__.py +56 -0
- sage/middleware/operators/agent/__init__.py +24 -0
- sage/middleware/operators/agent/planning/__init__.py +5 -0
- sage/middleware/operators/agent/planning/llm_adapter.py +41 -0
- sage/middleware/operators/agent/planning/planner_adapter.py +98 -0
- sage/middleware/operators/agent/planning/router.py +107 -0
- sage/middleware/operators/agent/runtime.py +296 -0
- sage/middleware/operators/agentic/__init__.py +41 -0
- sage/middleware/operators/agentic/config.py +254 -0
- sage/middleware/operators/agentic/planning_operator.py +125 -0
- sage/middleware/operators/agentic/refined_searcher.py +132 -0
- sage/middleware/operators/agentic/runtime.py +241 -0
- sage/middleware/operators/agentic/timing_operator.py +125 -0
- sage/middleware/operators/agentic/tool_selection_operator.py +127 -0
- sage/middleware/operators/context/__init__.py +17 -0
- sage/middleware/operators/context/critic_evaluation.py +16 -0
- sage/middleware/operators/context/model_context.py +565 -0
- sage/middleware/operators/context/quality_label.py +12 -0
- sage/middleware/operators/context/search_query_results.py +61 -0
- sage/middleware/operators/context/search_result.py +42 -0
- sage/middleware/operators/context/search_session.py +79 -0
- sage/middleware/operators/filters/__init__.py +26 -0
- sage/middleware/operators/filters/context_sink.py +387 -0
- sage/middleware/operators/filters/context_source.py +376 -0
- sage/middleware/operators/filters/evaluate_filter.py +83 -0
- sage/middleware/operators/filters/tool_filter.py +74 -0
- sage/middleware/operators/llm/__init__.py +18 -0
- sage/middleware/operators/llm/sagellm_generator.py +432 -0
- sage/middleware/operators/rag/__init__.py +147 -0
- sage/middleware/operators/rag/arxiv.py +331 -0
- sage/middleware/operators/rag/chunk.py +13 -0
- sage/middleware/operators/rag/document_loaders.py +23 -0
- sage/middleware/operators/rag/evaluate.py +658 -0
- sage/middleware/operators/rag/generator.py +340 -0
- sage/middleware/operators/rag/index_builder/__init__.py +48 -0
- sage/middleware/operators/rag/index_builder/builder.py +363 -0
- sage/middleware/operators/rag/index_builder/manifest.py +101 -0
- sage/middleware/operators/rag/index_builder/storage.py +131 -0
- sage/middleware/operators/rag/pipeline.py +46 -0
- sage/middleware/operators/rag/profiler.py +59 -0
- sage/middleware/operators/rag/promptor.py +400 -0
- sage/middleware/operators/rag/refiner.py +231 -0
- sage/middleware/operators/rag/reranker.py +364 -0
- sage/middleware/operators/rag/retriever.py +1308 -0
- sage/middleware/operators/rag/searcher.py +37 -0
- sage/middleware/operators/rag/types.py +28 -0
- sage/middleware/operators/rag/writer.py +80 -0
- sage/middleware/operators/tools/__init__.py +71 -0
- sage/middleware/operators/tools/arxiv_paper_searcher.py +175 -0
- sage/middleware/operators/tools/arxiv_searcher.py +102 -0
- sage/middleware/operators/tools/duckduckgo_searcher.py +105 -0
- sage/middleware/operators/tools/image_captioner.py +104 -0
- sage/middleware/operators/tools/nature_news_fetcher.py +224 -0
- sage/middleware/operators/tools/searcher_tool.py +514 -0
- sage/middleware/operators/tools/text_detector.py +185 -0
- sage/middleware/operators/tools/url_text_extractor.py +104 -0
- sage/middleware/py.typed +2 -0
sage/middleware/components/sage_sias/continual_learner.py
@@ -0,0 +1,184 @@
+"""
+Online Continual Learning with Experience Replay
+
+Implements an experience replay buffer for online/incremental training that
+prevents catastrophic forgetting. The buffer is managed using coreset selection
+to retain the most valuable samples.
+
+This is a core component of SIAS (Streaming Importance-Aware Agent System).
+"""
+
+from __future__ import annotations
+
+import random
+from typing import Iterable, Optional, Sequence
+
+from .coreset_selector import CoresetSelector, SampleT, SelectionSummary
+
+
+class OnlineContinualLearner:
+    """
+    Maintain a replay buffer for online continual learning.
+
+    Implements experience replay to prevent catastrophic forgetting during
+    incremental/online training. The buffer is managed using coreset selection
+    to keep the most valuable samples.
+
+    Attributes:
+        buffer_size: Maximum number of samples to keep in buffer
+        replay_ratio: Ratio of replay samples to add per batch (e.g., 0.25 = 25%)
+        selector: CoresetSelector for buffer management
+
+    Example:
+        >>> learner = OnlineContinualLearner(buffer_size=2048, replay_ratio=0.25)
+        >>> for new_batch in data_stream:
+        ...     training_batch = learner.update_buffer(new_batch)
+        ...     train_step(training_batch)
+    """
+
+    def __init__(
+        self,
+        buffer_size: int = 2048,
+        replay_ratio: float = 0.3,
+        selector: Optional[CoresetSelector] = None,
+        random_seed: int = 17,
+    ) -> None:
+        """
+        Initialize OnlineContinualLearner.
+
+        Args:
+            buffer_size: Maximum samples to keep in replay buffer
+            replay_ratio: Fraction of batch size to sample from buffer
+            selector: CoresetSelector for buffer management (default: hybrid)
+            random_seed: Random seed for reproducibility
+        """
+        self.buffer_size = buffer_size
+        self.replay_ratio = replay_ratio
+        self.selector = selector or CoresetSelector(strategy="hybrid")
+        self._buffer: list[SampleT] = []
+        self._metrics: dict[str, float] = {}
+        self._rng = random.Random(random_seed)
+
+    @property
+    def buffer(self) -> list[SampleT]:
+        """Access the current buffer (read-only view)."""
+        return list(self._buffer)
+
+    @property
+    def buffer_len(self) -> int:
+        """Current number of samples in buffer."""
+        return len(self._buffer)
+
+    def update_buffer(
+        self,
+        new_samples: Sequence[SampleT],
+        metrics: Optional[dict[str, float]] = None,
+    ) -> list[SampleT]:
+        """
+        Update buffer with new samples and return training batch.
+
+        This method:
+        1. Adds new samples to the buffer
+        2. If buffer exceeds size limit, uses coreset selection to prune
+        3. Returns new samples + replay samples for training
+
+        Args:
+            new_samples: New samples to add to buffer
+            metrics: Optional metrics dict mapping sample_id to importance score
+
+        Returns:
+            Training batch combining new samples with replay samples
+        """
+        if not new_samples:
+            return list(self._buffer)
+
+        if metrics:
+            self._metrics.update(metrics)
+
+        # Combine buffer with new samples
+        combined = list(self._buffer) + list(new_samples)
+
+        # Prune if over capacity
+        if len(combined) > self.buffer_size:
+            combined = self.selector.select(
+                combined,
+                target_size=self.buffer_size,
+                metrics=self._metrics,
+            )
+            # Clean up metrics for removed samples
+            combined_ids = {self._get_sample_id(sample) for sample in combined}
+            self._metrics = {k: v for k, v in self._metrics.items() if k in combined_ids}
+
+        self._buffer = combined
+        return self._assemble_training_batch(new_samples)
+
+    def _assemble_training_batch(
+        self,
+        new_samples: Sequence[SampleT],
+    ) -> list[SampleT]:
+        """Combine new samples with replay samples."""
+        new_ids = {self._get_sample_id(s) for s in new_samples}
+        replay = self.sample_replay(len(new_samples), exclude=new_ids)
+        return list(new_samples) + replay
+
+    def sample_replay(
+        self,
+        new_batch_size: int,
+        *,
+        exclude: Optional[Iterable[str]] = None,
+    ) -> list[SampleT]:
+        """
+        Sample from replay buffer.
+
+        Args:
+            new_batch_size: Size of new batch (replay size = batch_size * ratio)
+            exclude: Sample IDs to exclude from replay
+
+        Returns:
+            List of replay samples
+        """
+        if not self._buffer or self.replay_ratio <= 0:
+            return []
+
+        exclude = set(exclude or [])
+        available = [
+            sample for sample in self._buffer if self._get_sample_id(sample) not in exclude
+        ]
+        if not available:
+            return []
+
+        replay_size = max(1, int(new_batch_size * self.replay_ratio))
+        replay_size = min(replay_size, len(available))
+        return self._rng.sample(available, replay_size)
+
+    def buffer_snapshot(self) -> list[SampleT]:
+        """Return a copy of the current buffer."""
+        return list(self._buffer)
+
+    def buffer_summary(self) -> SelectionSummary:
+        """Get summary statistics for the buffer."""
+        return SelectionSummary(
+            total_samples=len(self._buffer),
+            selected_samples=len(self._buffer),
+            strategy=f"buffer:{self.selector.strategy}",
+        )
+
+    def clear(self) -> None:
+        """Clear the buffer and metrics."""
+        self._buffer = []
+        self._metrics = {}
+
+    def update_metrics(self, metrics: dict[str, float]) -> None:
+        """
+        Update importance metrics for samples in buffer.
+
+        Args:
+            metrics: Dict mapping sample_id to importance score
+        """
+        self._metrics.update(metrics)
+
+    def _get_sample_id(self, sample: SampleT) -> str:
+        """Get sample_id from sample (supports dict or object)."""
+        if isinstance(sample, dict):
+            return sample.get("sample_id", sample.get("dialog_id", str(id(sample))))
+        return getattr(sample, "sample_id", getattr(sample, "dialog_id", str(id(sample))))
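For orientation, the replay loop this file implements is driven entirely through update_buffer and update_metrics. Below is a minimal, runnable sketch; load_stream_batches and train_step are hypothetical stand-ins for the caller's data source and trainer, not part of the package:

from sage.middleware.components.sage_sias.continual_learner import OnlineContinualLearner
from sage.middleware.components.sage_sias.types import wrap_sample


def load_stream_batches():
    # Hypothetical stand-in for a real streaming data source.
    yield ["first event text", "second event text"]


def train_step(batch):
    # Hypothetical stand-in: pretend the per-sample loss is the text length.
    return [float(len(s.text)) for s in batch]


learner = OnlineContinualLearner(buffer_size=2048, replay_ratio=0.3)

for step, raw_batch in enumerate(load_stream_batches()):
    batch = [wrap_sample(f"s{step}-{i}", text) for i, text in enumerate(raw_batch)]
    training_batch = learner.update_buffer(batch)  # new samples + replayed ones
    losses = train_step(training_batch)
    # Feed per-sample losses back so coreset pruning can keep the hard samples.
    learner.update_metrics({s.sample_id: loss for s, loss in zip(training_batch, losses)})

Feeding losses back matters because the default hybrid CoresetSelector reads them (via the metrics dict) when the buffer overflows and must decide what to evict.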
sage/middleware/components/sage_sias/coreset_selector.py
@@ -0,0 +1,302 @@
+"""
+Coreset Selection for Efficient Training
+
+Implements lightweight coreset selection strategies that identify the most
+valuable samples for training, reducing computational cost while maintaining
+model quality.
+
+Strategies:
+- loss_topk: Select samples with highest loss (most informative)
+- diversity: Select samples maximizing coverage of feature space
+- hybrid: Combination of loss-based and diversity-based selection
+- random: Uniform random sampling (baseline)
+
+This is a core component of SIAS (Streaming Importance-Aware Agent System).
+"""
+
+from __future__ import annotations
+
+import math
+import random
+import re
+from collections import Counter
+from dataclasses import dataclass
+from typing import Any, Optional, Protocol, Sequence, runtime_checkable
+
+
+@dataclass(slots=True)
+class SelectionSummary:
+    """Summary statistics for a selection operation."""
+
+    total_samples: int
+    selected_samples: int
+    strategy: str
+
+
+@runtime_checkable
+class SampleProtocol(Protocol):
+    """Protocol for samples that can be used with CoresetSelector."""
+
+    @property
+    def sample_id(self) -> str:
+        """Unique identifier for the sample."""
+        ...
+
+    @property
+    def text(self) -> str:
+        """Text content of the sample."""
+        ...
+
+    @property
+    def metadata(self) -> dict[str, Any]:
+        """Metadata dictionary."""
+        ...
+
+
+# Type alias for any sample that implements the protocol
+SampleT = Any  # Should implement SampleProtocol
+
+
+class CoresetSelector:
+    """
+    Implements lightweight coreset selection strategies.
+
+    This class provides several strategies for selecting a representative
+    subset of samples from a larger dataset, optimizing for training efficiency.
+
+    Attributes:
+        strategy: Selection strategy ("loss_topk", "diversity", "hybrid", "random")
+        metric_key: Key in metadata to use for loss-based selection
+        diversity_temperature: Temperature for diversity scoring
+        random_seed: Seed for reproducibility
+
+    Example:
+        >>> selector = CoresetSelector(strategy="hybrid")
+        >>> selected = selector.select(samples, target_size=1000)
+        >>> print(f"Selected {len(selected)} from {len(samples)} samples")
+    """
+
+    STRATEGIES = ("loss_topk", "diversity", "hybrid", "random")
+
+    def __init__(
+        self,
+        strategy: str = "loss_topk",
+        metric_key: str = "loss",
+        diversity_temperature: float = 0.7,
+        random_seed: int = 13,
+    ) -> None:
+        """
+        Initialize CoresetSelector.
+
+        Args:
+            strategy: Selection strategy to use
+            metric_key: Metadata key for loss-based selection
+            diversity_temperature: Temperature for diversity scoring
+            random_seed: Random seed for reproducibility
+        """
+        if strategy not in self.STRATEGIES:
+            raise ValueError(f"Unknown strategy: {strategy}. Choose from {self.STRATEGIES}")
+
+        self.strategy = strategy
+        self.metric_key = metric_key
+        self.diversity_temperature = diversity_temperature
+        self._rng = random.Random(random_seed)
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def select(
+        self,
+        samples: Sequence[SampleT],
+        *,
+        target_size: Optional[int],
+        metrics: Optional[dict[str, float]] = None,
+    ) -> list[SampleT]:
+        """
+        Select a subset of samples using the configured strategy.
+
+        Args:
+            samples: Input samples to select from
+            target_size: Number of samples to select (None = keep all)
+            metrics: Optional external metrics dict mapping sample_id to score
+
+        Returns:
+            List of selected samples
+        """
+        if target_size is None or target_size <= 0 or target_size >= len(samples):
+            return list(samples)
+
+        if self.strategy == "loss_topk":
+            return self._select_loss(samples, target_size, metrics)
+        if self.strategy == "diversity":
+            return self._select_diversity(samples, target_size)
+        if self.strategy == "hybrid":
+            return self._select_hybrid(samples, target_size, metrics)
+        return self._select_random(samples, target_size)
+
+    def summary(self, original_size: int, selected_size: int) -> SelectionSummary:
+        """Create a summary of the selection operation."""
+        return SelectionSummary(
+            total_samples=original_size,
+            selected_samples=selected_size,
+            strategy=self.strategy,
+        )
+
+    # ------------------------------------------------------------------
+    # Selection Strategies
+    # ------------------------------------------------------------------
+    def _select_loss(
+        self,
+        samples: Sequence[SampleT],
+        target_size: int,
+        metrics: Optional[dict[str, float]],
+    ) -> list[SampleT]:
+        """Select samples with highest loss/importance scores."""
+
+        def score(sample: SampleT) -> float:
+            sample_id = self._get_sample_id(sample)
+            if metrics and sample_id in metrics:
+                return metrics[sample_id]
+            meta = self._get_metadata(sample)
+            meta_val = meta.get(self.metric_key)
+            if isinstance(meta_val, (int, float)):
+                return float(meta_val)
+            return 0.0
+
+        ranked = sorted(samples, key=score, reverse=True)
+        return list(ranked[:target_size])
+
+    def _select_random(
+        self,
+        samples: Sequence[SampleT],
+        target_size: int,
+    ) -> list[SampleT]:
+        """Uniform random sampling."""
+        return self._rng.sample(list(samples), target_size)
+
+    def _select_hybrid(
+        self,
+        samples: Sequence[SampleT],
+        target_size: int,
+        metrics: Optional[dict[str, float]],
+    ) -> list[SampleT]:
+        """Hybrid selection: 60% loss-based + 40% diversity-based."""
+        loss_portion = int(target_size * 0.6)
+        div_portion = target_size - loss_portion
+
+        # First select high-loss samples
+        top_loss = self._select_loss(samples, loss_portion or 1, metrics)
+        top_loss_ids = {self._get_sample_id(s) for s in top_loss}
+
+        # Then select diverse samples from remaining
+        remaining = [s for s in samples if self._get_sample_id(s) not in top_loss_ids]
+        if not remaining:
+            return top_loss
+
+        diversity = self._select_diversity(remaining, max(div_portion, 1))
+        merged = (top_loss + diversity)[:target_size]
+        return merged
+
+    def _select_diversity(
+        self,
+        samples: Sequence[SampleT],
+        target_size: int,
+    ) -> list[SampleT]:
+        """Select samples maximizing feature space coverage."""
+        if not samples:
+            return []
+
+        # Extract features for all samples
+        features = {
+            self._get_sample_id(sample): self._text_features(self._get_text(sample))
+            for sample in samples
+        }
+
+        selected: list[SampleT] = []
+        candidates = list(samples)
+
+        # Start with the sample that has the highest token variance
+        scores = {
+            self._get_sample_id(sample): self._feature_norm(features[self._get_sample_id(sample)])
+            for sample in samples
+        }
+        first = max(candidates, key=lambda s: scores.get(self._get_sample_id(s), 0.0))
+        selected.append(first)
+        candidates = [s for s in candidates if self._get_sample_id(s) != self._get_sample_id(first)]
+
+        # Iteratively select most diverse samples
+        while candidates and len(selected) < target_size:
+            best_candidate = max(
+                candidates,
+                key=lambda sample: self._min_distance(sample, selected, features),
+            )
+            selected.append(best_candidate)
+            candidates = [
+                s
+                for s in candidates
+                if self._get_sample_id(s) != self._get_sample_id(best_candidate)
+            ]
+
+        return selected
+
+    # ------------------------------------------------------------------
+    # Feature Extraction Helpers
+    # ------------------------------------------------------------------
+    def _text_features(self, text: str) -> Counter:
+        """Extract normalized token frequency features from text."""
+        tokens = re.findall(r"[a-zA-Z0-9_]+", text.lower())
+        filtered = [token for token in tokens if len(token) > 2]
+        counts = Counter(filtered)
+        total = sum(counts.values()) or 1.0
+        for key in counts:
+            counts[key] /= total
+        return counts
+
+    def _feature_norm(self, features: Counter) -> float:
+        """Compute L2 norm of feature vector."""
+        return math.sqrt(sum(value * value for value in features.values()))
+
+    def _cosine_similarity(self, left: Counter, right: Counter) -> float:
+        """Compute cosine similarity between two feature vectors."""
+        keys = left.keys() & right.keys()
+        if not keys:
+            return 0.0
+        return sum(left[key] * right[key] for key in keys)
+
+    def _min_distance(
+        self,
+        candidate: SampleT,
+        selected: Sequence[SampleT],
+        features: dict[str, Counter],
+    ) -> float:
+        """Compute minimum distance from candidate to selected set."""
+        cand_feat = features[self._get_sample_id(candidate)]
+        if not selected:
+            return 1.0
+        sims = [
+            self._cosine_similarity(cand_feat, features[self._get_sample_id(item)])
+            for item in selected
+        ]
+        similarity = max(sims) if sims else 0.0
+        return 1.0 - similarity
+
+    # ------------------------------------------------------------------
+    # Sample Access Helpers (support both dict and object access)
+    # ------------------------------------------------------------------
+    def _get_sample_id(self, sample: SampleT) -> str:
+        """Get sample_id from sample (supports dict or object)."""
+        if isinstance(sample, dict):
+            return sample.get("sample_id", sample.get("dialog_id", str(id(sample))))
+        return getattr(sample, "sample_id", getattr(sample, "dialog_id", str(id(sample))))
+
+    def _get_text(self, sample: SampleT) -> str:
+        """Get text from sample (supports dict or object)."""
+        if isinstance(sample, dict):
+            return sample.get("text", "")
+        return getattr(sample, "text", "")
+
+    def _get_metadata(self, sample: SampleT) -> dict[str, Any]:
+        """Get metadata from sample (supports dict or object)."""
+        if isinstance(sample, dict):
+            return sample.get("metadata", {})
+        return getattr(sample, "metadata", {})
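Because the selector duck-types between dicts and objects, plain dicts are valid input. A small deterministic sketch; the texts and scores are invented for illustration:

from sage.middleware.components.sage_sias.coreset_selector import CoresetSelector

samples = [
    {"sample_id": "a", "text": "error while flushing the write-ahead log"},
    {"sample_id": "b", "text": "error while flushing the write-ahead log again"},
    {"sample_id": "c", "text": "user asked about pricing tiers and billing"},
]
# External metrics take precedence over metadata["loss"] when both exist.
metrics = {"a": 2.0, "b": 1.9, "c": 0.1}

selector = CoresetSelector(strategy="hybrid")
picked = selector.select(samples, target_size=2, metrics=metrics)
print([s["sample_id"] for s in picked])  # ['a', 'c'] with these inputs

The loss pass keeps "a"; the diversity pass (a greedy farthest-first traversal over normalized token-frequency features, seeded at the highest-norm vector) then picks "c" from the remainder, since its vocabulary is disjoint from the log-error samples.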
sage/middleware/components/sage_sias/types.py
@@ -0,0 +1,94 @@
+"""
+SIAS Core Data Types
+
+Defines the core data structures used across SIAS components.
+These are designed to be independent of specific data sources.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Protocol, runtime_checkable
+
+
+@dataclass(slots=True)
+class SIASSample:
+    """
+    Generic sample container for SIAS algorithms.
+
+    This is a lightweight data class that can wrap samples from various sources.
+    The only required fields are sample_id and text; everything else is optional.
+
+    Attributes:
+        sample_id: Unique identifier for this sample
+        text: The text content (or serialized representation)
+        metadata: Arbitrary metadata dictionary
+        importance_score: SIAS-computed importance score (set during training)
+    """
+
+    sample_id: str
+    text: str
+    metadata: dict[str, Any] = field(default_factory=dict)
+    importance_score: float = 0.0
+
+    def __hash__(self) -> int:
+        return hash(self.sample_id)
+
+    def __eq__(self, other: object) -> bool:
+        if isinstance(other, SIASSample):
+            return self.sample_id == other.sample_id
+        return False
+
+
+@runtime_checkable
+class SampleProtocol(Protocol):
+    """
+    Protocol for samples that can be used with SIAS algorithms.
+
+    Any class with these attributes can be used with CoresetSelector
+    and OnlineContinualLearner without modification.
+    """
+
+    @property
+    def sample_id(self) -> str:
+        """Unique identifier for the sample."""
+        ...
+
+    @property
+    def text(self) -> str:
+        """Text content of the sample."""
+        ...
+
+    @property
+    def metadata(self) -> dict[str, Any]:
+        """Metadata dictionary."""
+        ...
+
+
+# Backward compatibility alias
+# This allows existing code using ProcessedDialog to work with SIAS
+# by implementing the SampleProtocol
+Sample = SIASSample
+
+
+def wrap_sample(
+    sample_id: str,
+    text: str,
+    metadata: dict[str, Any] | None = None,
+    **kwargs: Any,
+) -> SIASSample:
+    """
+    Factory function to create a SIASSample.
+
+    Args:
+        sample_id: Unique identifier
+        text: Text content
+        metadata: Optional metadata dict
+        **kwargs: Additional metadata fields
+
+    Returns:
+        A new SIASSample instance
+    """
+    meta = dict(metadata) if metadata else {}  # copy so the caller's dict is not mutated
+    meta.update(kwargs)
+    return SIASSample(sample_id=sample_id, text=text, metadata=meta)
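Since SampleProtocol is declared runtime_checkable, isinstance only verifies that the three attributes exist (a structural check, not a nominal one). A minimal sketch; DialogRecord is an invented caller-side type:

from dataclasses import dataclass, field
from typing import Any

from sage.middleware.components.sage_sias.types import SampleProtocol, wrap_sample


@dataclass
class DialogRecord:
    # Hypothetical caller-side type; any object with these attributes qualifies.
    sample_id: str
    text: str
    metadata: dict[str, Any] = field(default_factory=dict)


record = DialogRecord(sample_id="d-1", text="hello")
print(isinstance(record, SampleProtocol))  # True

sample = wrap_sample("d-1", "hello", source="dialog")  # extra kwargs land in metadata
print(sample.metadata)  # {'source': 'dialog'}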
sage/middleware/components/sage_tsdb/__init__.py
@@ -0,0 +1,81 @@
+"""
+SAGE-TSDB: Time Series Database Component for SAGE
+
+Provides efficient time series data storage, querying, and processing capabilities
+for streaming and historical data analysis.
+
+Note: SAGE TSDB core is now an independent PyPI package (isage-tsdb).
+This module provides backward-compatible wrappers and SAGE-specific services.
+"""
+
+import warnings
+
+# Import from PyPI package (isage-tsdb)
+_SAGE_TSDB_AVAILABLE = False
+try:
+    from sage_tsdb import (
+        QueryConfig,
+        TimeRange,
+        TimeSeriesData,
+        TimeSeriesDB,
+        TimeSeriesIndex,
+    )
+
+    # Backward compatibility alias
+    SageTSDB = TimeSeriesDB
+    _SAGE_TSDB_AVAILABLE = True
+except ImportError as e:
+    # Don't fail immediately - allow graceful degradation
+    warnings.warn(
+        f"SAGE TSDB not available: {e}\n"
+        "Install with: pip install isage-tsdb\n"
+        "Time series features will be unavailable.",
+        UserWarning,
+        stacklevel=2,
+    )
+    # Provide stub exports
+    SageTSDB = None
+    TimeSeriesDB = None
+    TimeSeriesData = None
+    QueryConfig = None
+    TimeRange = None
+    TimeSeriesIndex = None
+
+# Algorithms (SAGE-specific extensions)
+# Only import if base package is available
+if _SAGE_TSDB_AVAILABLE:
+    from .python.algorithms import (
+        OutOfOrderStreamJoin,
+        TimeSeriesAlgorithm,
+        WindowAggregator,
+    )
+
+    # Micro-service wrapper (SAGE-specific)
+    from .python.micro_service.sage_tsdb_service import (
+        SageTSDBService,
+        SageTSDBServiceConfig,
+    )
+else:
+    # Stub classes if TSDB not available
+    TimeSeriesAlgorithm = None
+    OutOfOrderStreamJoin = None
+    WindowAggregator = None
+    SageTSDBService = None
+    SageTSDBServiceConfig = None
+
+__all__ = [
+    # Core API (may be None if not installed)
+    "SageTSDB",
+    "TimeSeriesData",
+    "QueryConfig",
+    "TimeRange",
+    # Service
+    "SageTSDBService",
+    "SageTSDBServiceConfig",
+    # Algorithms
+    "TimeSeriesAlgorithm",
+    "OutOfOrderStreamJoin",
+    "WindowAggregator",
+    # Availability flag
+    "_SAGE_TSDB_AVAILABLE",
+]
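Because a missing isage-tsdb degrades the module to None exports instead of raising at import time, callers are expected to gate on the availability flag before touching the API. A minimal sketch; the SageTSDB constructor belongs to the external isage-tsdb package and is not shown in this diff, so the sketch stops at the class lookup:

from sage.middleware.components import sage_tsdb as tsdb

if tsdb._SAGE_TSDB_AVAILABLE:
    db_cls = tsdb.SageTSDB  # backward-compatible alias for sage_tsdb.TimeSeriesDB
else:
    raise RuntimeError("Time series features disabled; install with: pip install isage-tsdb")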
sage/middleware/components/sage_tsdb/python/__init__.py
@@ -0,0 +1,21 @@
+"""
+Python package for SageTSDB
+
+This package provides both high-performance C++ bindings and pure Python implementations
+for time series database operations.
+"""
+
+try:
+    # Try to import C++ bindings first
+    from . import _sage_tsdb
+
+    TSDB_BACKEND = "cpp"
+except ImportError:
+    # Fall back to the pure Python implementation
+    _sage_tsdb = None
+    TSDB_BACKEND = "python"
+
+# Import Python APIs (these wrap C++ or pure Python implementations)
+from . import algorithms, sage_tsdb
+
+__all__ = ["sage_tsdb", "algorithms", "_sage_tsdb", "TSDB_BACKEND"]
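The TSDB_BACKEND flag makes the binding choice observable without poking at _sage_tsdb directly, which is mainly useful in logs and tests; for example:

from sage.middleware.components.sage_tsdb import python as tsdb_python

# "cpp" when the pybind11 extension imported cleanly, "python" otherwise.
print(f"TSDB backend in use: {tsdb_python.TSDB_BACKEND}")
assert tsdb_python.TSDB_BACKEND in ("cpp", "python")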
sage/middleware/components/sage_tsdb/python/_sage_tsdb.pyi
@@ -0,0 +1,17 @@
+"""
+Type stub for the SAGE TSDB C++ extension module.
+
+This is a compiled C++ extension module created via pybind11.
+The actual implementation is in C++; this file provides type hints for Python.
+"""
+
+# Basic type hints for the C++ extension.
+# Add specific function/class signatures as needed once the API is known.
+
+class SageTSDB:
+    """SAGE TSDB C++ extension interface"""
+
+    def __init__(self) -> None: ...
+    # Add more methods as needed
+
+# Add other exported symbols from the C++ module as needed
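The stub is deliberately a skeleton. A hypothetical illustration of how it might be fleshed out once the pybind11 API is pinned down; the insert and query signatures below are invented placeholders, not the real extension API:

# _sage_tsdb.pyi - hypothetical expanded stub; method signatures are placeholders only.
class SageTSDB:
    """SAGE TSDB C++ extension interface"""

    def __init__(self) -> None: ...
    def insert(self, timestamp: int, value: float) -> None: ...  # invented
    def query(self, start: int, end: int) -> list[float]: ...    # invented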