isage_middleware-0.2.4.3-cp311-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. isage_middleware-0.2.4.3.dist-info/METADATA +266 -0
  2. isage_middleware-0.2.4.3.dist-info/RECORD +94 -0
  3. isage_middleware-0.2.4.3.dist-info/WHEEL +5 -0
  4. isage_middleware-0.2.4.3.dist-info/top_level.txt +1 -0
  5. sage/middleware/__init__.py +59 -0
  6. sage/middleware/_version.py +6 -0
  7. sage/middleware/components/__init__.py +30 -0
  8. sage/middleware/components/extensions_compat.py +141 -0
  9. sage/middleware/components/sage_db/__init__.py +116 -0
  10. sage/middleware/components/sage_db/backend.py +136 -0
  11. sage/middleware/components/sage_db/service.py +15 -0
  12. sage/middleware/components/sage_flow/__init__.py +76 -0
  13. sage/middleware/components/sage_flow/python/__init__.py +14 -0
  14. sage/middleware/components/sage_flow/python/micro_service/__init__.py +4 -0
  15. sage/middleware/components/sage_flow/python/micro_service/sage_flow_service.py +88 -0
  16. sage/middleware/components/sage_flow/python/sage_flow.py +30 -0
  17. sage/middleware/components/sage_flow/service.py +14 -0
  18. sage/middleware/components/sage_mem/__init__.py +83 -0
  19. sage/middleware/components/sage_sias/__init__.py +59 -0
  20. sage/middleware/components/sage_sias/continual_learner.py +184 -0
  21. sage/middleware/components/sage_sias/coreset_selector.py +302 -0
  22. sage/middleware/components/sage_sias/types.py +94 -0
  23. sage/middleware/components/sage_tsdb/__init__.py +81 -0
  24. sage/middleware/components/sage_tsdb/python/__init__.py +21 -0
  25. sage/middleware/components/sage_tsdb/python/_sage_tsdb.pyi +17 -0
  26. sage/middleware/components/sage_tsdb/python/algorithms/__init__.py +17 -0
  27. sage/middleware/components/sage_tsdb/python/algorithms/base.py +51 -0
  28. sage/middleware/components/sage_tsdb/python/algorithms/out_of_order_join.py +248 -0
  29. sage/middleware/components/sage_tsdb/python/algorithms/window_aggregator.py +296 -0
  30. sage/middleware/components/sage_tsdb/python/micro_service/__init__.py +7 -0
  31. sage/middleware/components/sage_tsdb/python/micro_service/sage_tsdb_service.py +365 -0
  32. sage/middleware/components/sage_tsdb/python/sage_tsdb.py +523 -0
  33. sage/middleware/components/sage_tsdb/service.py +17 -0
  34. sage/middleware/components/vector_stores/__init__.py +25 -0
  35. sage/middleware/components/vector_stores/chroma.py +483 -0
  36. sage/middleware/components/vector_stores/chroma_adapter.py +185 -0
  37. sage/middleware/components/vector_stores/milvus.py +677 -0
  38. sage/middleware/operators/__init__.py +56 -0
  39. sage/middleware/operators/agent/__init__.py +24 -0
  40. sage/middleware/operators/agent/planning/__init__.py +5 -0
  41. sage/middleware/operators/agent/planning/llm_adapter.py +41 -0
  42. sage/middleware/operators/agent/planning/planner_adapter.py +98 -0
  43. sage/middleware/operators/agent/planning/router.py +107 -0
  44. sage/middleware/operators/agent/runtime.py +296 -0
  45. sage/middleware/operators/agentic/__init__.py +41 -0
  46. sage/middleware/operators/agentic/config.py +254 -0
  47. sage/middleware/operators/agentic/planning_operator.py +125 -0
  48. sage/middleware/operators/agentic/refined_searcher.py +132 -0
  49. sage/middleware/operators/agentic/runtime.py +241 -0
  50. sage/middleware/operators/agentic/timing_operator.py +125 -0
  51. sage/middleware/operators/agentic/tool_selection_operator.py +127 -0
  52. sage/middleware/operators/context/__init__.py +17 -0
  53. sage/middleware/operators/context/critic_evaluation.py +16 -0
  54. sage/middleware/operators/context/model_context.py +565 -0
  55. sage/middleware/operators/context/quality_label.py +12 -0
  56. sage/middleware/operators/context/search_query_results.py +61 -0
  57. sage/middleware/operators/context/search_result.py +42 -0
  58. sage/middleware/operators/context/search_session.py +79 -0
  59. sage/middleware/operators/filters/__init__.py +26 -0
  60. sage/middleware/operators/filters/context_sink.py +387 -0
  61. sage/middleware/operators/filters/context_source.py +376 -0
  62. sage/middleware/operators/filters/evaluate_filter.py +83 -0
  63. sage/middleware/operators/filters/tool_filter.py +74 -0
  64. sage/middleware/operators/llm/__init__.py +18 -0
  65. sage/middleware/operators/llm/sagellm_generator.py +432 -0
  66. sage/middleware/operators/rag/__init__.py +147 -0
  67. sage/middleware/operators/rag/arxiv.py +331 -0
  68. sage/middleware/operators/rag/chunk.py +13 -0
  69. sage/middleware/operators/rag/document_loaders.py +23 -0
  70. sage/middleware/operators/rag/evaluate.py +658 -0
  71. sage/middleware/operators/rag/generator.py +340 -0
  72. sage/middleware/operators/rag/index_builder/__init__.py +48 -0
  73. sage/middleware/operators/rag/index_builder/builder.py +363 -0
  74. sage/middleware/operators/rag/index_builder/manifest.py +101 -0
  75. sage/middleware/operators/rag/index_builder/storage.py +131 -0
  76. sage/middleware/operators/rag/pipeline.py +46 -0
  77. sage/middleware/operators/rag/profiler.py +59 -0
  78. sage/middleware/operators/rag/promptor.py +400 -0
  79. sage/middleware/operators/rag/refiner.py +231 -0
  80. sage/middleware/operators/rag/reranker.py +364 -0
  81. sage/middleware/operators/rag/retriever.py +1308 -0
  82. sage/middleware/operators/rag/searcher.py +37 -0
  83. sage/middleware/operators/rag/types.py +28 -0
  84. sage/middleware/operators/rag/writer.py +80 -0
  85. sage/middleware/operators/tools/__init__.py +71 -0
  86. sage/middleware/operators/tools/arxiv_paper_searcher.py +175 -0
  87. sage/middleware/operators/tools/arxiv_searcher.py +102 -0
  88. sage/middleware/operators/tools/duckduckgo_searcher.py +105 -0
  89. sage/middleware/operators/tools/image_captioner.py +104 -0
  90. sage/middleware/operators/tools/nature_news_fetcher.py +224 -0
  91. sage/middleware/operators/tools/searcher_tool.py +514 -0
  92. sage/middleware/operators/tools/text_detector.py +185 -0
  93. sage/middleware/operators/tools/url_text_extractor.py +104 -0
  94. sage/middleware/py.typed +2 -0
sage/middleware/components/sage_sias/continual_learner.py
@@ -0,0 +1,184 @@
+ """
+ Online Continual Learning with Experience Replay
+
+ Implements an experience replay buffer for online/incremental training that
+ prevents catastrophic forgetting. The buffer is managed using coreset selection
+ to retain the most valuable samples.
+
+ This is a core component of SIAS (Streaming Importance-Aware Agent System).
+ """
+
+ from __future__ import annotations
+
+ import random
+ from typing import Iterable, Optional, Sequence
+
+ from .coreset_selector import CoresetSelector, SampleT, SelectionSummary
+
+
+ class OnlineContinualLearner:
+     """
+     Maintain a replay buffer for online continual learning.
+
+     Implements experience replay to prevent catastrophic forgetting during
+     incremental/online training. The buffer is managed using coreset selection
+     to keep the most valuable samples.
+
+     Attributes:
+         buffer_size: Maximum number of samples to keep in buffer
+         replay_ratio: Ratio of replay samples to add per batch (e.g., 0.25 = 25%)
+         selector: CoresetSelector for buffer management
+
+     Example:
+         >>> learner = OnlineContinualLearner(buffer_size=2048, replay_ratio=0.25)
+         >>> for new_batch in data_stream:
+         ...     training_batch = learner.update_buffer(new_batch)
+         ...     train_step(training_batch)
+     """
+
+     def __init__(
+         self,
+         buffer_size: int = 2048,
+         replay_ratio: float = 0.3,
+         selector: Optional[CoresetSelector] = None,
+         random_seed: int = 17,
+     ) -> None:
+         """
+         Initialize OnlineContinualLearner.
+
+         Args:
+             buffer_size: Maximum samples to keep in replay buffer
+             replay_ratio: Fraction of batch size to sample from buffer
+             selector: CoresetSelector for buffer management (default: hybrid)
+             random_seed: Random seed for reproducibility
+         """
+         self.buffer_size = buffer_size
+         self.replay_ratio = replay_ratio
+         self.selector = selector or CoresetSelector(strategy="hybrid")
+         self._buffer: list[SampleT] = []
+         self._metrics: dict[str, float] = {}
+         self._rng = random.Random(random_seed)
+
+     @property
+     def buffer(self) -> list[SampleT]:
+         """Access the current buffer (read-only view)."""
+         return list(self._buffer)
+
+     @property
+     def buffer_len(self) -> int:
+         """Current number of samples in buffer."""
+         return len(self._buffer)
+
+     def update_buffer(
+         self,
+         new_samples: Sequence[SampleT],
+         metrics: Optional[dict[str, float]] = None,
+     ) -> list[SampleT]:
+         """
+         Update buffer with new samples and return training batch.
+
+         This method:
+         1. Adds new samples to the buffer
+         2. If buffer exceeds size limit, uses coreset selection to prune
+         3. Returns new samples + replay samples for training
+
+         Args:
+             new_samples: New samples to add to buffer
+             metrics: Optional metrics dict mapping sample_id to importance score
+
+         Returns:
+             Training batch combining new samples with replay samples
+         """
+         if not new_samples:
+             return list(self._buffer)
+
+         if metrics:
+             self._metrics.update(metrics)
+
+         # Combine buffer with new samples
+         combined = list(self._buffer) + list(new_samples)
+
+         # Prune if over capacity
+         if len(combined) > self.buffer_size:
+             combined = self.selector.select(
+                 combined,
+                 target_size=self.buffer_size,
+                 metrics=self._metrics,
+             )
+             # Clean up metrics for removed samples
+             combined_ids = {self._get_sample_id(sample) for sample in combined}
+             self._metrics = {k: v for k, v in self._metrics.items() if k in combined_ids}
+
+         self._buffer = combined
+         return self._assemble_training_batch(new_samples)
+
+     def _assemble_training_batch(
+         self,
+         new_samples: Sequence[SampleT],
+     ) -> list[SampleT]:
+         """Combine new samples with replay samples."""
+         new_ids = {self._get_sample_id(s) for s in new_samples}
+         replay = self.sample_replay(len(new_samples), exclude=new_ids)
+         return list(new_samples) + replay
+
+     def sample_replay(
+         self,
+         new_batch_size: int,
+         *,
+         exclude: Optional[Iterable[str]] = None,
+     ) -> list[SampleT]:
+         """
+         Sample from replay buffer.
+
+         Args:
+             new_batch_size: Size of new batch (replay size = batch_size * ratio)
+             exclude: Sample IDs to exclude from replay
+
+         Returns:
+             List of replay samples
+         """
+         if not self._buffer or self.replay_ratio <= 0:
+             return []
+
+         exclude = set(exclude or [])
+         available = [
+             sample for sample in self._buffer if self._get_sample_id(sample) not in exclude
+         ]
+         if not available:
+             return []
+
+         replay_size = max(1, int(new_batch_size * self.replay_ratio))
+         replay_size = min(replay_size, len(available))
+         return self._rng.sample(available, replay_size)
+
+     def buffer_snapshot(self) -> list[SampleT]:
+         """Return a copy of the current buffer."""
+         return list(self._buffer)
+
+     def buffer_summary(self) -> SelectionSummary:
+         """Get summary statistics for the buffer."""
+         return SelectionSummary(
+             total_samples=len(self._buffer),
+             selected_samples=len(self._buffer),
+             strategy=f"buffer:{self.selector.strategy}",
+         )
+
+     def clear(self) -> None:
+         """Clear the buffer and metrics."""
+         self._buffer = []
+         self._metrics = {}
+
+     def update_metrics(self, metrics: dict[str, float]) -> None:
+         """
+         Update importance metrics for samples in buffer.
+
+         Args:
+             metrics: Dict mapping sample_id to importance score
+         """
+         self._metrics.update(metrics)
+
+     def _get_sample_id(self, sample: SampleT) -> str:
+         """Get sample_id from sample (supports dict or object)."""
+         if isinstance(sample, dict):
+             return sample.get("sample_id", sample.get("dialog_id", str(id(sample))))
+         return getattr(sample, "sample_id", getattr(sample, "dialog_id", str(id(sample))))
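
The replay loop is easiest to see end to end. The sketch below is annotation only, not part of the package; the import paths simply mirror the file layout under "Files changed", the sample dicts and metric values are invented, and a deliberately tiny buffer_size is used so that pruning is triggered on the third batch.

# Hypothetical usage sketch; import paths follow the file layout shown above.
from sage.middleware.components.sage_sias.continual_learner import OnlineContinualLearner
from sage.middleware.components.sage_sias.coreset_selector import CoresetSelector

learner = OnlineContinualLearner(
    buffer_size=8,                    # tiny buffer so coreset pruning kicks in quickly
    replay_ratio=0.5,                 # replay up to half a batch of old samples
    selector=CoresetSelector(strategy="loss_topk"),
)

# Three incoming batches of four dict samples each (dicts are supported by _get_sample_id).
stream = [
    [{"sample_id": f"s{step}-{i}", "text": f"sample {step}-{i}"} for i in range(4)]
    for step in range(3)
]

for step, batch in enumerate(stream):
    # Higher score -> more likely to survive pruning under the loss_topk strategy.
    metrics = {s["sample_id"]: float(step + i) for i, s in enumerate(batch)}
    training_batch = learner.update_buffer(batch, metrics=metrics)
    print(step, len(training_batch), learner.buffer_len)

With these settings the first training batch contains only the new samples (the buffer starts empty), later batches mix in up to replay_ratio * batch_size replayed samples, and the buffer never grows past buffer_size.
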
sage/middleware/components/sage_sias/coreset_selector.py
@@ -0,0 +1,302 @@
+ """
+ Coreset Selection for Efficient Training
+
+ Implements lightweight coreset selection strategies that identify the most
+ valuable samples for training, reducing computational cost while maintaining
+ model quality.
+
+ Strategies:
+ - loss_topk: Select samples with highest loss (most informative)
+ - diversity: Select samples maximizing coverage of feature space
+ - hybrid: Combination of loss-based and diversity-based selection
+ - random: Uniform random sampling (baseline)
+
+ This is a core component of SIAS (Streaming Importance-Aware Agent System).
+ """
+
+ from __future__ import annotations
+
+ import math
+ import random
+ import re
+ from collections import Counter
+ from dataclasses import dataclass
+ from typing import Any, Optional, Protocol, Sequence, runtime_checkable
+
+
+ @dataclass(slots=True)
+ class SelectionSummary:
+     """Summary statistics for a selection operation."""
+
+     total_samples: int
+     selected_samples: int
+     strategy: str
+
+
+ @runtime_checkable
+ class SampleProtocol(Protocol):
+     """Protocol for samples that can be used with CoresetSelector."""
+
+     @property
+     def sample_id(self) -> str:
+         """Unique identifier for the sample."""
+         ...
+
+     @property
+     def text(self) -> str:
+         """Text content of the sample."""
+         ...
+
+     @property
+     def metadata(self) -> dict[str, Any]:
+         """Metadata dictionary."""
+         ...
+
+
+ # Type alias for any sample that implements the protocol
+ SampleT = Any  # Should implement SampleProtocol
+
+
+ class CoresetSelector:
+     """
+     Implements lightweight coreset selection strategies.
+
+     This class provides several strategies for selecting a representative
+     subset of samples from a larger dataset, optimizing for training efficiency.
+
+     Attributes:
+         strategy: Selection strategy ("loss_topk", "diversity", "hybrid", "random")
+         metric_key: Key in metadata to use for loss-based selection
+         diversity_temperature: Temperature for diversity scoring
+         random_seed: Seed for reproducibility
+
+     Example:
+         >>> selector = CoresetSelector(strategy="hybrid")
+         >>> selected = selector.select(samples, target_size=1000)
+         >>> print(f"Selected {len(selected)} from {len(samples)} samples")
+     """
+
+     STRATEGIES = ("loss_topk", "diversity", "hybrid", "random")
+
+     def __init__(
+         self,
+         strategy: str = "loss_topk",
+         metric_key: str = "loss",
+         diversity_temperature: float = 0.7,
+         random_seed: int = 13,
+     ) -> None:
+         """
+         Initialize CoresetSelector.
+
+         Args:
+             strategy: Selection strategy to use
+             metric_key: Metadata key for loss-based selection
+             diversity_temperature: Temperature for diversity scoring
+             random_seed: Random seed for reproducibility
+         """
+         if strategy not in self.STRATEGIES:
+             raise ValueError(f"Unknown strategy: {strategy}. Choose from {self.STRATEGIES}")
+
+         self.strategy = strategy
+         self.metric_key = metric_key
+         self.diversity_temperature = diversity_temperature
+         self._rng = random.Random(random_seed)
+
+     # ------------------------------------------------------------------
+     # Public API
+     # ------------------------------------------------------------------
+     def select(
+         self,
+         samples: Sequence[SampleT],
+         *,
+         target_size: Optional[int],
+         metrics: Optional[dict[str, float]] = None,
+     ) -> list[SampleT]:
+         """
+         Select a subset of samples using the configured strategy.
+
+         Args:
+             samples: Input samples to select from
+             target_size: Number of samples to select (None = keep all)
+             metrics: Optional external metrics dict mapping sample_id to score
+
+         Returns:
+             List of selected samples
+         """
+         if target_size is None or target_size <= 0 or target_size >= len(samples):
+             return list(samples)
+
+         if self.strategy == "loss_topk":
+             return self._select_loss(samples, target_size, metrics)
+         if self.strategy == "diversity":
+             return self._select_diversity(samples, target_size)
+         if self.strategy == "hybrid":
+             return self._select_hybrid(samples, target_size, metrics)
+         return self._select_random(samples, target_size)
+
+     def summary(self, original_size: int, selected_size: int) -> SelectionSummary:
+         """Create a summary of the selection operation."""
+         return SelectionSummary(
+             total_samples=original_size,
+             selected_samples=selected_size,
+             strategy=self.strategy,
+         )
+
+     # ------------------------------------------------------------------
+     # Selection Strategies
+     # ------------------------------------------------------------------
+     def _select_loss(
+         self,
+         samples: Sequence[SampleT],
+         target_size: int,
+         metrics: Optional[dict[str, float]],
+     ) -> list[SampleT]:
+         """Select samples with highest loss/importance scores."""
+
+         def score(sample: SampleT) -> float:
+             sample_id = self._get_sample_id(sample)
+             if metrics and sample_id in metrics:
+                 return metrics[sample_id]
+             meta = self._get_metadata(sample)
+             meta_val = meta.get(self.metric_key)
+             if isinstance(meta_val, (int, float)):
+                 return float(meta_val)
+             return 0.0
+
+         ranked = sorted(samples, key=score, reverse=True)
+         return list(ranked[:target_size])
+
+     def _select_random(
+         self,
+         samples: Sequence[SampleT],
+         target_size: int,
+     ) -> list[SampleT]:
+         """Uniform random sampling."""
+         return self._rng.sample(list(samples), target_size)
+
+     def _select_hybrid(
+         self,
+         samples: Sequence[SampleT],
+         target_size: int,
+         metrics: Optional[dict[str, float]],
+     ) -> list[SampleT]:
+         """Hybrid selection: 60% loss-based + 40% diversity-based."""
+         loss_portion = int(target_size * 0.6)
+         div_portion = target_size - loss_portion
+
+         # First select high-loss samples
+         top_loss = self._select_loss(samples, loss_portion or 1, metrics)
+         top_loss_ids = {self._get_sample_id(s) for s in top_loss}
+
+         # Then select diverse samples from remaining
+         remaining = [s for s in samples if self._get_sample_id(s) not in top_loss_ids]
+         if not remaining:
+             return top_loss
+
+         diversity = self._select_diversity(remaining, max(div_portion, 1))
+         merged = (top_loss + diversity)[:target_size]
+         return merged
+
+     def _select_diversity(
+         self,
+         samples: Sequence[SampleT],
+         target_size: int,
+     ) -> list[SampleT]:
+         """Select samples maximizing feature space coverage."""
+         if not samples:
+             return []
+
+         # Extract features for all samples
+         features = {
+             self._get_sample_id(sample): self._text_features(self._get_text(sample))
+             for sample in samples
+         }
+
+         selected: list[SampleT] = []
+         candidates = list(samples)
+
+         # Start with the sample that has the highest token variance
+         scores = {
+             self._get_sample_id(sample): self._feature_norm(features[self._get_sample_id(sample)])
+             for sample in samples
+         }
+         first = max(candidates, key=lambda s: scores.get(self._get_sample_id(s), 0.0))
+         selected.append(first)
+         candidates = [s for s in candidates if self._get_sample_id(s) != self._get_sample_id(first)]
+
+         # Iteratively select most diverse samples
+         while candidates and len(selected) < target_size:
+             best_candidate = max(
+                 candidates,
+                 key=lambda sample: self._min_distance(sample, selected, features),
+             )
+             selected.append(best_candidate)
+             candidates = [
+                 s
+                 for s in candidates
+                 if self._get_sample_id(s) != self._get_sample_id(best_candidate)
+             ]
+
+         return selected
+
+     # ------------------------------------------------------------------
+     # Feature Extraction Helpers
+     # ------------------------------------------------------------------
+     def _text_features(self, text: str) -> Counter:
+         """Extract normalized token frequency features from text."""
+         tokens = re.findall(r"[a-zA-Z0-9_]+", text.lower())
+         filtered = [token for token in tokens if len(token) > 2]
+         counts = Counter(filtered)
+         total = sum(counts.values()) or 1.0
+         for key in counts:
+             counts[key] /= total
+         return counts
+
+     def _feature_norm(self, features: Counter) -> float:
+         """Compute L2 norm of feature vector."""
+         return math.sqrt(sum(value * value for value in features.values()))
+
+     def _cosine_similarity(self, left: Counter, right: Counter) -> float:
+         """Compute cosine similarity between two feature vectors."""
+         keys = left.keys() & right.keys()
+         if not keys:
+             return 0.0
+         return sum(left[key] * right[key] for key in keys)
+
+     def _min_distance(
+         self,
+         candidate: SampleT,
+         selected: Sequence[SampleT],
+         features: dict[str, Counter],
+     ) -> float:
+         """Compute minimum distance from candidate to selected set."""
+         cand_feat = features[self._get_sample_id(candidate)]
+         if not selected:
+             return 1.0
+         sims = [
+             self._cosine_similarity(cand_feat, features[self._get_sample_id(item)])
+             for item in selected
+         ]
+         similarity = max(sims) if sims else 0.0
+         return 1.0 - similarity
+
+     # ------------------------------------------------------------------
+     # Sample Access Helpers (support both dict and object access)
+     # ------------------------------------------------------------------
+     def _get_sample_id(self, sample: SampleT) -> str:
+         """Get sample_id from sample (supports dict or object)."""
+         if isinstance(sample, dict):
+             return sample.get("sample_id", sample.get("dialog_id", str(id(sample))))
+         return getattr(sample, "sample_id", getattr(sample, "dialog_id", str(id(sample))))
+
+     def _get_text(self, sample: SampleT) -> str:
+         """Get text from sample (supports dict or object)."""
+         if isinstance(sample, dict):
+             return sample.get("text", "")
+         return getattr(sample, "text", "")
+
+     def _get_metadata(self, sample: SampleT) -> dict[str, Any]:
+         """Get metadata from sample (supports dict or object)."""
+         if isinstance(sample, dict):
+             return sample.get("metadata", {})
+         return getattr(sample, "metadata", {})
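
To contrast the strategies, here is a small hand-made example (again annotation, not package code); the sample texts and loss values are invented, and the import path mirrors the file layout above.

# Illustrative sketch only; dict samples are accepted via the _get_* helpers.
from sage.middleware.components.sage_sias.coreset_selector import CoresetSelector

samples = [
    {"sample_id": "a", "text": "retrieval augmented generation pipeline", "metadata": {"loss": 0.9}},
    {"sample_id": "b", "text": "retrieval augmented generation pipeline", "metadata": {"loss": 0.8}},
    {"sample_id": "c", "text": "time series window aggregation join", "metadata": {"loss": 0.1}},
    {"sample_id": "d", "text": "vector store chroma milvus adapters", "metadata": {"loss": 0.2}},
]

# loss_topk ranks by the "loss" metadata key (or an external metrics dict) and
# keeps the highest-scoring samples, here the two near-duplicate texts a and b.
by_loss = CoresetSelector(strategy="loss_topk").select(samples, target_size=2)

# diversity greedily picks samples whose token-frequency features are farthest
# from what is already selected, so it spreads across different texts instead.
by_div = CoresetSelector(strategy="diversity").select(samples, target_size=2)

print([s["sample_id"] for s in by_loss], [s["sample_id"] for s in by_div])

Note that target_size is keyword-only in select(), and passing None (or a size at least as large as the input) returns the samples unchanged.
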
sage/middleware/components/sage_sias/types.py
@@ -0,0 +1,94 @@
+ """
+ SIAS Core Data Types
+
+ Defines the core data structures used across SIAS components.
+ These are designed to be independent of specific data sources.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any, Protocol, runtime_checkable
+
+
+ @dataclass(slots=True)
+ class SIASSample:
+     """
+     Generic sample container for SIAS algorithms.
+
+     This is a lightweight data class that can wrap samples from various sources.
+     The only required fields are sample_id and text; everything else is optional.
+
+     Attributes:
+         sample_id: Unique identifier for this sample
+         text: The text content (or serialized representation)
+         metadata: Arbitrary metadata dictionary
+         importance_score: SSIS-computed importance score (set during training)
+     """
+
+     sample_id: str
+     text: str
+     metadata: dict[str, Any] = field(default_factory=dict)
+     importance_score: float = 0.0
+
+     def __hash__(self) -> int:
+         return hash(self.sample_id)
+
+     def __eq__(self, other: object) -> bool:
+         if isinstance(other, SIASSample):
+             return self.sample_id == other.sample_id
+         return False
+
+
+ @runtime_checkable
+ class SampleProtocol(Protocol):
+     """
+     Protocol for samples that can be used with SIAS algorithms.
+
+     Any class with these attributes can be used with CoresetSelector
+     and OnlineContinualLearner without modification.
+     """
+
+     @property
+     def sample_id(self) -> str:
+         """Unique identifier for the sample."""
+         ...
+
+     @property
+     def text(self) -> str:
+         """Text content of the sample."""
+         ...
+
+     @property
+     def metadata(self) -> dict[str, Any]:
+         """Metadata dictionary."""
+         ...
+
+
+ # Backward compatibility alias
+ # This allows existing code using ProcessedDialog to work with SIAS
+ # by implementing the SampleProtocol
+ Sample = SIASSample
+
+
+ def wrap_sample(
+     sample_id: str,
+     text: str,
+     metadata: dict[str, Any] | None = None,
+     **kwargs: Any,
+ ) -> SIASSample:
+     """
+     Factory function to create a SIASSample.
+
+     Args:
+         sample_id: Unique identifier
+         text: Text content
+         metadata: Optional metadata dict
+         **kwargs: Additional metadata fields
+
+     Returns:
+         A new SIASSample instance
+     """
+     meta = metadata or {}
+     meta.update(kwargs)
+     return SIASSample(sample_id=sample_id, text=text, metadata=meta)
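
A brief sketch of how the factory and protocol are meant to fit together, assuming only what the file above defines; the sample values are invented and the import path follows the file list.

# Illustrative sketch; not shipped with the package.
from sage.middleware.components.sage_sias.types import SIASSample, SampleProtocol, wrap_sample

s = wrap_sample("dlg-001", "user asked about replay buffers", source="chat", loss=0.42)
assert isinstance(s, SIASSample)
assert isinstance(s, SampleProtocol)   # runtime_checkable structural check
assert s.metadata == {"source": "chat", "loss": 0.42}

# Equality and hashing are keyed on sample_id only, so set-based deduplication works.
assert {s, SIASSample("dlg-001", "different text")} == {s}

Because SampleProtocol is structural, any dict-free object exposing sample_id, text, and metadata (for example an existing dialog record) can be passed to CoresetSelector and OnlineContinualLearner without wrapping.
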
sage/middleware/components/sage_tsdb/__init__.py
@@ -0,0 +1,81 @@
+ """
+ SAGE-TSDB: Time Series Database Component for SAGE
+
+ Provides efficient time series data storage, querying, and processing capabilities
+ for streaming and historical data analysis.
+
+ Note: SAGE TSDB core is now an independent PyPI package (isage-tsdb).
+ This module provides backward-compatible wrappers and SAGE-specific services.
+ """
+
+ import warnings
+
+ # Import from PyPI package (isage-tsdb)
+ _SAGE_TSDB_AVAILABLE = False
+ try:
+     from sage_tsdb import (
+         QueryConfig,
+         TimeRange,
+         TimeSeriesData,
+         TimeSeriesDB,
+         TimeSeriesIndex,
+     )
+
+     # Backward compatibility alias
+     SageTSDB = TimeSeriesDB
+     _SAGE_TSDB_AVAILABLE = True
+ except ImportError as e:
+     # Don't fail immediately - allow graceful degradation
+     warnings.warn(
+         f"SAGE TSDB not available: {e}\n"
+         "Install with: pip install isage-tsdb\n"
+         "Time series features will be unavailable.",
+         UserWarning,
+         stacklevel=2,
+     )
+     # Provide stub exports
+     SageTSDB = None
+     TimeSeriesDB = None
+     TimeSeriesData = None
+     QueryConfig = None
+     TimeRange = None
+     TimeSeriesIndex = None
+
+ # Algorithms (SAGE-specific extensions)
+ # Only import if base package is available
+ if _SAGE_TSDB_AVAILABLE:
+     from .python.algorithms import (
+         OutOfOrderStreamJoin,
+         TimeSeriesAlgorithm,
+         WindowAggregator,
+     )
+
+     # Micro-service wrapper (SAGE-specific)
+     from .python.micro_service.sage_tsdb_service import (
+         SageTSDBService,
+         SageTSDBServiceConfig,
+     )
+ else:
+     # Stub classes if TSDB not available
+     TimeSeriesAlgorithm = None
+     OutOfOrderStreamJoin = None
+     WindowAggregator = None
+     SageTSDBService = None
+     SageTSDBServiceConfig = None
+
+ __all__ = [
+     # Core API (may be None if not installed)
+     "SageTSDB",
+     "TimeSeriesData",
+     "QueryConfig",
+     "TimeRange",
+     # Service
+     "SageTSDBService",
+     "SageTSDBServiceConfig",
+     # Algorithms
+     "TimeSeriesAlgorithm",
+     "OutOfOrderStreamJoin",
+     "WindowAggregator",
+     # Availability flag
+     "_SAGE_TSDB_AVAILABLE",
+ ]
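
The graceful-degradation pattern above implies that callers should branch on the availability flag rather than rely on import-time failure. A minimal sketch, using only the names this __init__ exports:

# Sketch only; nothing here beyond the exports shown in the file above.
from sage.middleware.components import sage_tsdb

if sage_tsdb._SAGE_TSDB_AVAILABLE:
    # SageTSDB is the backward-compatible alias for TimeSeriesDB from isage-tsdb.
    print("time series support via", sage_tsdb.SageTSDB)
else:
    # Every export is stubbed to None when isage-tsdb is not installed.
    assert sage_tsdb.SageTSDB is None
    print("isage-tsdb not installed; time series features disabled")
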
sage/middleware/components/sage_tsdb/python/__init__.py
@@ -0,0 +1,21 @@
+ """
+ Python package for SageTSDB
+
+ This package provides both high-performance C++ bindings and pure Python implementations
+ for time series database operations.
+ """
+
+ try:
+     # Try to import C++ bindings first
+     from . import _sage_tsdb
+
+     TSDB_BACKEND = "cpp"
+ except ImportError:
+     # Fallback to pure Python implementation
+     _sage_tsdb = None
+     TSDB_BACKEND = "python"
+
+ # Import Python APIs (these wrap C++ or pure Python implementations)
+ from . import algorithms, sage_tsdb
+
+ __all__ = ["sage_tsdb", "algorithms", "_sage_tsdb", "TSDB_BACKEND"]
sage/middleware/components/sage_tsdb/python/_sage_tsdb.pyi
@@ -0,0 +1,17 @@
+ """
+ Type stub for SAGE TSDB C++ extension module.
+
+ This is a compiled C++ extension module created via pybind11.
+ The actual implementation is in C++; this file provides type hints for Python.
+ """
+
+ # Basic type hints for the C++ extension
+ # Add specific function/class signatures as needed when you know the API
+
+ class SageTSDB:
+     """SAGE TSDB C++ extension interface"""
+
+     def __init__(self) -> None: ...
+     # Add more methods as needed
+
+ # Add other exported symbols from the C++ module as needed