isage-middleware 0.2.4.3__cp311-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isage_middleware-0.2.4.3.dist-info/METADATA +266 -0
- isage_middleware-0.2.4.3.dist-info/RECORD +94 -0
- isage_middleware-0.2.4.3.dist-info/WHEEL +5 -0
- isage_middleware-0.2.4.3.dist-info/top_level.txt +1 -0
- sage/middleware/__init__.py +59 -0
- sage/middleware/_version.py +6 -0
- sage/middleware/components/__init__.py +30 -0
- sage/middleware/components/extensions_compat.py +141 -0
- sage/middleware/components/sage_db/__init__.py +116 -0
- sage/middleware/components/sage_db/backend.py +136 -0
- sage/middleware/components/sage_db/service.py +15 -0
- sage/middleware/components/sage_flow/__init__.py +76 -0
- sage/middleware/components/sage_flow/python/__init__.py +14 -0
- sage/middleware/components/sage_flow/python/micro_service/__init__.py +4 -0
- sage/middleware/components/sage_flow/python/micro_service/sage_flow_service.py +88 -0
- sage/middleware/components/sage_flow/python/sage_flow.py +30 -0
- sage/middleware/components/sage_flow/service.py +14 -0
- sage/middleware/components/sage_mem/__init__.py +83 -0
- sage/middleware/components/sage_sias/__init__.py +59 -0
- sage/middleware/components/sage_sias/continual_learner.py +184 -0
- sage/middleware/components/sage_sias/coreset_selector.py +302 -0
- sage/middleware/components/sage_sias/types.py +94 -0
- sage/middleware/components/sage_tsdb/__init__.py +81 -0
- sage/middleware/components/sage_tsdb/python/__init__.py +21 -0
- sage/middleware/components/sage_tsdb/python/_sage_tsdb.pyi +17 -0
- sage/middleware/components/sage_tsdb/python/algorithms/__init__.py +17 -0
- sage/middleware/components/sage_tsdb/python/algorithms/base.py +51 -0
- sage/middleware/components/sage_tsdb/python/algorithms/out_of_order_join.py +248 -0
- sage/middleware/components/sage_tsdb/python/algorithms/window_aggregator.py +296 -0
- sage/middleware/components/sage_tsdb/python/micro_service/__init__.py +7 -0
- sage/middleware/components/sage_tsdb/python/micro_service/sage_tsdb_service.py +365 -0
- sage/middleware/components/sage_tsdb/python/sage_tsdb.py +523 -0
- sage/middleware/components/sage_tsdb/service.py +17 -0
- sage/middleware/components/vector_stores/__init__.py +25 -0
- sage/middleware/components/vector_stores/chroma.py +483 -0
- sage/middleware/components/vector_stores/chroma_adapter.py +185 -0
- sage/middleware/components/vector_stores/milvus.py +677 -0
- sage/middleware/operators/__init__.py +56 -0
- sage/middleware/operators/agent/__init__.py +24 -0
- sage/middleware/operators/agent/planning/__init__.py +5 -0
- sage/middleware/operators/agent/planning/llm_adapter.py +41 -0
- sage/middleware/operators/agent/planning/planner_adapter.py +98 -0
- sage/middleware/operators/agent/planning/router.py +107 -0
- sage/middleware/operators/agent/runtime.py +296 -0
- sage/middleware/operators/agentic/__init__.py +41 -0
- sage/middleware/operators/agentic/config.py +254 -0
- sage/middleware/operators/agentic/planning_operator.py +125 -0
- sage/middleware/operators/agentic/refined_searcher.py +132 -0
- sage/middleware/operators/agentic/runtime.py +241 -0
- sage/middleware/operators/agentic/timing_operator.py +125 -0
- sage/middleware/operators/agentic/tool_selection_operator.py +127 -0
- sage/middleware/operators/context/__init__.py +17 -0
- sage/middleware/operators/context/critic_evaluation.py +16 -0
- sage/middleware/operators/context/model_context.py +565 -0
- sage/middleware/operators/context/quality_label.py +12 -0
- sage/middleware/operators/context/search_query_results.py +61 -0
- sage/middleware/operators/context/search_result.py +42 -0
- sage/middleware/operators/context/search_session.py +79 -0
- sage/middleware/operators/filters/__init__.py +26 -0
- sage/middleware/operators/filters/context_sink.py +387 -0
- sage/middleware/operators/filters/context_source.py +376 -0
- sage/middleware/operators/filters/evaluate_filter.py +83 -0
- sage/middleware/operators/filters/tool_filter.py +74 -0
- sage/middleware/operators/llm/__init__.py +18 -0
- sage/middleware/operators/llm/sagellm_generator.py +432 -0
- sage/middleware/operators/rag/__init__.py +147 -0
- sage/middleware/operators/rag/arxiv.py +331 -0
- sage/middleware/operators/rag/chunk.py +13 -0
- sage/middleware/operators/rag/document_loaders.py +23 -0
- sage/middleware/operators/rag/evaluate.py +658 -0
- sage/middleware/operators/rag/generator.py +340 -0
- sage/middleware/operators/rag/index_builder/__init__.py +48 -0
- sage/middleware/operators/rag/index_builder/builder.py +363 -0
- sage/middleware/operators/rag/index_builder/manifest.py +101 -0
- sage/middleware/operators/rag/index_builder/storage.py +131 -0
- sage/middleware/operators/rag/pipeline.py +46 -0
- sage/middleware/operators/rag/profiler.py +59 -0
- sage/middleware/operators/rag/promptor.py +400 -0
- sage/middleware/operators/rag/refiner.py +231 -0
- sage/middleware/operators/rag/reranker.py +364 -0
- sage/middleware/operators/rag/retriever.py +1308 -0
- sage/middleware/operators/rag/searcher.py +37 -0
- sage/middleware/operators/rag/types.py +28 -0
- sage/middleware/operators/rag/writer.py +80 -0
- sage/middleware/operators/tools/__init__.py +71 -0
- sage/middleware/operators/tools/arxiv_paper_searcher.py +175 -0
- sage/middleware/operators/tools/arxiv_searcher.py +102 -0
- sage/middleware/operators/tools/duckduckgo_searcher.py +105 -0
- sage/middleware/operators/tools/image_captioner.py +104 -0
- sage/middleware/operators/tools/nature_news_fetcher.py +224 -0
- sage/middleware/operators/tools/searcher_tool.py +514 -0
- sage/middleware/operators/tools/text_detector.py +185 -0
- sage/middleware/operators/tools/url_text_extractor.py +104 -0
- sage/middleware/py.typed +2 -0

sage/middleware/operators/rag/index_builder/builder.py
@@ -0,0 +1,363 @@
"""Index Builder - Service for building RAG vector indices

Layer: L4 (sage-middleware/operators/rag)
"""

import logging
from collections.abc import Callable
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
from typing import Any

from sage.middleware.operators.rag.index_builder.manifest import IndexManifest
from sage.middleware.operators.rag.index_builder.storage import VectorStore

logger = logging.getLogger(__name__)


@contextmanager
def _optional_progress(show: bool, description: str, total: int | None = None):
    """Context manager for optional Rich progress bar.

    Args:
        show: Whether to show progress bar (False = silent mode)
        description: Task description
        total: Total number of items (None for indeterminate)

    Yields:
        Progress task update function: update(advance=1)
    """
    if not show:
        # Silent mode - yield a no-op update function
        def noop(**kwargs):
            pass

        yield noop
        return

    try:
        from rich.progress import (
            BarColumn,
            Progress,
            SpinnerColumn,
            TaskProgressColumn,
            TextColumn,
            TimeRemainingColumn,
        )

        with Progress(
            SpinnerColumn(),
            TextColumn("[cyan]{task.description}"),
            BarColumn(bar_width=30),
            TaskProgressColumn(),
            TimeRemainingColumn(),
            transient=True,  # Clear after completion
        ) as progress:
            task = progress.add_task(description, total=total)

            def update(advance: int = 1, **kwargs):
                progress.update(task, advance=advance, **kwargs)

            yield update

    except ImportError:
        # Fallback if rich is not available
        logger.info(f"[Progress] {description}")

        def fallback_update(**kwargs):
            pass

        yield fallback_update


class IndexBuilder:
    """Service for building RAG vector indices with pluggable backends.

    This class orchestrates the complete index building workflow, using
    dependency injection to decouple from specific vector storage backends.

    Architecture Pattern:
        - L4 defines this builder (orchestration logic)
        - L4 provides SageDB backend (sage.middleware.components.sage_db)
        - L3 provides ChromaDB backend (sage.libs.integrations.chroma)
        - L5 uses IndexBuilder with injected backend factory

    Args:
        backend_factory: Function creating VectorStore instances
            Signature: (persist_path: Path, dim: int) -> VectorStore

    Example:
        >>> # In sage-cli (L5)
        >>> from sage.middleware.operators.rag.index_builder import IndexBuilder
        >>> from sage.middleware.components.sage_db import SageVDBBackend
        >>>
        >>> def factory(path: Path, dim: int):
        ...     return SageVDBBackend(path, dim)
        >>>
        >>> builder = IndexBuilder(backend_factory=factory)
        >>> manifest = builder.build_from_docs(
        ...     source_dir=Path("docs"),
        ...     persist_path=Path(".sage/db"),
        ...     embedding_model=embedder,
        ...     chunk_size=800,
        ...     chunk_overlap=160,
        ... )
    """

    def __init__(self, backend_factory: Callable[[Path, int], VectorStore]):
        """Initialize builder with backend factory.

        Args:
            backend_factory: Factory function for creating VectorStore instances
        """
        self.backend_factory = backend_factory

    def build_from_docs(
        self,
        source_dir: Path,
        persist_path: Path,
        embedding_model: Any,
        index_name: str = "default",
        chunk_size: int = 800,
        chunk_overlap: int = 160,
        document_processor: Callable[[Path], list[dict[str, Any]]] | None = None,
        max_documents: int | None = None,
        show_progress: bool = True,
    ) -> IndexManifest:
        """Build vector index from document directory.

        This method orchestrates the complete index building process:
        1. Create vector store backend
        2. Process documents (via document_processor or default)
        3. Chunk text content
        4. Generate embeddings
        5. Store vectors with metadata
        6. Build/optimize index
        7. Persist to disk
        8. Return manifest

        Args:
            source_dir: Directory containing source documents
            persist_path: Path to save the built index
            embedding_model: Model with embed() and get_dim() methods
            index_name: Unique identifier for this index
            chunk_size: Size of text chunks in characters
            chunk_overlap: Overlap between consecutive chunks
            document_processor: Optional custom document processing function
                If None, uses simple text extraction
                Signature: (source_dir: Path) -> list[dict] where dict has:
                - "content": str (text content)
                - "metadata": dict (doc_path, title, heading, etc.)
            max_documents: Optional limit on number of documents to process
            show_progress: Show Rich progress bar (False = quiet mode)

        Returns:
            IndexManifest with build statistics and metadata

        Raises:
            FileNotFoundError: If source_dir doesn't exist
            RuntimeError: If index building fails

        Example:
            >>> # Custom document processor for Markdown
            >>> def process_markdown(source_dir: Path):
            ...     chunks = []
            ...     for file in source_dir.glob("**/*.md"):
            ...         text = file.read_text()
            ...         chunks.append({
            ...             "content": text,
            ...             "metadata": {"doc_path": str(file.relative_to(source_dir))}
            ...         })
            ...     return chunks
            >>>
            >>> manifest = builder.build_from_docs(
            ...     source_dir=Path("docs"),
            ...     persist_path=Path(".sage/db"),
            ...     embedding_model=embedder,
            ...     document_processor=process_markdown,
            ... )
        """
        if not source_dir.exists():
            raise FileNotFoundError(f"Source directory not found: {source_dir}")

        logger.debug(f"Building index from {source_dir}")
        logger.debug(f"Backend: {self.backend_factory}")
        logger.debug(f"Chunk size: {chunk_size}, overlap: {chunk_overlap}")

        # Create vector store backend
        dim = embedding_model.get_dim()
        store = self.backend_factory(persist_path, dim)
        logger.debug(f"Created vector store with dimension {dim}")

        # Process documents
        if document_processor is None:
            # Default: simple text file processing
            logger.debug(
                "No document_processor provided, using default text extraction. "
                "For better results, provide a custom processor."
            )
            processed_docs = self._default_document_processor(source_dir, max_documents)
        else:
            processed_docs = document_processor(source_dir)
            if max_documents:
                processed_docs = processed_docs[:max_documents]

        logger.debug(f"Processed {len(processed_docs)} document sections")

        # Import chunking utility
        try:
            from sage.common.utils.document_processing import (
                chunk_text,
                sanitize_metadata_value,
                truncate_text,
            )
        except ImportError:
            logger.debug("Cannot import chunking utilities from sage.common, using simple split")

            def chunk_text(text: str, size: int, overlap: int) -> list[str]:
                # Fallback: simple fixed-size chunking
                chunks = []
                start = 0
                while start < len(text):
                    end = min(len(text), start + size)
                    chunks.append(text[start:end])
                    start += size - overlap
                return chunks

            def sanitize_metadata_value(val: str) -> str:
                # Remove problematic chars for JSON/C++ parser
                return (
                    val.replace("\\", "")
                    .replace("\n", " ")
                    .replace('"', "'")
                    .replace("{", "(")
                    .replace("}", ")")
                )

            def truncate_text(text: str, limit: int = 480) -> str:
                return text[:limit] if len(text) > limit else text

        # Embed and store (with chunking)
        # First pass: count total chunks for accurate progress
        all_chunks_data = []  # List of (chunk_text, base_metadata)
        unique_docs = set()

        for doc in processed_docs:
            content = doc["content"]
            base_metadata = doc["metadata"]

            # Track unique documents
            if "doc_path" in base_metadata:
                unique_docs.add(base_metadata["doc_path"])

            # Chunk the content
            content_chunks = chunk_text(content, chunk_size, chunk_overlap)

            for chunk_idx, chunk in enumerate(content_chunks):
                all_chunks_data.append((chunk, base_metadata, chunk_idx))

        total_chunks = len(all_chunks_data)
        logger.debug(f"Total chunks to embed: {total_chunks}")

        # Second pass: embed with accurate progress
        with _optional_progress(show_progress, "Embedding", total=total_chunks) as progress_update:
            for idx, (chunk, base_metadata, chunk_idx) in enumerate(all_chunks_data, start=1):
                # Generate embedding
                vector = embedding_model.embed(chunk)

                # Create metadata for this chunk
                metadata = {
                    **base_metadata,
                    "chunk": str(chunk_idx),
                    "text": sanitize_metadata_value(truncate_text(chunk, limit=1200)),
                }

                # Sanitize all string values
                metadata = {
                    k: sanitize_metadata_value(str(v)) if isinstance(v, str) else str(v)
                    for k, v in metadata.items()
                }

                # Store vector with metadata
                store.add(vector, metadata)
                progress_update(advance=1)

                if idx % 500 == 0:
                    logger.debug(f"Embedded {idx}/{total_chunks} chunks")

        logger.debug(f"Added {total_chunks} vectors from {len(unique_docs)} documents")

        # Build index
        logger.debug("Building vector index...")
        store.build_index()

        # Persist to disk
        logger.debug(f"Saving index to {persist_path}")
        store.save(str(persist_path))

        # Create manifest
        manifest = IndexManifest(
            index_name=index_name,
            backend_type=type(store).__name__,
            persist_path=persist_path,
            source_dir=str(source_dir),
            embedding_config={
                "model": type(embedding_model).__name__,
                "dim": dim,
            },
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            num_documents=len(unique_docs),
            num_chunks=total_chunks,
            created_at=datetime.utcnow().isoformat(),
        )

        logger.debug(f"Index built successfully: {manifest}")
        return manifest

    def _default_document_processor(
        self,
        source_dir: Path,
        max_documents: int | None = None,
    ) -> list[dict[str, Any]]:
        """Default document processor for plain text files.

        This is a fallback processor that simply reads text files.
        For production use, provide a custom processor that:
        - Handles specific formats (Markdown, PDF, etc.)
        - Implements smart chunking
        - Preserves document structure

        Args:
            source_dir: Directory to scan
            max_documents: Optional limit

        Returns:
            List of processed chunks with metadata
        """
        chunks = []
        text_files = list(source_dir.glob("**/*.txt")) + list(source_dir.glob("**/*.md"))

        if max_documents:
            text_files = text_files[:max_documents]

        for file_path in text_files:
            try:
                content = file_path.read_text(encoding="utf-8", errors="ignore")
                rel_path = file_path.relative_to(source_dir)

                chunks.append(
                    {
                        "content": content,
                        "metadata": {
                            "doc_path": str(rel_path),
                            "title": file_path.stem,
                            "text": content[:1000],  # Preview
                        },
                    }
                )
            except Exception as e:
                logger.warning(f"Failed to process {file_path}: {e}")

        return chunks
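
To make the workflow above concrete, here is a minimal sketch of driving IndexBuilder end to end. HashEmbedder and InMemoryStore are hypothetical stand-ins written for this example (they are not shipped in the wheel); they satisfy only the surfaces build_from_docs actually touches: embed()/get_dim() on the model, and add/build_index/save on the store.

```python
# Sketch only. HashEmbedder and InMemoryStore are hypothetical stand-ins,
# not part of this wheel; the import path follows the file list above.
import hashlib
from pathlib import Path

from sage.middleware.operators.rag.index_builder import IndexBuilder


class HashEmbedder:
    """Toy deterministic embedder: hashes text into a small float vector."""

    def get_dim(self) -> int:
        return 8

    def embed(self, text: str) -> list[float]:
        digest = hashlib.sha256(text.encode("utf-8")).digest()
        return [b / 255.0 for b in digest[: self.get_dim()]]


class InMemoryStore:
    """Minimal backend exposing just what build_from_docs calls."""

    def __init__(self, persist_path: Path, dim: int):
        self.dim = dim
        self.rows: list[tuple[list[float], dict]] = []

    def add(self, vector: list[float], metadata: dict) -> None:
        self.rows.append((vector, metadata))

    def build_index(self) -> None:
        pass  # a flat list needs no index structure

    def save(self, path: str) -> None:
        pass  # a real backend would persist to disk here


builder = IndexBuilder(backend_factory=lambda path, dim: InMemoryStore(path, dim))
manifest = builder.build_from_docs(
    source_dir=Path("docs"),        # any directory containing .txt / .md files
    persist_path=Path(".sage/db"),
    embedding_model=HashEmbedder(),
    show_progress=False,            # quiet mode, no rich dependency required
)
print(manifest)  # IndexManifest(name='default', backend='InMemoryStore', ...)
```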
sage/middleware/operators/rag/index_builder/manifest.py
@@ -0,0 +1,101 @@
"""Index Manifest - Metadata describing a built RAG index

Layer: L4 (sage-middleware/operators/rag)
"""

from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any


@dataclass
class IndexManifest:
    """Metadata describing a built knowledge index.

    This dataclass stores comprehensive metadata about a vector index,
    including source information, embedding configuration, and statistics.

    Attributes:
        index_name: Unique identifier for this index
        backend_type: Storage backend ("sagedb", "chromadb", "milvus", etc.)
        persist_path: Path where the index is stored
        source_dir: Original document directory
        embedding_config: Embedding model configuration
        chunk_size: Size of text chunks in characters
        chunk_overlap: Overlap between chunks in characters
        num_documents: Total number of documents indexed
        num_chunks: Total number of text chunks (vectors) stored
        created_at: ISO timestamp of index creation
        metadata: Additional custom metadata

    Example:
        >>> manifest = IndexManifest(
        ...     index_name="docs-public",
        ...     backend_type="chromadb",
        ...     persist_path=Path(".sage/vector_db"),
        ...     source_dir="docs-public/docs_src",
        ...     embedding_config={"method": "hash", "dim": 384},
        ...     chunk_size=800,
        ...     chunk_overlap=160,
        ...     num_documents=124,
        ...     num_chunks=2720,
        ...     created_at=datetime.utcnow().isoformat(),
        ... )
    """

    index_name: str
    backend_type: str
    persist_path: Path
    source_dir: str
    embedding_config: dict[str, Any]
    chunk_size: int
    chunk_overlap: int
    num_documents: int
    num_chunks: int
    created_at: str
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert manifest to dictionary for serialization."""
        return {
            "index_name": self.index_name,
            "backend_type": self.backend_type,
            "persist_path": str(self.persist_path),
            "source_dir": self.source_dir,
            "embedding_config": self.embedding_config,
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
            "num_documents": self.num_documents,
            "num_chunks": self.num_chunks,
            "created_at": self.created_at,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "IndexManifest":
        """Create manifest from dictionary."""
        data_copy = data.copy()
        data_copy["persist_path"] = Path(data_copy["persist_path"])
        return cls(**data_copy)

    @property
    def age_seconds(self) -> float:
        """Get age of index in seconds since creation."""
        created = datetime.fromisoformat(self.created_at)
        return (datetime.utcnow() - created).total_seconds()

    @property
    def is_empty(self) -> bool:
        """Check if index contains any data."""
        return self.num_chunks == 0

    def __repr__(self) -> str:
        """Readable representation of manifest."""
        return (
            f"IndexManifest("
            f"name={self.index_name!r}, "
            f"backend={self.backend_type!r}, "
            f"docs={self.num_documents}, "
            f"chunks={self.num_chunks})"
        )
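
Because to_dict stringifies persist_path and from_dict re-wraps it in a Path, the manifest round-trips cleanly through JSON. A short sketch; the field values are illustrative, taken from the docstring example above:

```python
import json
from datetime import datetime
from pathlib import Path

from sage.middleware.operators.rag.index_builder.manifest import IndexManifest

manifest = IndexManifest(
    index_name="docs-public",
    backend_type="chromadb",
    persist_path=Path(".sage/vector_db"),
    source_dir="docs-public/docs_src",
    embedding_config={"method": "hash", "dim": 384},
    chunk_size=800,
    chunk_overlap=160,
    num_documents=124,
    num_chunks=2720,
    created_at=datetime.utcnow().isoformat(),
)

# Serialize: persist_path becomes a plain string, so json.dumps succeeds.
payload = json.dumps(manifest.to_dict())

# Restore: from_dict re-wraps persist_path in a Path, so equality holds.
restored = IndexManifest.from_dict(json.loads(payload))
assert restored == manifest
assert not restored.is_empty and restored.age_seconds >= 0.0
```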
sage/middleware/operators/rag/index_builder/storage.py
@@ -0,0 +1,131 @@
"""Vector Store Protocol - Abstract interface for vector storage backends

Layer: L4 (sage-middleware/operators/rag)
"""

from typing import Any, Protocol, runtime_checkable


@runtime_checkable
class VectorStore(Protocol):
    """Abstract interface for vector storage backends.

    This Protocol defines the contract that all vector storage implementations
    must satisfy. It enables dependency injection and backend swapping without
    tight coupling to specific implementations (SageDB, ChromaDB, Milvus, etc.).

    Architecture Pattern:
        - L4 (sage-middleware): Defines this Protocol + SageDB implementation
        - L3 (sage-libs/integrations): Provides ChromaDB implementation
        - L5 (sage-cli): Uses via factory injection

    Example Implementation:
        >>> class SageVDBBackend:
        ...     def __init__(self, persist_path: Path, dim: int):
        ...         from sage.middleware.components.sage_db import SageDB
        ...         self.db = SageDB(dim)
        ...         self.path = persist_path
        ...
        ...     def add(self, vector: list[float], metadata: dict) -> None:
        ...         self.db.add(vector, metadata)
        ...
        ...     # ... implement other methods

    Usage with IndexBuilder:
        >>> def backend_factory(path: Path, dim: int) -> VectorStore:
        ...     return SageDBBackend(path, dim)
        >>>
        >>> builder = IndexBuilder(backend_factory=backend_factory)
    """

    def add(self, vector: list[float], metadata: dict[str, Any]) -> None:
        """Add a single vector with metadata to the store.

        Args:
            vector: Dense vector embedding (must match dimension)
            metadata: Associated metadata (doc_path, title, heading, chunk, text, etc.)

        Raises:
            ValueError: If vector dimension doesn't match
        """
        ...

    def build_index(self) -> None:
        """Build/optimize the vector index for efficient search.

        This is typically called after all vectors are added via `add()`.
        Implementations may use various indexing strategies:
        - Flat index (brute force)
        - HNSW (Hierarchical Navigable Small World)
        - IVF (Inverted File Index)
        - Product Quantization

        Raises:
            RuntimeError: If index building fails
        """
        ...

    def save(self, path: str) -> None:
        """Persist the index to disk.

        Args:
            path: Absolute path to save location

        Raises:
            IOError: If save fails
        """
        ...

    def load(self, path: str) -> None:
        """Load a previously saved index from disk.

        Args:
            path: Absolute path to load from

        Raises:
            FileNotFoundError: If index doesn't exist
            IOError: If load fails
        """
        ...

    def search(
        self,
        query_vector: list[float],
        top_k: int = 5,
        filter_metadata: dict[str, Any] | None = None,
    ) -> list[dict[str, Any]]:
        """Search for nearest neighbor vectors.

        Args:
            query_vector: Query embedding
            top_k: Number of results to return
            filter_metadata: Optional metadata filters (e.g., {"doc_path": "intro.md"})

        Returns:
            List of results, each containing:
            - vector: The matched vector
            - metadata: Associated metadata
            - distance/score: Similarity score

        Example:
            >>> results = store.search([0.1, 0.2, ...], top_k=5)
            >>> for result in results:
            ...     print(result["metadata"]["title"], result["score"])
        """
        ...

    def get_dim(self) -> int:
        """Get the vector dimension of this store.

        Returns:
            Vector dimension (e.g., 384 for BGE-small, 768 for BERT)
        """
        ...

    def count(self) -> int:
        """Get total number of vectors in the store.

        Returns:
            Total vector count
        """
        ...
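
Since the Protocol is decorated with @runtime_checkable, an isinstance check structurally verifies that a backend exposes every required method, with no inheritance involved. A sketch with a hypothetical flat (brute-force cosine) backend written for this example:

```python
import math
from typing import Any

from sage.middleware.operators.rag.index_builder.storage import VectorStore


class FlatBackend:
    """Hypothetical brute-force cosine backend, for illustration only."""

    def __init__(self, dim: int):
        self.dim = dim
        self.rows: list[tuple[list[float], dict[str, Any]]] = []

    def add(self, vector: list[float], metadata: dict[str, Any]) -> None:
        if len(vector) != self.dim:
            raise ValueError(f"expected dim {self.dim}, got {len(vector)}")
        self.rows.append((vector, metadata))

    def build_index(self) -> None:
        pass  # flat search scans every row, so there is nothing to build

    def save(self, path: str) -> None:
        pass  # illustration only

    def load(self, path: str) -> None:
        pass  # illustration only

    def search(self, query_vector, top_k=5, filter_metadata=None):
        def cosine(a, b):
            dot = sum(x * y for x, y in zip(a, b))
            norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
            return dot / norm if norm else 0.0

        hits = [
            {"vector": v, "metadata": m, "score": cosine(query_vector, v)}
            for v, m in self.rows
            if not filter_metadata
            or all(m.get(k) == val for k, val in filter_metadata.items())
        ]
        return sorted(hits, key=lambda r: r["score"], reverse=True)[:top_k]

    def get_dim(self) -> int:
        return self.dim

    def count(self) -> int:
        return len(self.rows)


store = FlatBackend(dim=4)
assert isinstance(store, VectorStore)  # structural check; no inheritance needed
store.add([1.0, 0.0, 0.0, 0.0], {"doc_path": "intro.md", "title": "Intro"})
print(store.search([1.0, 0.0, 0.0, 0.0], top_k=1)[0]["metadata"]["title"])  # Intro
```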
sage/middleware/operators/rag/pipeline.py
@@ -0,0 +1,46 @@
"""
RAG Pipeline - Core pipeline component of the RAG system

Layer: L4 (Middleware - Orchestration)
This module orchestrates multiple RAG components (retriever, reranker, refiner, generator)
into a cohesive pipeline. Pipeline/orchestration belongs in middleware, not libs.
"""

from typing import Any


class RAGPipeline:
    """Main RAG pipeline class - orchestrates multiple RAG components."""

    def __init__(self, retriever=None, generator=None, reranker=None, refiner=None):
        self.retriever = retriever
        self.generator = generator
        self.reranker = reranker
        self.refiner = refiner

    def run(self, query: str, **kwargs) -> dict[str, Any]:
        """Run the RAG pipeline."""
        # 1. Retrieve relevant documents
        if self.retriever:
            documents = self.retriever.retrieve(query, **kwargs)
        else:
            documents = []

        # 2. Rerank (optional)
        if self.reranker and documents:
            documents = self.reranker.rerank(query, documents, **kwargs)

        # 3. Refine the query or documents (optional)
        if self.refiner:
            query, documents = self.refiner.refine(query, documents, **kwargs)

        # 4. Generate the answer
        if self.generator:
            response = self.generator.generate(query, documents, **kwargs)
        else:
            response = "No generator configured"

        return {"query": query, "documents": documents, "response": response}


__all__ = ["RAGPipeline"]
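
Every stage except the final return is optional, so the pipeline degrades gracefully when components are missing. A minimal sketch with hypothetical stubs; only the method names that run() calls (retrieve, rerank, generate) are assumed:

```python
# Sketch with hypothetical stub components, written for this example.
from sage.middleware.operators.rag.pipeline import RAGPipeline


class StubRetriever:
    def retrieve(self, query, **kwargs):
        return [{"text": "SAGE is a streaming analytics engine."}]


class StubReranker:
    def rerank(self, query, documents, **kwargs):
        return documents  # identity rerank, enough for the sketch


class StubGenerator:
    def generate(self, query, documents, **kwargs):
        context = " ".join(doc["text"] for doc in documents)
        return f"Answer to {query!r} based on: {context}"


pipeline = RAGPipeline(
    retriever=StubRetriever(),
    reranker=StubReranker(),
    generator=StubGenerator(),
)
print(pipeline.run("What is SAGE?")["response"])

# With nothing configured, run() still returns a well-formed result dict.
assert RAGPipeline().run("q")["response"] == "No generator configured"
```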
sage/middleware/operators/rag/profiler.py
@@ -0,0 +1,59 @@
import json
from dataclasses import dataclass

from sage.common.core import FilterFunction


@dataclass
class QueryProfilerResult:
    need_joint_reasoning: bool
    complexity: str  # "High" or "Low"
    need_summarization: bool
    summarization_length: int  # 30-200
    n_info_items: int  # 1-6

    def __post_init__(self):
        # Strict validation; raise on invalid values
        if self.complexity not in ["High", "Low"]:
            raise ValueError(f"complexity must be 'High' or 'Low', got: {self.complexity}")

        if not (30 <= self.summarization_length <= 200):
            raise ValueError(
                f"summarization_length must be between 30 and 200, got: {self.summarization_length}"
            )

        if not (1 <= self.n_info_items <= 6):
            raise ValueError(f"n_info_items must be between 1 and 6, got: {self.n_info_items}")


class Query_Profiler(FilterFunction):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

    def execute(self, data):
        js = json.loads(data)
        # Create the result object via unpacking and read its attributes directly
        profiler_result = QueryProfilerResult(
            need_joint_reasoning=js.get("need_joint_reasoning", False),
            complexity=js.get("complexity", "Low"),
            need_summarization=js.get("need_summarization", False),
            summarization_length=js.get("summarization_length", 30),
            n_info_items=js.get("n_info_items", 1),
        )

        # Unpack directly into variables
        need_joint_reasoning = profiler_result.need_joint_reasoning
        complexity = profiler_result.complexity
        summarization_length = profiler_result.summarization_length

        if need_joint_reasoning is False:
            synthesis_method = "map_rerank"
        else:
            if complexity == "Low":
                synthesis_method = "stuff"
            else:
                synthesis_method = "map_reduce"
        num_chunks = [profiler_result.n_info_items, 3 * profiler_result.n_info_items]
        intermediate_length_range = summarization_length

        return [synthesis_method, num_chunks, intermediate_length_range]
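
The branching in execute reduces to a small decision table: no joint reasoning -> "map_rerank"; joint reasoning with Low complexity -> "stuff"; otherwise "map_reduce". A sketch of feeding it profiler JSON; note that constructing Query_Profiler(config=None) assumes FilterFunction needs no required constructor arguments, which this diff does not confirm:

```python
import json

from sage.middleware.operators.rag.profiler import Query_Profiler

# Assumption: FilterFunction tolerates a bare construction like this.
profiler = Query_Profiler(config=None)

payload = json.dumps(
    {
        "need_joint_reasoning": True,
        "complexity": "High",
        "need_summarization": True,
        "summarization_length": 120,
        "n_info_items": 4,
    }
)

synthesis_method, num_chunks, intermediate_length_range = profiler.execute(payload)
print(synthesis_method)           # map_reduce: joint reasoning + High complexity
print(num_chunks)                 # [4, 12], i.e. [n_info_items, 3 * n_info_items]
print(intermediate_length_range)  # 120
```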