aethergraph 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aethergraph/__init__.py +49 -0
- aethergraph/config/__init__.py +0 -0
- aethergraph/config/config.py +121 -0
- aethergraph/config/context.py +16 -0
- aethergraph/config/llm.py +26 -0
- aethergraph/config/loader.py +60 -0
- aethergraph/config/runtime.py +9 -0
- aethergraph/contracts/errors/errors.py +44 -0
- aethergraph/contracts/services/artifacts.py +142 -0
- aethergraph/contracts/services/channel.py +72 -0
- aethergraph/contracts/services/continuations.py +23 -0
- aethergraph/contracts/services/eventbus.py +12 -0
- aethergraph/contracts/services/kv.py +24 -0
- aethergraph/contracts/services/llm.py +17 -0
- aethergraph/contracts/services/mcp.py +22 -0
- aethergraph/contracts/services/memory.py +108 -0
- aethergraph/contracts/services/resume.py +28 -0
- aethergraph/contracts/services/state_stores.py +33 -0
- aethergraph/contracts/services/wakeup.py +28 -0
- aethergraph/core/execution/base_scheduler.py +77 -0
- aethergraph/core/execution/forward_scheduler.py +777 -0
- aethergraph/core/execution/global_scheduler.py +634 -0
- aethergraph/core/execution/retry_policy.py +22 -0
- aethergraph/core/execution/step_forward.py +411 -0
- aethergraph/core/execution/step_result.py +18 -0
- aethergraph/core/execution/wait_types.py +72 -0
- aethergraph/core/graph/graph_builder.py +192 -0
- aethergraph/core/graph/graph_fn.py +219 -0
- aethergraph/core/graph/graph_io.py +67 -0
- aethergraph/core/graph/graph_refs.py +154 -0
- aethergraph/core/graph/graph_spec.py +115 -0
- aethergraph/core/graph/graph_state.py +59 -0
- aethergraph/core/graph/graphify.py +128 -0
- aethergraph/core/graph/interpreter.py +145 -0
- aethergraph/core/graph/node_handle.py +33 -0
- aethergraph/core/graph/node_spec.py +46 -0
- aethergraph/core/graph/node_state.py +63 -0
- aethergraph/core/graph/task_graph.py +747 -0
- aethergraph/core/graph/task_node.py +82 -0
- aethergraph/core/graph/utils.py +37 -0
- aethergraph/core/graph/visualize.py +239 -0
- aethergraph/core/runtime/ad_hoc_context.py +61 -0
- aethergraph/core/runtime/base_service.py +153 -0
- aethergraph/core/runtime/bind_adapter.py +42 -0
- aethergraph/core/runtime/bound_memory.py +69 -0
- aethergraph/core/runtime/execution_context.py +220 -0
- aethergraph/core/runtime/graph_runner.py +349 -0
- aethergraph/core/runtime/lifecycle.py +26 -0
- aethergraph/core/runtime/node_context.py +203 -0
- aethergraph/core/runtime/node_services.py +30 -0
- aethergraph/core/runtime/recovery.py +159 -0
- aethergraph/core/runtime/run_registration.py +33 -0
- aethergraph/core/runtime/runtime_env.py +157 -0
- aethergraph/core/runtime/runtime_registry.py +32 -0
- aethergraph/core/runtime/runtime_services.py +224 -0
- aethergraph/core/runtime/wakeup_watcher.py +40 -0
- aethergraph/core/tools/__init__.py +10 -0
- aethergraph/core/tools/builtins/channel_tools.py +194 -0
- aethergraph/core/tools/builtins/toolset.py +134 -0
- aethergraph/core/tools/toolkit.py +510 -0
- aethergraph/core/tools/waitable.py +109 -0
- aethergraph/plugins/channel/__init__.py +0 -0
- aethergraph/plugins/channel/adapters/__init__.py +0 -0
- aethergraph/plugins/channel/adapters/console.py +106 -0
- aethergraph/plugins/channel/adapters/file.py +102 -0
- aethergraph/plugins/channel/adapters/slack.py +285 -0
- aethergraph/plugins/channel/adapters/telegram.py +302 -0
- aethergraph/plugins/channel/adapters/webhook.py +104 -0
- aethergraph/plugins/channel/adapters/webui.py +134 -0
- aethergraph/plugins/channel/routes/__init__.py +0 -0
- aethergraph/plugins/channel/routes/console_routes.py +86 -0
- aethergraph/plugins/channel/routes/slack_routes.py +49 -0
- aethergraph/plugins/channel/routes/telegram_routes.py +26 -0
- aethergraph/plugins/channel/routes/webui_routes.py +136 -0
- aethergraph/plugins/channel/utils/__init__.py +0 -0
- aethergraph/plugins/channel/utils/slack_utils.py +278 -0
- aethergraph/plugins/channel/utils/telegram_utils.py +324 -0
- aethergraph/plugins/channel/websockets/slack_ws.py +68 -0
- aethergraph/plugins/channel/websockets/telegram_polling.py +151 -0
- aethergraph/plugins/mcp/fs_server.py +128 -0
- aethergraph/plugins/mcp/http_server.py +101 -0
- aethergraph/plugins/mcp/ws_server.py +180 -0
- aethergraph/plugins/net/http.py +10 -0
- aethergraph/plugins/utils/data_io.py +359 -0
- aethergraph/runner/__init__.py +5 -0
- aethergraph/runtime/__init__.py +62 -0
- aethergraph/server/__init__.py +3 -0
- aethergraph/server/app_factory.py +84 -0
- aethergraph/server/start.py +122 -0
- aethergraph/services/__init__.py +10 -0
- aethergraph/services/artifacts/facade.py +284 -0
- aethergraph/services/artifacts/factory.py +35 -0
- aethergraph/services/artifacts/fs_store.py +656 -0
- aethergraph/services/artifacts/jsonl_index.py +123 -0
- aethergraph/services/artifacts/paths.py +23 -0
- aethergraph/services/artifacts/sqlite_index.py +209 -0
- aethergraph/services/artifacts/utils.py +124 -0
- aethergraph/services/auth/dev.py +16 -0
- aethergraph/services/channel/channel_bus.py +293 -0
- aethergraph/services/channel/factory.py +44 -0
- aethergraph/services/channel/session.py +511 -0
- aethergraph/services/channel/wait_helpers.py +57 -0
- aethergraph/services/clock/clock.py +9 -0
- aethergraph/services/container/default_container.py +320 -0
- aethergraph/services/continuations/continuation.py +56 -0
- aethergraph/services/continuations/factory.py +34 -0
- aethergraph/services/continuations/stores/fs_store.py +264 -0
- aethergraph/services/continuations/stores/inmem_store.py +95 -0
- aethergraph/services/eventbus/inmem.py +21 -0
- aethergraph/services/features/static.py +10 -0
- aethergraph/services/kv/ephemeral.py +90 -0
- aethergraph/services/kv/factory.py +27 -0
- aethergraph/services/kv/layered.py +41 -0
- aethergraph/services/kv/sqlite_kv.py +128 -0
- aethergraph/services/llm/factory.py +157 -0
- aethergraph/services/llm/generic_client.py +542 -0
- aethergraph/services/llm/providers.py +3 -0
- aethergraph/services/llm/service.py +105 -0
- aethergraph/services/logger/base.py +36 -0
- aethergraph/services/logger/compat.py +50 -0
- aethergraph/services/logger/formatters.py +106 -0
- aethergraph/services/logger/std.py +203 -0
- aethergraph/services/mcp/helpers.py +23 -0
- aethergraph/services/mcp/http_client.py +70 -0
- aethergraph/services/mcp/mcp_tools.py +21 -0
- aethergraph/services/mcp/registry.py +14 -0
- aethergraph/services/mcp/service.py +100 -0
- aethergraph/services/mcp/stdio_client.py +70 -0
- aethergraph/services/mcp/ws_client.py +115 -0
- aethergraph/services/memory/bound.py +106 -0
- aethergraph/services/memory/distillers/episode.py +116 -0
- aethergraph/services/memory/distillers/rolling.py +74 -0
- aethergraph/services/memory/facade.py +633 -0
- aethergraph/services/memory/factory.py +78 -0
- aethergraph/services/memory/hotlog_kv.py +27 -0
- aethergraph/services/memory/indices.py +74 -0
- aethergraph/services/memory/io_helpers.py +72 -0
- aethergraph/services/memory/persist_fs.py +40 -0
- aethergraph/services/memory/resolver.py +152 -0
- aethergraph/services/metering/noop.py +4 -0
- aethergraph/services/prompts/file_store.py +41 -0
- aethergraph/services/rag/chunker.py +29 -0
- aethergraph/services/rag/facade.py +593 -0
- aethergraph/services/rag/index/base.py +27 -0
- aethergraph/services/rag/index/faiss_index.py +121 -0
- aethergraph/services/rag/index/sqlite_index.py +134 -0
- aethergraph/services/rag/index_factory.py +52 -0
- aethergraph/services/rag/parsers/md.py +7 -0
- aethergraph/services/rag/parsers/pdf.py +14 -0
- aethergraph/services/rag/parsers/txt.py +7 -0
- aethergraph/services/rag/utils/hybrid.py +39 -0
- aethergraph/services/rag/utils/make_fs_key.py +62 -0
- aethergraph/services/redactor/simple.py +16 -0
- aethergraph/services/registry/key_parsing.py +44 -0
- aethergraph/services/registry/registry_key.py +19 -0
- aethergraph/services/registry/unified_registry.py +185 -0
- aethergraph/services/resume/multi_scheduler_resume_bus.py +65 -0
- aethergraph/services/resume/router.py +73 -0
- aethergraph/services/schedulers/registry.py +41 -0
- aethergraph/services/secrets/base.py +7 -0
- aethergraph/services/secrets/env.py +8 -0
- aethergraph/services/state_stores/externalize.py +135 -0
- aethergraph/services/state_stores/graph_observer.py +131 -0
- aethergraph/services/state_stores/json_store.py +67 -0
- aethergraph/services/state_stores/resume_policy.py +119 -0
- aethergraph/services/state_stores/serialize.py +249 -0
- aethergraph/services/state_stores/utils.py +91 -0
- aethergraph/services/state_stores/validate.py +78 -0
- aethergraph/services/tracing/noop.py +18 -0
- aethergraph/services/waits/wait_registry.py +91 -0
- aethergraph/services/wakeup/memory_queue.py +57 -0
- aethergraph/services/wakeup/scanner_producer.py +56 -0
- aethergraph/services/wakeup/worker.py +31 -0
- aethergraph/tools/__init__.py +25 -0
- aethergraph/utils/optdeps.py +8 -0
- aethergraph-0.1.0a1.dist-info/METADATA +410 -0
- aethergraph-0.1.0a1.dist-info/RECORD +182 -0
- aethergraph-0.1.0a1.dist-info/WHEEL +5 -0
- aethergraph-0.1.0a1.dist-info/entry_points.txt +2 -0
- aethergraph-0.1.0a1.dist-info/licenses/LICENSE +176 -0
- aethergraph-0.1.0a1.dist-info/licenses/NOTICE +31 -0
- aethergraph-0.1.0a1.dist-info/top_level.txt +1 -0
aethergraph/services/rag/facade.py
@@ -0,0 +1,593 @@
from __future__ import annotations

from dataclasses import dataclass
import hashlib
import json
import os
import shutil
import time
from typing import Any

from aethergraph.contracts.services.llm import LLMClientProtocol

from .chunker import TextSplitter
from .utils.hybrid import topk_fuse
from .utils.make_fs_key import make_fs_key


@dataclass
class SearchHit:
    """A single search hit from RAG retrieval."""

    chunk_id: str
    doc_id: str
    corpus_id: str
    score: float
    text: str
    meta: dict[str, Any]


def _now_iso() -> str:
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())


def _stable_id(parts: dict[str, Any]) -> str:
    blob = json.dumps(parts, sort_keys=True, ensure_ascii=False).encode("utf-8")
    return hashlib.sha256(blob).hexdigest()[:24]


class RAGFacade:
    """Facade for RAG operations: corpus management, document ingestion, retrieval, and QA."""

    def __init__(
        self,
        *,
        corpus_root: str,
        artifacts,
        embed_client: LLMClientProtocol,
        llm_client: LLMClientProtocol,
        index_backend,
        chunker: TextSplitter,
        logger=None,
    ):
        """Initialize RAGFacade with storage paths and service clients.
        Args:
            corpus_root: Root directory for storing corpora.
            artifacts: Artifact storage facade.
            embed_client: Embedding service client.
            llm_client: LLM client used for answering questions.
            index_backend: Vector index backend.
            chunker: TextSplitter instance for chunking documents.
            logger: Optional logger for logging messages.
        """
        self.root = corpus_root
        self.artifacts = artifacts
        self.embed = embed_client
        self.llm = llm_client
        self.index = index_backend
        self.chunker = chunker
        self.logger = logger

        # self.logger.info(f"RAGFacade initialized with corpus root: {self.root}, index: {type(self.index).__name__}, embed model: {getattr(self.embed, 'embed_model', None)}, llm model: {getattr(self.llm, 'model', None)}")

    def set_llm_client(self, client: LLMClientProtocol) -> None:
        """Set the LLM client to use for answering questions."""
        assert client.model is not None, "RAG LLM client must have a model set"
        assert client.embed_model is not None, "RAG LLM client must have an embedding model set"
        self.llm = client
        if self.logger:  # logger is optional; guard before logging
            self.logger.info(
                f"RAG LLM client set to model: {self.llm.model}, embed model: {self.llm.embed_model}"
            )

    def set_index_backend(self, index_backend) -> None:
        """Set the vector index backend."""
        self.index = index_backend
        if self.logger:  # logger is optional; guard before logging
            self.logger.info(f"RAG index backend set to: {type(self.index).__name__}")

    def _cdir(self, corpus_id: str) -> str:
        """Resolve the corpus directory path for a corpus ID, ensuring the path works safely across operating systems.
        Args:
            corpus_id: Unique identifier for the corpus.
        Returns:
            Path to the corpus directory.
        """

        return os.path.join(self.root, make_fs_key(corpus_id))

    # ---------- ingestion ----------
    async def add_corpus(self, corpus_id: str, meta: dict[str, Any] | None = None):
        """Create a new corpus with optional metadata.
        Args:
            corpus_id: Unique identifier for the corpus.
            meta: Optional metadata dictionary to store with the corpus.
        """
        p = self._cdir(corpus_id)
        os.makedirs(p, exist_ok=True)
        meta_path = os.path.join(p, "corpus.json")
        if not os.path.exists(meta_path):
            with open(meta_path, "w", encoding="utf-8") as f:
                json.dump(
                    {
                        "corpus_id": corpus_id,
                        "fs_key": make_fs_key(corpus_id),  # for reference
                        "created_at": _now_iso(),
                        "meta": meta or {},
                    },
                    f,
                )

    async def upsert_docs(self, corpus_id: str, docs: list[dict[str, Any]]) -> dict[str, Any]:
        """Ingest and index a list of documents into the specified corpus.
        Args:
            corpus_id: The target corpus identifier.
            docs: A list of document specifications.

        Docs can be specified as either:
        - File-based documents: {"path": "/path/to/doc.pdf", "labels": {...}}
        - Inline text documents: {"text": "Document content...", "title": "Doc Title", "labels": {...}}
        """
        if not self.embed:
            raise RuntimeError("RAGFacade: embed client not configured")

        await self.add_corpus(corpus_id)
        cdir = self._cdir(corpus_id)
        docs_jl = os.path.join(cdir, "docs.jsonl")
        chunks_jl = os.path.join(cdir, "chunks.jsonl")
        os.makedirs(cdir, exist_ok=True)

        added_docs = 0
        all_chunk_ids, all_vecs, all_metas = [], [], []
        total_chunks = 0

        for d in docs:
            labels = d.get("labels", {})
            title = d.get("title") or os.path.basename(d.get("path", "")) or "untitled"
            doc_id = _stable_id({"title": title, "labels": labels, "ts": _now_iso()})
            text = None
            extra_meta = {}

            if "path" in d and os.path.exists(d["path"]):
                # save original file into artifacts CAS and parse
                uri = await self.artifacts.save_file(
                    path=d["path"],
                    kind="doc",
                    run_id="rag",
                    graph_id="rag",
                    node_id="rag",
                    tool_name="rag.upsert",
                    tool_version="0.1.0",
                    labels=labels,
                    cleanup=False,
                )
                path = d["path"].lower()
                if path.endswith(".pdf"):
                    from .parsers.pdf import extract_text

                    text, extra_meta = extract_text(d["path"])  # type: ignore
                elif path.endswith(".md") or path.endswith(".markdown") or path.endswith(".mkd"):
                    from .parsers.md import extract_text

                    text, extra_meta = extract_text(d["path"])  # type: ignore
                else:
                    from .parsers.txt import extract_text

                    text, extra_meta = extract_text(d["path"])  # type: ignore
                doc_uri = uri.uri if hasattr(uri, "uri") else uri
            else:
                # inline text doc - persist as artifact first
                payload = d.get("text", "")
                uri = await self.artifacts.save_text(payload=payload)  # store as temp artifact
                doc_uri = uri.uri if hasattr(uri, "uri") else uri
                text = payload

            text = (text or "").strip()
            if not text:
                if self.logger:
                    self.logger.warning(f"RAG: empty text for doc {title}")
                continue

            # write doc record
            with open(docs_jl, "a", encoding="utf-8") as f:
                f.write(
                    json.dumps(
                        {
                            "doc_id": doc_id,
                            "corpus_id": corpus_id,
                            "uri": doc_uri,
                            "title": title,
                            "meta": {"labels": labels, **extra_meta},
                            "created_at": _now_iso(),
                        },
                        ensure_ascii=False,
                    )
                    + "\n"
                )
            added_docs += 1

            # chunk + embed
            chunks = self.chunker.split(text)
            if not chunks:
                continue
            # batch embed
            vecs = await self.embed.embed(chunks)
            for i, (chunk_text, vec) in enumerate(zip(chunks, vecs, strict=True)):
                chunk_id = _stable_id({"doc": doc_id, "i": i})
                meta = {"doc_id": doc_id, "title": title, "i": i, "labels": labels}
                # append chunk record
                with open(chunks_jl, "a", encoding="utf-8") as f:
                    f.write(
                        json.dumps(
                            {
                                "chunk_id": chunk_id,
                                "doc_id": doc_id,
                                "corpus_id": corpus_id,
                                "text": chunk_text,
                                "meta": meta,
                            },
                            ensure_ascii=False,
                        )
                        + "\n"
                    )
                all_chunk_ids.append(chunk_id)
                all_vecs.append(vec)
                all_metas.append({**meta})
            total_chunks += len(chunks)

        # add to index
        if all_chunk_ids:
            await self.index.add(corpus_id, all_chunk_ids, all_vecs, all_metas)

        return {"added": added_docs, "chunks": total_chunks, "index": type(self.index).__name__}

    # ---------- retrieval ----------
    def _load_chunks_map(self, corpus_id: str) -> dict[str, dict[str, Any]]:
        """Load chunk metadata for a given corpus."""
        # Load latest chunk text+meta into a dict
        cdir = self._cdir(corpus_id)
        chunks_jl = os.path.join(cdir, "chunks.jsonl")
        out = {}
        if not os.path.exists(chunks_jl):
            return out
        with open(chunks_jl, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                obj = json.loads(line)
                out[obj["chunk_id"]] = obj
        return out

    async def search(
        self,
        corpus_id: str,
        query: str,
        k: int = 8,
        filters: dict[str, Any] | None = None,
        mode: str = "hybrid",
    ) -> list[SearchHit]:
        """Search the corpus for relevant chunks given a query.
        Args:
            corpus_id: Target corpus identifier.
            query: The search query string.
            k: Number of top results to return.
            filters: Optional metadata filters to apply.
            mode: Search mode - "dense", "hybrid".
        """
        if not self.embed:
            raise RuntimeError("RAGFacade: embed client not configured")

        # dense search via index then optional lexical fusion
        qvec = (await self.embed.embed([query]))[0]
        dense_hits = await self.index.search(corpus_id, qvec, max(24, k))
        chunks_map = self._load_chunks_map(corpus_id)
        if mode == "dense" or not dense_hits:
            dense_hits = dense_hits[:k]
            return [
                SearchHit(
                    chunk_id=h["chunk_id"],
                    doc_id=chunks_map.get(h["chunk_id"], {}).get("doc_id", ""),
                    corpus_id=corpus_id,
                    score=h["score"],
                    text=chunks_map.get(h["chunk_id"], {}).get("text", ""),
                    meta=h.get("meta", {}),
                )
                for h in dense_hits
            ]

        fused = topk_fuse(
            query, dense_hits, {cid: rec.get("text", "") for cid, rec in chunks_map.items()}, k
        )
        out = []
        for h in fused:
            rec = chunks_map.get(h["chunk_id"], {})
            out.append(
                SearchHit(
                    chunk_id=h["chunk_id"],
                    doc_id=rec.get("doc_id", ""),
                    corpus_id=corpus_id,
                    score=h["score"],
                    text=rec.get("text", ""),
                    meta=h.get("meta", {}),
                )
            )
        return out

    async def retrieve(
        self, corpus_id: str, query: str, k: int = 6, rerank: bool = True
    ) -> list[SearchHit]:
        """Retrieve top-k relevant chunks for a query from the corpus.
        Args:
            corpus_id: Target corpus identifier.
            query: The retrieval query string.
            k: Number of top results to return.
            rerank: Whether to rerank results using hybrid scoring.
        """
        # For now, rerank flag is ignored; fused hybrid already sorts reasonably.
        return await self.search(corpus_id, query, k=k, mode="hybrid")

    async def answer(
        self,
        corpus_id: str,
        question: str,
        *,
        llm: LLMClientProtocol | None = None,
        style: str = "concise",
        with_citations: bool = True,
        k: int = 6,
    ) -> dict[str, Any]:
        """Answer a question using retrieved context from the corpus.
        Args:
            corpus_id: Target corpus identifier.
            question: The question to answer.
            llm: Language model client for generating the answer. If None, uses default LLM.
            style: Answering style - "concise" or "detailed".
            with_citations: Whether to include citations in the answer.
            k: Number of context chunks to retrieve.
        """
        if not llm:
            # use default LLM client
            llm = self.llm

        hits = await self.retrieve(corpus_id, question, k=k, rerank=True)
        context = "\n\n".join([f"[{i + 1}] {h.text}" for i, h in enumerate(hits)])
        sys = "You answer strictly from the provided context. Cite chunk numbers like [1],[2]. If insufficient, say you don't know."
        if style == "detailed":
            sys += " Be structured and explain reasoning briefly."
        usr = f"Question: {question}\n\nContext:\n{context}"
        text, usage = await llm.chat(
            [{"role": "system", "content": sys}, {"role": "user", "content": usr}]
        )
        out = {
            "answer": text,
            "citations": [
                {"chunk_id": h.chunk_id, "doc_id": h.doc_id, "rank": i + 1}
                for i, h in enumerate(hits)
            ],
            "usage": usage,
        }
        if with_citations:
            out["resolved_citations"] = self.resolve_citations(corpus_id, out["citations"])
        return out

    def resolve_citations(self, corpus_id: str, citations: list[dict]) -> list[dict]:
        """Return [{rank, doc_id, title, uri, chunk_id, snippet}] sorted by rank."""
        # load chunks + doc meta
        cdir = self._cdir(corpus_id)
        chunks_jl = os.path.join(cdir, "chunks.jsonl")
        docs_jl = os.path.join(cdir, "docs.jsonl")

        # build maps
        chunk_map, doc_map = {}, {}
        if os.path.exists(chunks_jl):
            with open(chunks_jl, encoding="utf-8") as f:
                for line in f:
                    o = json.loads(line)
                    chunk_map[o["chunk_id"]] = o
        if os.path.exists(docs_jl):
            with open(docs_jl, encoding="utf-8") as f:
                for line in f:
                    o = json.loads(line)
                    doc_map[o["doc_id"]] = o

        out = []
        for c in sorted(citations, key=lambda x: x["rank"]):
            ch = chunk_map.get(c["chunk_id"], {})
            dd = doc_map.get(c["doc_id"], {})
            text = (ch.get("text") or "").strip().replace("\n", " ")
            snippet = (text[:220] + "…") if len(text) > 220 else text
            out.append(
                {
                    "rank": c["rank"],
                    "doc_id": c["doc_id"],
                    "title": dd.get("title", "(untitled)"),
                    "uri": dd.get("uri"),  # CAS or file URI from artifact store
                    "chunk_id": c["chunk_id"],
                    "snippet": snippet,
                }
            )
        return out

    async def list_corpora(self) -> list[dict]:
        out = []
        for d in sorted(os.listdir(self.root)):
            # cdir = self._cdir(d)
            cdir = os.path.join(self.root, d)  # d is already fs_key
            if not os.path.isdir(cdir):
                continue
            meta_path = os.path.join(cdir, "corpus.json")
            meta = {}
            if os.path.exists(meta_path):
                try:
                    with open(meta_path, encoding="utf-8") as f:
                        meta = json.load(f)
                except Exception:
                    meta = {}
            # Prefer the recorded logical id (i.e. corpus_id); fall back to folder name (which may be fs-safe key)
            logical_id = meta.get("corpus_id") or meta.get("logical_id") or d
            out.append({"corpus_id": logical_id, "meta": meta})
        return out

    async def list_docs(
        self, corpus_id: str, limit: int = 200, after: str | None = None
    ) -> list[dict]:
        cdir = self._cdir(corpus_id)
        docs_jl = os.path.join(cdir, "docs.jsonl")
        if not os.path.exists(docs_jl):
            return []
        acc: list[dict] = []
        seen_after = after is None
        with open(docs_jl, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                obj = json.loads(line)
                if not seen_after:
                    if obj.get("doc_id") == after:
                        seen_after = True
                    continue
                acc.append(obj)
                if len(acc) >= limit:
                    break
        return acc

    async def delete_docs(self, corpus_id: str, doc_ids: list[str]) -> dict:
        """
        Removes docs from docs.jsonl and any chunks in chunks.jsonl; asks the index to drop vectors if supported.
        """
        cdir = self._cdir(corpus_id)
        docs_jl = os.path.join(cdir, "docs.jsonl")
        chunks_jl = os.path.join(cdir, "chunks.jsonl")
        kept_docs, kept_chunks = [], []
        removed_chunks = []
        doc_set = set(doc_ids)

        if os.path.exists(chunks_jl):
            with open(chunks_jl, encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue
                    o = json.loads(line)
                    if o.get("doc_id") in doc_set:
                        removed_chunks.append(o.get("chunk_id"))
                    else:
                        kept_chunks.append(line)
            with open(chunks_jl, "w", encoding="utf-8") as f:
                f.writelines(kept_chunks)

        if os.path.exists(docs_jl):
            with open(docs_jl, encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue
                    o = json.loads(line)
                    if o.get("doc_id") not in doc_set:
                        kept_docs.append(line)
            with open(docs_jl, "w", encoding="utf-8") as f:
                f.writelines(kept_docs)

        # drop from index if supported
        if hasattr(self.index, "remove"):
            await self.index.remove(corpus_id, removed_chunks)
        elif hasattr(self.index, "delete"):
            await self.index.delete(corpus_id, removed_chunks)

        return {"removed_docs": len(doc_ids), "removed_chunks": len(removed_chunks)}

    async def reembed(
        self, corpus_id: str, *, doc_ids: list[str] | None = None, batch: int = 64
    ) -> dict:
        """
        Re-embeds selected docs (or all) and re-adds vectors. Uses the configured embed client or a model override if your client supports it.
        """
        cdir = self._cdir(corpus_id)
        chunks_jl = os.path.join(cdir, "chunks.jsonl")
        if not os.path.exists(chunks_jl):
            return {"reembedded": 0}

        targets: list[dict] = []
        with open(chunks_jl, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                o = json.loads(line)
                if doc_ids is None or o.get("doc_id") in set(doc_ids):
                    targets.append(o)

        # set model on embed client if supported
        embed = self.embed

        # Re-embed in batches
        added = 0
        for i in range(0, len(targets), batch):
            batch_ch = targets[i : i + batch]
            vecs = await embed.embed([t["text"] for t in batch_ch])
            chunk_ids = [t["chunk_id"] for t in batch_ch]
            metas = [t.get("meta", {}) for t in batch_ch]
            await self.index.add(corpus_id, chunk_ids, vecs, metas)
            added += len(batch_ch)
        return {"reembedded": added, "model": getattr(embed, "embed_model", None)}

    async def stats(self, corpus_id: str) -> dict:
        cdir = self._cdir(corpus_id)
        docs_jl = os.path.join(cdir, "docs.jsonl")
        chunks_jl = os.path.join(cdir, "chunks.jsonl")

        def _count_lines(path: str) -> int:
            if not os.path.exists(path):
                return 0
            with open(path, encoding="utf-8") as f:
                return sum(1 for _ in f)

        n_docs = _count_lines(docs_jl)
        n_chunks = _count_lines(chunks_jl)

        meta = {}
        meta_path = os.path.join(cdir, "corpus.json")
        if os.path.exists(meta_path):
            try:
                with open(meta_path, encoding="utf-8") as f:
                    meta = json.load(f)
            except Exception:
                meta = {}
        return {"corpus_id": corpus_id, "docs": n_docs, "chunks": n_chunks, "meta": meta}

    async def export(self, corpus_id: str) -> dict:
        """
        Create a simple tarball-like directory bundle (docs+chunks+corpus.json) and persist via artifacts store.
        """
        raise NotImplementedError("RAGFacade.export is not yet implemented")
        # TODO: implement proper temp dir cleanup
        cdir = self._cdir(corpus_id)
        bundle_dir = os.path.join(cdir, f"bundle_{_now_iso().replace(':', '').replace('-', '')}")
        os.makedirs(bundle_dir, exist_ok=True)
        for name in ("corpus.json", "docs.jsonl", "chunks.jsonl"):
            p = os.path.join(cdir, name)
            if os.path.exists(p):
                shutil.copy2(p, os.path.join(bundle_dir, name))
        # Save dir via artifacts as a bundle
        uri = await self.artifacts.save_dir(bundle_dir, labels={"corpus_id": corpus_id})
        return {"uri": getattr(uri, "uri", uri)}

    async def import_bundle(self, bundle_uri: str, into_corpus: str | None = None) -> dict:
        """
        Resolve artifact dir and merge into an existing/new corpus.
        """
        raise NotImplementedError("RAGFacade.import_bundle is not yet implemented")
        # TODO: implement proper temp dir cleanup
        # Assuming artifacts can resolve a dir path from URI
        bundle_path = await self.artifacts.resolve_dir(bundle_uri)
        with open(os.path.join(bundle_path, "corpus.json"), encoding="utf-8") as f:
            meta = json.load(f)
        target = into_corpus or meta.get("corpus_id")
        await self.add_corpus(target, meta=meta.get("meta", {}))

        # Append docs & chunks
        for name in ("docs.jsonl", "chunks.jsonl"):
            src = os.path.join(bundle_path, name)
            if not os.path.exists(src):
                continue
            dst = os.path.join(self._cdir(target), name)
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            with open(dst, "a", encoding="utf-8") as out_f, open(src, encoding="utf-8") as in_f:
                for line in in_f:
                    if line.strip():
                        out_f.write(line)
        return {"imported_into": target}
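Taken as a whole, the facade keeps per-corpus bookkeeping in docs.jsonl/chunks.jsonl, embeds chunks through the configured embed client, and delegates vector storage and dense search to the index backend. A minimal usage sketch, assuming an already-constructed RAGFacade instance named `rag` (backend wiring is not shown in this hunk) and a hypothetical corpus id and file path:

import asyncio

async def demo(rag) -> None:
    # `rag` is assumed to be a fully wired RAGFacade (artifacts, embed/LLM clients, index, chunker).
    await rag.add_corpus("kb:runbooks", meta={"owner": "ops"})  # hypothetical corpus id

    # Ingest one inline document and one file-based document (doc specs per the upsert_docs docstring).
    stats = await rag.upsert_docs(
        "kb:runbooks",
        [
            {"text": "Restart the worker with `systemctl restart worker`.", "title": "Worker restart"},
            {"path": "/tmp/oncall.md", "labels": {"team": "ops"}},  # hypothetical path
        ],
    )
    print(stats)  # {"added": ..., "chunks": ..., "index": ...}

    # Hybrid retrieval, then grounded question answering with citations.
    hits = await rag.search("kb:runbooks", "how do I restart the worker?", k=4)
    print(len(hits), "hits")
    result = await rag.answer("kb:runbooks", "How do I restart the worker?", k=4)
    print(result["answer"], result["resolved_citations"])

# asyncio.run(demo(rag))  # run with a concrete facade instance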
aethergraph/services/rag/index/base.py
@@ -0,0 +1,27 @@
from __future__ import annotations

from typing import Any


class VectorIndex:
    def __init__(self, index_path: str):
        self.index_path = index_path

    async def add(
        self,
        corpus_id: str,
        chunk_ids: list[str],
        vectors: list[list[float]],
        metas: list[dict[str, Any]],
    ):
        raise NotImplementedError

    async def delete(self, corpus_id: str, chunk_ids: list[str] | None = None):
        raise NotImplementedError

    async def search(self, corpus_id: str, query_vec: list[float], k: int) -> list[dict[str, Any]]:
        """Return a list of {{chunk_id, score, meta}} sorted by descending score."""
        raise NotImplementedError

    async def list_chunks(self, corpus_id: str) -> list[str]:
        raise NotImplementedError
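The facade only depends on this small duck-typed contract (add/delete/search returning chunk_id/score/meta dicts); the concrete backends shipped in this release are the FAISS and SQLite indices listed above (faiss_index.py, sqlite_index.py). Purely as an illustration of the contract, not part of the package, an in-memory backend with brute-force cosine scoring might look like this:

import math
from collections import defaultdict
from typing import Any

class InMemoryIndex(VectorIndex):
    """Toy brute-force backend illustrating the VectorIndex contract (hypothetical, not shipped)."""

    def __init__(self, index_path: str = ":memory:"):
        super().__init__(index_path)
        # corpus_id -> list of (chunk_id, vector, meta)
        self._rows: dict[str, list[tuple[str, list[float], dict[str, Any]]]] = defaultdict(list)

    async def add(self, corpus_id, chunk_ids, vectors, metas):
        self._rows[corpus_id].extend(zip(chunk_ids, vectors, metas))

    async def delete(self, corpus_id, chunk_ids=None):
        if chunk_ids is None:
            self._rows.pop(corpus_id, None)
        else:
            drop = set(chunk_ids)
            self._rows[corpus_id] = [r for r in self._rows[corpus_id] if r[0] not in drop]

    async def search(self, corpus_id, query_vec, k):
        def cos(a, b):
            dot = sum(x * y for x, y in zip(a, b))
            na = math.sqrt(sum(x * x for x in a)) or 1.0
            nb = math.sqrt(sum(x * x for x in b)) or 1.0
            return dot / (na * nb)

        scored = [
            {"chunk_id": cid, "score": cos(query_vec, vec), "meta": meta}
            for cid, vec, meta in self._rows.get(corpus_id, [])
        ]
        return sorted(scored, key=lambda h: h["score"], reverse=True)[:k]

    async def list_chunks(self, corpus_id):
        return [cid for cid, _, _ in self._rows.get(corpus_id, [])]

Because RAGFacade.delete_docs probes for either a remove or a delete method, a backend like this sketch would be exercised through its delete path.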