aethergraph 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. aethergraph/__init__.py +49 -0
  2. aethergraph/config/__init__.py +0 -0
  3. aethergraph/config/config.py +121 -0
  4. aethergraph/config/context.py +16 -0
  5. aethergraph/config/llm.py +26 -0
  6. aethergraph/config/loader.py +60 -0
  7. aethergraph/config/runtime.py +9 -0
  8. aethergraph/contracts/errors/errors.py +44 -0
  9. aethergraph/contracts/services/artifacts.py +142 -0
  10. aethergraph/contracts/services/channel.py +72 -0
  11. aethergraph/contracts/services/continuations.py +23 -0
  12. aethergraph/contracts/services/eventbus.py +12 -0
  13. aethergraph/contracts/services/kv.py +24 -0
  14. aethergraph/contracts/services/llm.py +17 -0
  15. aethergraph/contracts/services/mcp.py +22 -0
  16. aethergraph/contracts/services/memory.py +108 -0
  17. aethergraph/contracts/services/resume.py +28 -0
  18. aethergraph/contracts/services/state_stores.py +33 -0
  19. aethergraph/contracts/services/wakeup.py +28 -0
  20. aethergraph/core/execution/base_scheduler.py +77 -0
  21. aethergraph/core/execution/forward_scheduler.py +777 -0
  22. aethergraph/core/execution/global_scheduler.py +634 -0
  23. aethergraph/core/execution/retry_policy.py +22 -0
  24. aethergraph/core/execution/step_forward.py +411 -0
  25. aethergraph/core/execution/step_result.py +18 -0
  26. aethergraph/core/execution/wait_types.py +72 -0
  27. aethergraph/core/graph/graph_builder.py +192 -0
  28. aethergraph/core/graph/graph_fn.py +219 -0
  29. aethergraph/core/graph/graph_io.py +67 -0
  30. aethergraph/core/graph/graph_refs.py +154 -0
  31. aethergraph/core/graph/graph_spec.py +115 -0
  32. aethergraph/core/graph/graph_state.py +59 -0
  33. aethergraph/core/graph/graphify.py +128 -0
  34. aethergraph/core/graph/interpreter.py +145 -0
  35. aethergraph/core/graph/node_handle.py +33 -0
  36. aethergraph/core/graph/node_spec.py +46 -0
  37. aethergraph/core/graph/node_state.py +63 -0
  38. aethergraph/core/graph/task_graph.py +747 -0
  39. aethergraph/core/graph/task_node.py +82 -0
  40. aethergraph/core/graph/utils.py +37 -0
  41. aethergraph/core/graph/visualize.py +239 -0
  42. aethergraph/core/runtime/ad_hoc_context.py +61 -0
  43. aethergraph/core/runtime/base_service.py +153 -0
  44. aethergraph/core/runtime/bind_adapter.py +42 -0
  45. aethergraph/core/runtime/bound_memory.py +69 -0
  46. aethergraph/core/runtime/execution_context.py +220 -0
  47. aethergraph/core/runtime/graph_runner.py +349 -0
  48. aethergraph/core/runtime/lifecycle.py +26 -0
  49. aethergraph/core/runtime/node_context.py +203 -0
  50. aethergraph/core/runtime/node_services.py +30 -0
  51. aethergraph/core/runtime/recovery.py +159 -0
  52. aethergraph/core/runtime/run_registration.py +33 -0
  53. aethergraph/core/runtime/runtime_env.py +157 -0
  54. aethergraph/core/runtime/runtime_registry.py +32 -0
  55. aethergraph/core/runtime/runtime_services.py +224 -0
  56. aethergraph/core/runtime/wakeup_watcher.py +40 -0
  57. aethergraph/core/tools/__init__.py +10 -0
  58. aethergraph/core/tools/builtins/channel_tools.py +194 -0
  59. aethergraph/core/tools/builtins/toolset.py +134 -0
  60. aethergraph/core/tools/toolkit.py +510 -0
  61. aethergraph/core/tools/waitable.py +109 -0
  62. aethergraph/plugins/channel/__init__.py +0 -0
  63. aethergraph/plugins/channel/adapters/__init__.py +0 -0
  64. aethergraph/plugins/channel/adapters/console.py +106 -0
  65. aethergraph/plugins/channel/adapters/file.py +102 -0
  66. aethergraph/plugins/channel/adapters/slack.py +285 -0
  67. aethergraph/plugins/channel/adapters/telegram.py +302 -0
  68. aethergraph/plugins/channel/adapters/webhook.py +104 -0
  69. aethergraph/plugins/channel/adapters/webui.py +134 -0
  70. aethergraph/plugins/channel/routes/__init__.py +0 -0
  71. aethergraph/plugins/channel/routes/console_routes.py +86 -0
  72. aethergraph/plugins/channel/routes/slack_routes.py +49 -0
  73. aethergraph/plugins/channel/routes/telegram_routes.py +26 -0
  74. aethergraph/plugins/channel/routes/webui_routes.py +136 -0
  75. aethergraph/plugins/channel/utils/__init__.py +0 -0
  76. aethergraph/plugins/channel/utils/slack_utils.py +278 -0
  77. aethergraph/plugins/channel/utils/telegram_utils.py +324 -0
  78. aethergraph/plugins/channel/websockets/slack_ws.py +68 -0
  79. aethergraph/plugins/channel/websockets/telegram_polling.py +151 -0
  80. aethergraph/plugins/mcp/fs_server.py +128 -0
  81. aethergraph/plugins/mcp/http_server.py +101 -0
  82. aethergraph/plugins/mcp/ws_server.py +180 -0
  83. aethergraph/plugins/net/http.py +10 -0
  84. aethergraph/plugins/utils/data_io.py +359 -0
  85. aethergraph/runner/__init__.py +5 -0
  86. aethergraph/runtime/__init__.py +62 -0
  87. aethergraph/server/__init__.py +3 -0
  88. aethergraph/server/app_factory.py +84 -0
  89. aethergraph/server/start.py +122 -0
  90. aethergraph/services/__init__.py +10 -0
  91. aethergraph/services/artifacts/facade.py +284 -0
  92. aethergraph/services/artifacts/factory.py +35 -0
  93. aethergraph/services/artifacts/fs_store.py +656 -0
  94. aethergraph/services/artifacts/jsonl_index.py +123 -0
  95. aethergraph/services/artifacts/paths.py +23 -0
  96. aethergraph/services/artifacts/sqlite_index.py +209 -0
  97. aethergraph/services/artifacts/utils.py +124 -0
  98. aethergraph/services/auth/dev.py +16 -0
  99. aethergraph/services/channel/channel_bus.py +293 -0
  100. aethergraph/services/channel/factory.py +44 -0
  101. aethergraph/services/channel/session.py +511 -0
  102. aethergraph/services/channel/wait_helpers.py +57 -0
  103. aethergraph/services/clock/clock.py +9 -0
  104. aethergraph/services/container/default_container.py +320 -0
  105. aethergraph/services/continuations/continuation.py +56 -0
  106. aethergraph/services/continuations/factory.py +34 -0
  107. aethergraph/services/continuations/stores/fs_store.py +264 -0
  108. aethergraph/services/continuations/stores/inmem_store.py +95 -0
  109. aethergraph/services/eventbus/inmem.py +21 -0
  110. aethergraph/services/features/static.py +10 -0
  111. aethergraph/services/kv/ephemeral.py +90 -0
  112. aethergraph/services/kv/factory.py +27 -0
  113. aethergraph/services/kv/layered.py +41 -0
  114. aethergraph/services/kv/sqlite_kv.py +128 -0
  115. aethergraph/services/llm/factory.py +157 -0
  116. aethergraph/services/llm/generic_client.py +542 -0
  117. aethergraph/services/llm/providers.py +3 -0
  118. aethergraph/services/llm/service.py +105 -0
  119. aethergraph/services/logger/base.py +36 -0
  120. aethergraph/services/logger/compat.py +50 -0
  121. aethergraph/services/logger/formatters.py +106 -0
  122. aethergraph/services/logger/std.py +203 -0
  123. aethergraph/services/mcp/helpers.py +23 -0
  124. aethergraph/services/mcp/http_client.py +70 -0
  125. aethergraph/services/mcp/mcp_tools.py +21 -0
  126. aethergraph/services/mcp/registry.py +14 -0
  127. aethergraph/services/mcp/service.py +100 -0
  128. aethergraph/services/mcp/stdio_client.py +70 -0
  129. aethergraph/services/mcp/ws_client.py +115 -0
  130. aethergraph/services/memory/bound.py +106 -0
  131. aethergraph/services/memory/distillers/episode.py +116 -0
  132. aethergraph/services/memory/distillers/rolling.py +74 -0
  133. aethergraph/services/memory/facade.py +633 -0
  134. aethergraph/services/memory/factory.py +78 -0
  135. aethergraph/services/memory/hotlog_kv.py +27 -0
  136. aethergraph/services/memory/indices.py +74 -0
  137. aethergraph/services/memory/io_helpers.py +72 -0
  138. aethergraph/services/memory/persist_fs.py +40 -0
  139. aethergraph/services/memory/resolver.py +152 -0
  140. aethergraph/services/metering/noop.py +4 -0
  141. aethergraph/services/prompts/file_store.py +41 -0
  142. aethergraph/services/rag/chunker.py +29 -0
  143. aethergraph/services/rag/facade.py +593 -0
  144. aethergraph/services/rag/index/base.py +27 -0
  145. aethergraph/services/rag/index/faiss_index.py +121 -0
  146. aethergraph/services/rag/index/sqlite_index.py +134 -0
  147. aethergraph/services/rag/index_factory.py +52 -0
  148. aethergraph/services/rag/parsers/md.py +7 -0
  149. aethergraph/services/rag/parsers/pdf.py +14 -0
  150. aethergraph/services/rag/parsers/txt.py +7 -0
  151. aethergraph/services/rag/utils/hybrid.py +39 -0
  152. aethergraph/services/rag/utils/make_fs_key.py +62 -0
  153. aethergraph/services/redactor/simple.py +16 -0
  154. aethergraph/services/registry/key_parsing.py +44 -0
  155. aethergraph/services/registry/registry_key.py +19 -0
  156. aethergraph/services/registry/unified_registry.py +185 -0
  157. aethergraph/services/resume/multi_scheduler_resume_bus.py +65 -0
  158. aethergraph/services/resume/router.py +73 -0
  159. aethergraph/services/schedulers/registry.py +41 -0
  160. aethergraph/services/secrets/base.py +7 -0
  161. aethergraph/services/secrets/env.py +8 -0
  162. aethergraph/services/state_stores/externalize.py +135 -0
  163. aethergraph/services/state_stores/graph_observer.py +131 -0
  164. aethergraph/services/state_stores/json_store.py +67 -0
  165. aethergraph/services/state_stores/resume_policy.py +119 -0
  166. aethergraph/services/state_stores/serialize.py +249 -0
  167. aethergraph/services/state_stores/utils.py +91 -0
  168. aethergraph/services/state_stores/validate.py +78 -0
  169. aethergraph/services/tracing/noop.py +18 -0
  170. aethergraph/services/waits/wait_registry.py +91 -0
  171. aethergraph/services/wakeup/memory_queue.py +57 -0
  172. aethergraph/services/wakeup/scanner_producer.py +56 -0
  173. aethergraph/services/wakeup/worker.py +31 -0
  174. aethergraph/tools/__init__.py +25 -0
  175. aethergraph/utils/optdeps.py +8 -0
  176. aethergraph-0.1.0a1.dist-info/METADATA +410 -0
  177. aethergraph-0.1.0a1.dist-info/RECORD +182 -0
  178. aethergraph-0.1.0a1.dist-info/WHEEL +5 -0
  179. aethergraph-0.1.0a1.dist-info/entry_points.txt +2 -0
  180. aethergraph-0.1.0a1.dist-info/licenses/LICENSE +176 -0
  181. aethergraph-0.1.0a1.dist-info/licenses/NOTICE +31 -0
  182. aethergraph-0.1.0a1.dist-info/top_level.txt +1 -0
aethergraph/services/rag/facade.py
@@ -0,0 +1,593 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ import hashlib
+ import json
+ import os
+ import shutil
+ import time
+ from typing import Any
+
+ from aethergraph.contracts.services.llm import LLMClientProtocol
+
+ from .chunker import TextSplitter
+ from .utils.hybrid import topk_fuse
+ from .utils.make_fs_key import make_fs_key
+
+
+ @dataclass
+ class SearchHit:
+     """A single search hit from RAG retrieval."""
+
+     chunk_id: str
+     doc_id: str
+     corpus_id: str
+     score: float
+     text: str
+     meta: dict[str, Any]
+
+
+ def _now_iso() -> str:
+     return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+
+
+ def _stable_id(parts: dict[str, Any]) -> str:
+     blob = json.dumps(parts, sort_keys=True, ensure_ascii=False).encode("utf-8")
+     return hashlib.sha256(blob).hexdigest()[:24]
+
+
+ class RAGFacade:
+     """Facade for RAG operations: corpus management, document ingestion, retrieval, and QA."""
+
+     def __init__(
+         self,
+         *,
+         corpus_root: str,
+         artifacts,
+         embed_client: LLMClientProtocol,
+         llm_client: LLMClientProtocol,
+         index_backend,
+         chunker: TextSplitter,
+         logger=None,
+     ):
+         """Initialize RAGFacade with storage paths and service clients.
+         Args:
+             corpus_root: Root directory for storing corpora.
+             artifacts: Artifact storage facade.
+             embed_client: Embedding service client.
+             llm_client: LLM client used for question answering.
+             index_backend: Vector index backend.
+             chunker: TextSplitter instance for chunking documents.
+             logger: Optional logger for logging messages.
+         """
+         self.root = corpus_root
+         self.artifacts = artifacts
+         self.embed = embed_client
+         self.llm = llm_client
+         self.index = index_backend
+         self.chunker = chunker
+         self.logger = logger
+
+         # self.logger.info(f"RAGFacade initialized with corpus root: {self.root}, index: {type(self.index).__name__}, embed model: {getattr(self.embed, 'embed_model', None)}, llm model: {getattr(self.llm, 'model', None)}")
+
+     def set_llm_client(self, client: LLMClientProtocol) -> None:
+         """Set the LLM client to use for answering questions."""
+         assert client.model is not None, "RAG LLM client must have a model set"
+         assert client.embed_model is not None, "RAG LLM client must have an embedding model set"
+         self.llm = client
+         if self.logger:  # logger is optional, so guard before use
+             self.logger.info(
+                 f"RAG LLM client set to model: {self.llm.model}, embed model: {self.llm.embed_model}"
+             )
+
+     def set_index_backend(self, index_backend) -> None:
+         """Set the vector index backend."""
+         self.index = index_backend
+         if self.logger:
+             self.logger.info(f"RAG index backend set to: {type(self.index).__name__}")
+
+     def _cdir(self, corpus_id: str) -> str:
+         """Get the corpus directory for a corpus ID, ensuring the path works safely across operating systems.
+         Args:
+             corpus_id: Unique identifier for the corpus.
+         Returns:
+             Path to the corpus directory.
+         """
+
+         return os.path.join(self.root, make_fs_key(corpus_id))
+
+     # ---------- ingestion ----------
+     async def add_corpus(self, corpus_id: str, meta: dict[str, Any] | None = None):
+         """Create a new corpus with optional metadata.
+         Args:
+             corpus_id: Unique identifier for the corpus.
+             meta: Optional metadata dictionary to store with the corpus.
+         """
+         p = self._cdir(corpus_id)
+         os.makedirs(p, exist_ok=True)
+         meta_path = os.path.join(p, "corpus.json")
+         if not os.path.exists(meta_path):
+             with open(meta_path, "w", encoding="utf-8") as f:
+                 json.dump(
+                     {
+                         "corpus_id": corpus_id,
+                         "fs_key": make_fs_key(corpus_id),  # for reference
+                         "created_at": _now_iso(),
+                         "meta": meta or {},
+                     },
+                     f,
+                 )
+
+     async def upsert_docs(self, corpus_id: str, docs: list[dict[str, Any]]) -> dict[str, Any]:
+         """Ingest and index a list of documents into the specified corpus.
+         Args:
+             corpus_id: The target corpus identifier.
+             docs: A list of document specifications.
+
+         Docs can be specified as either:
+         - File-based documents: {"path": "/path/to/doc.pdf", "labels": {...}}
+         - Inline text documents: {"text": "Document content...", "title": "Doc Title", "labels": {...}}
+         """
+         if not self.embed:
+             raise RuntimeError("RAGFacade: embed client not configured")
+
+         await self.add_corpus(corpus_id)
+         cdir = self._cdir(corpus_id)
+         docs_jl = os.path.join(cdir, "docs.jsonl")
+         chunks_jl = os.path.join(cdir, "chunks.jsonl")
+         os.makedirs(cdir, exist_ok=True)
+
+         added_docs = 0
+         all_chunk_ids, all_vecs, all_metas = [], [], []
+         total_chunks = 0
+
+         for d in docs:
+             labels = d.get("labels", {})
+             title = d.get("title") or os.path.basename(d.get("path", "")) or "untitled"
+             doc_id = _stable_id({"title": title, "labels": labels, "ts": _now_iso()})
+             text = None
+             extra_meta = {}
+
+             if "path" in d and os.path.exists(d["path"]):
+                 # save original file into artifacts CAS and parse
+                 uri = await self.artifacts.save_file(
+                     path=d["path"],
+                     kind="doc",
+                     run_id="rag",
+                     graph_id="rag",
+                     node_id="rag",
+                     tool_name="rag.upsert",
+                     tool_version="0.1.0",
+                     labels=labels,
+                     cleanup=False,
+                 )
+                 path = d["path"].lower()
+                 if path.endswith(".pdf"):
+                     from .parsers.pdf import extract_text
+
+                     text, extra_meta = extract_text(d["path"])  # type: ignore
+                 elif path.endswith(".md") or path.endswith(".markdown") or path.endswith(".mkd"):
+                     from .parsers.md import extract_text
+
+                     text, extra_meta = extract_text(d["path"])  # type: ignore
+                 else:
+                     from .parsers.txt import extract_text
+
+                     text, extra_meta = extract_text(d["path"])  # type: ignore
+                 doc_uri = uri.uri if hasattr(uri, "uri") else uri
+             else:
+                 # inline text doc — persist as artifact first
+                 payload = d.get("text", "")
+                 uri = await self.artifacts.save_text(payload=payload)  # store as temp artifact
+                 doc_uri = uri.uri if hasattr(uri, "uri") else uri
+                 text = payload
+
+             text = (text or "").strip()
+             if not text:
+                 if self.logger:
+                     self.logger.warning(f"RAG: empty text for doc {title}")
+                 continue
+
+             # write doc record
+             with open(docs_jl, "a", encoding="utf-8") as f:
+                 f.write(
+                     json.dumps(
+                         {
+                             "doc_id": doc_id,
+                             "corpus_id": corpus_id,
+                             "uri": doc_uri,
+                             "title": title,
+                             "meta": {"labels": labels, **extra_meta},
+                             "created_at": _now_iso(),
+                         },
+                         ensure_ascii=False,
+                     )
+                     + "\n"
+                 )
+             added_docs += 1
+
+             # chunk + embed
+             chunks = self.chunker.split(text)
+             if not chunks:
+                 continue
+             # batch embed
+             vecs = await self.embed.embed(chunks)
+             for i, (chunk_text, vec) in enumerate(zip(chunks, vecs, strict=True)):
+                 chunk_id = _stable_id({"doc": doc_id, "i": i})
+                 meta = {"doc_id": doc_id, "title": title, "i": i, "labels": labels}
+                 # append chunk record
+                 with open(chunks_jl, "a", encoding="utf-8") as f:
+                     f.write(
+                         json.dumps(
+                             {
+                                 "chunk_id": chunk_id,
+                                 "doc_id": doc_id,
+                                 "corpus_id": corpus_id,
+                                 "text": chunk_text,
+                                 "meta": meta,
+                             },
+                             ensure_ascii=False,
+                         )
+                         + "\n"
+                     )
+                 all_chunk_ids.append(chunk_id)
+                 all_vecs.append(vec)
+                 all_metas.append({**meta})
+             total_chunks += len(chunks)
+
+         # add to index
+         if all_chunk_ids:
+             await self.index.add(corpus_id, all_chunk_ids, all_vecs, all_metas)
+
+         return {"added": added_docs, "chunks": total_chunks, "index": type(self.index).__name__}
+
+     # ---------- retrieval ----------
+     def _load_chunks_map(self, corpus_id: str) -> dict[str, dict[str, Any]]:
+         """Load chunk metadata for a given corpus."""
+         # Load latest chunk text+meta into a dict
+         cdir = self._cdir(corpus_id)
+         chunks_jl = os.path.join(cdir, "chunks.jsonl")
+         out = {}
+         if not os.path.exists(chunks_jl):
+             return out
+         with open(chunks_jl, encoding="utf-8") as f:
+             for line in f:
+                 if not line.strip():
+                     continue
+                 obj = json.loads(line)
+                 out[obj["chunk_id"]] = obj
+         return out
+
+     async def search(
+         self,
+         corpus_id: str,
+         query: str,
+         k: int = 8,
+         filters: dict[str, Any] | None = None,
+         mode: str = "hybrid",
+     ) -> list[SearchHit]:
+         """Search the corpus for relevant chunks given a query.
+         Args:
+             corpus_id: Target corpus identifier.
+             query: The search query string.
+             k: Number of top results to return.
+             filters: Optional metadata filters to apply.
+             mode: Search mode, either "dense" or "hybrid".
+         """
+         if not self.embed:
+             raise RuntimeError("RAGFacade: embed client not configured")
+
+         # dense search via index then optional lexical fusion
+         qvec = (await self.embed.embed([query]))[0]
+         dense_hits = await self.index.search(corpus_id, qvec, max(24, k))
+         chunks_map = self._load_chunks_map(corpus_id)
+         if mode == "dense" or not dense_hits:
+             dense_hits = dense_hits[:k]
+             return [
+                 SearchHit(
+                     chunk_id=h["chunk_id"],
+                     doc_id=chunks_map.get(h["chunk_id"], {}).get("doc_id", ""),
+                     corpus_id=corpus_id,
+                     score=h["score"],
+                     text=chunks_map.get(h["chunk_id"], {}).get("text", ""),
+                     meta=h.get("meta", {}),
+                 )
+                 for h in dense_hits
+             ]
+
+         fused = topk_fuse(
+             query, dense_hits, {cid: rec.get("text", "") for cid, rec in chunks_map.items()}, k
+         )
+         out = []
+         for h in fused:
+             rec = chunks_map.get(h["chunk_id"], {})
+             out.append(
+                 SearchHit(
+                     chunk_id=h["chunk_id"],
+                     doc_id=rec.get("doc_id", ""),
+                     corpus_id=corpus_id,
+                     score=h["score"],
+                     text=rec.get("text", ""),
+                     meta=h.get("meta", {}),
+                 )
+             )
+         return out
+
+     async def retrieve(
+         self, corpus_id: str, query: str, k: int = 6, rerank: bool = True
+     ) -> list[SearchHit]:
+         """Retrieve top-k relevant chunks for a query from the corpus.
+         Args:
+             corpus_id: Target corpus identifier.
+             query: The retrieval query string.
+             k: Number of top results to return.
+             rerank: Whether to rerank results using hybrid scoring.
+         """
+         # For now, rerank flag is ignored; fused hybrid already sorts reasonably.
+         return await self.search(corpus_id, query, k=k, mode="hybrid")
+
+     async def answer(
+         self,
+         corpus_id: str,
+         question: str,
+         *,
+         llm: LLMClientProtocol | None = None,
+         style: str = "concise",
+         with_citations: bool = True,
+         k: int = 6,
+     ) -> dict[str, Any]:
+         """Answer a question using retrieved context from the corpus.
+         Args:
+             corpus_id: Target corpus identifier.
+             question: The question to answer.
+             llm: Language model client for generating the answer. If None, uses default LLM.
+             style: Answering style - "concise" or "detailed".
+             with_citations: Whether to include citations in the answer.
+             k: Number of context chunks to retrieve.
+         """
+         if not llm:
+             # use default LLM client
+             llm = self.llm
+
+         hits = await self.retrieve(corpus_id, question, k=k, rerank=True)
+         context = "\n\n".join([f"[{i + 1}] {h.text}" for i, h in enumerate(hits)])
+         sys = "You answer strictly from the provided context. Cite chunk numbers like [1],[2]. If insufficient, say you don't know."
+         if style == "detailed":
+             sys += " Be structured and explain reasoning briefly."
+         usr = f"Question: {question}\n\nContext:\n{context}"
+         text, usage = await llm.chat(
+             [{"role": "system", "content": sys}, {"role": "user", "content": usr}]
+         )
+         out = {
+             "answer": text,
+             "citations": [
+                 {"chunk_id": h.chunk_id, "doc_id": h.doc_id, "rank": i + 1}
+                 for i, h in enumerate(hits)
+             ],
+             "usage": usage,
+         }
+         if with_citations:
+             out["resolved_citations"] = self.resolve_citations(corpus_id, out["citations"])
+         return out
+
+     def resolve_citations(self, corpus_id: str, citations: list[dict]) -> list[dict]:
+         """Return [{rank, doc_id, title, uri, chunk_id, snippet}] sorted by rank."""
+         # load chunks + doc meta
+         cdir = self._cdir(corpus_id)
+         chunks_jl = os.path.join(cdir, "chunks.jsonl")
+         docs_jl = os.path.join(cdir, "docs.jsonl")
+
+         # build maps
+         chunk_map, doc_map = {}, {}
+         if os.path.exists(chunks_jl):
+             with open(chunks_jl, encoding="utf-8") as f:
+                 for line in f:
+                     o = json.loads(line)
+                     chunk_map[o["chunk_id"]] = o
+         if os.path.exists(docs_jl):
+             with open(docs_jl, encoding="utf-8") as f:
+                 for line in f:
+                     o = json.loads(line)
+                     doc_map[o["doc_id"]] = o
+
+         out = []
+         for c in sorted(citations, key=lambda x: x["rank"]):
+             ch = chunk_map.get(c["chunk_id"], {})
+             dd = doc_map.get(c["doc_id"], {})
+             text = (ch.get("text") or "").strip().replace("\n", " ")
+             snippet = (text[:220] + "…") if len(text) > 220 else text
+             out.append(
+                 {
+                     "rank": c["rank"],
+                     "doc_id": c["doc_id"],
+                     "title": dd.get("title", "(untitled)"),
+                     "uri": dd.get("uri"),  # CAS or file URI from artifact store
+                     "chunk_id": c["chunk_id"],
+                     "snippet": snippet,
+                 }
+             )
+         return out
+
+     async def list_corpora(self) -> list[dict]:
+         out = []
+         for d in sorted(os.listdir(self.root)):
+             # cdir = self._cdir(d)
+             cdir = os.path.join(self.root, d)  # d is already fs_key
+             if not os.path.isdir(cdir):
+                 continue
+             meta_path = os.path.join(cdir, "corpus.json")
+             meta = {}
+             if os.path.exists(meta_path):
+                 try:
+                     with open(meta_path, encoding="utf-8") as f:
+                         meta = json.load(f)
+                 except Exception:
+                     meta = {}
+             # Prefer the recorded logical id (i.e. corpus_id); fall back to folder name (which may be fs-safe key)
+             logical_id = meta.get("corpus_id") or meta.get("logical_id") or d
+             out.append({"corpus_id": logical_id, "meta": meta})
+         return out
+
+     async def list_docs(
+         self, corpus_id: str, limit: int = 200, after: str | None = None
+     ) -> list[dict]:
+         cdir = self._cdir(corpus_id)
+         docs_jl = os.path.join(cdir, "docs.jsonl")
+         if not os.path.exists(docs_jl):
+             return []
+         acc: list[dict] = []
+         seen_after = after is None
+         with open(docs_jl, encoding="utf-8") as f:
+             for line in f:
+                 if not line.strip():
+                     continue
+                 obj = json.loads(line)
+                 if not seen_after:
+                     if obj.get("doc_id") == after:
+                         seen_after = True
+                     continue
+                 acc.append(obj)
+                 if len(acc) >= limit:
+                     break
+         return acc
+
+     async def delete_docs(self, corpus_id: str, doc_ids: list[str]) -> dict:
+         """
+         Removes docs from docs.jsonl and any chunks in chunks.jsonl; asks the index to drop vectors if supported.
+         """
+         cdir = self._cdir(corpus_id)
+         docs_jl = os.path.join(cdir, "docs.jsonl")
+         chunks_jl = os.path.join(cdir, "chunks.jsonl")
+         kept_docs, kept_chunks = [], []
+         removed_chunks = []
+         doc_set = set(doc_ids)
+
+         if os.path.exists(chunks_jl):
+             with open(chunks_jl, encoding="utf-8") as f:
+                 for line in f:
+                     if not line.strip():
+                         continue
+                     o = json.loads(line)
+                     if o.get("doc_id") in doc_set:
+                         removed_chunks.append(o.get("chunk_id"))
+                     else:
+                         kept_chunks.append(line)
+             with open(chunks_jl, "w", encoding="utf-8") as f:
+                 f.writelines(kept_chunks)
+
+         if os.path.exists(docs_jl):
+             with open(docs_jl, encoding="utf-8") as f:
+                 for line in f:
+                     if not line.strip():
+                         continue
+                     o = json.loads(line)
+                     if o.get("doc_id") not in doc_set:
+                         kept_docs.append(line)
+             with open(docs_jl, "w", encoding="utf-8") as f:
+                 f.writelines(kept_docs)
+
+         # drop from index if supported
+         if hasattr(self.index, "remove"):
+             await self.index.remove(corpus_id, removed_chunks)
+         elif hasattr(self.index, "delete"):
+             await self.index.delete(corpus_id, removed_chunks)
+
+         return {"removed_docs": len(doc_ids), "removed_chunks": len(removed_chunks)}
+
+     async def reembed(
+         self, corpus_id: str, *, doc_ids: list[str] | None = None, batch: int = 64
+     ) -> dict:
+         """
+         Re-embeds selected docs (or all) and re-adds their vectors using the configured embed client.
+         """
+         cdir = self._cdir(corpus_id)
+         chunks_jl = os.path.join(cdir, "chunks.jsonl")
+         if not os.path.exists(chunks_jl):
+             return {"reembedded": 0}
+
+         targets: list[dict] = []
+         with open(chunks_jl, encoding="utf-8") as f:
+             for line in f:
+                 if not line.strip():
+                     continue
+                 o = json.loads(line)
+                 if doc_ids is None or o.get("doc_id") in set(doc_ids):
+                     targets.append(o)
+
+         # Model overrides are not wired up yet; always use the configured embed client.
+         embed = self.embed
+
+         # Re-embed in batches
+         added = 0
+         for i in range(0, len(targets), batch):
+             batch_ch = targets[i : i + batch]
+             vecs = await embed.embed([t["text"] for t in batch_ch])
+             chunk_ids = [t["chunk_id"] for t in batch_ch]
+             metas = [t.get("meta", {}) for t in batch_ch]
+             await self.index.add(corpus_id, chunk_ids, vecs, metas)
+             added += len(batch_ch)
+         return {"reembedded": added, "model": getattr(embed, "embed_model", None)}
+
+     async def stats(self, corpus_id: str) -> dict:
+         cdir = self._cdir(corpus_id)
+         docs_jl = os.path.join(cdir, "docs.jsonl")
+         chunks_jl = os.path.join(cdir, "chunks.jsonl")
+
+         def _count_lines(path: str) -> int:
+             if not os.path.exists(path):
+                 return 0
+             with open(path, encoding="utf-8") as f:
+                 return sum(1 for _ in f)
+
+         n_docs = _count_lines(docs_jl)
+         n_chunks = _count_lines(chunks_jl)
+
+         meta = {}
+         meta_path = os.path.join(cdir, "corpus.json")
+         if os.path.exists(meta_path):
+             try:
+                 with open(meta_path, encoding="utf-8") as f:
+                     meta = json.load(f)
+             except Exception:
+                 meta = {}
+         return {"corpus_id": corpus_id, "docs": n_docs, "chunks": n_chunks, "meta": meta}
+
+     async def export(self, corpus_id: str) -> dict:
+         """
+         Create a simple tarball-like directory bundle (docs+chunks+corpus.json) and persist via artifacts store.
+         """
+         raise NotImplementedError("RAGFacade.export is not yet implemented")
+         # TODO: implement proper temp dir cleanup
+         # The code below is unreachable until the raise is removed; it sketches the intended implementation.
+         cdir = self._cdir(corpus_id)
+         bundle_dir = os.path.join(cdir, f"bundle_{_now_iso().replace(':', '').replace('-', '')}")
+         os.makedirs(bundle_dir, exist_ok=True)
+         for name in ("corpus.json", "docs.jsonl", "chunks.jsonl"):
+             p = os.path.join(cdir, name)
+             if os.path.exists(p):
+                 shutil.copy2(p, os.path.join(bundle_dir, name))
+         # Save dir via artifacts as a bundle
+         uri = await self.artifacts.save_dir(bundle_dir, labels={"corpus_id": corpus_id})
+         return {"uri": getattr(uri, "uri", uri)}
+
+     async def import_bundle(self, bundle_uri: str, into_corpus: str | None = None) -> dict:
+         """
+         Resolve artifact dir and merge into an existing/new corpus.
+         """
+         raise NotImplementedError("RAGFacade.import_bundle is not yet implemented")
+         # TODO: implement proper temp dir cleanup
+         # The code below is unreachable until the raise is removed; it sketches the intended implementation.
+         # Assuming artifacts can resolve a dir path from URI
+         bundle_path = await self.artifacts.resolve_dir(bundle_uri)
+         with open(os.path.join(bundle_path, "corpus.json"), encoding="utf-8") as f:
+             meta = json.load(f)
+         target = into_corpus or meta.get("corpus_id")
+         await self.add_corpus(target, meta=meta.get("meta", {}))
+
+         # Append docs & chunks
+         for name in ("docs.jsonl", "chunks.jsonl"):
+             src = os.path.join(bundle_path, name)
+             if not os.path.exists(src):
+                 continue
+             dst = os.path.join(self._cdir(target), name)
+             os.makedirs(os.path.dirname(dst), exist_ok=True)
+             with open(dst, "a", encoding="utf-8") as out_f, open(src, encoding="utf-8") as in_f:
+                 for line in in_f:
+                     if line.strip():
+                         out_f.write(line)
+         return {"imported_into": target}
aethergraph/services/rag/index/base.py
@@ -0,0 +1,27 @@
+ from __future__ import annotations
+
+ from typing import Any
+
+
+ class VectorIndex:
+     def __init__(self, index_path: str):
+         self.index_path = index_path
+
+     async def add(
+         self,
+         corpus_id: str,
+         chunk_ids: list[str],
+         vectors: list[list[float]],
+         metas: list[dict[str, Any]],
+     ):
+         raise NotImplementedError
+
+     async def delete(self, corpus_id: str, chunk_ids: list[str] | None = None):
+         raise NotImplementedError
+
+     async def search(self, corpus_id: str, query_vec: list[float], k: int) -> list[dict[str, Any]]:
+         """Return a list of {chunk_id, score, meta} dicts sorted by descending score."""
+         raise NotImplementedError
+
+     async def list_chunks(self, corpus_id: str) -> list[str]:
+         raise NotImplementedError
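
VectorIndex above is the abstract async contract that the faiss_index.py and sqlite_index.py backends in the file list implement. As a hedged illustration of the expected shapes (not code from the package), a toy in-memory subclass could look like the following; cosine similarity and the dict layout are assumptions:

    import math
    from typing import Any

    class InMemoryIndex(VectorIndex):
        """Illustrative only: cosine-similarity search over vectors held in a dict."""

        def __init__(self, index_path: str = ":memory:"):
            super().__init__(index_path)
            self._rows: dict[str, list[tuple[str, list[float], dict[str, Any]]]] = {}

        async def add(self, corpus_id, chunk_ids, vectors, metas):
            self._rows.setdefault(corpus_id, []).extend(zip(chunk_ids, vectors, metas))

        async def delete(self, corpus_id, chunk_ids=None):
            if chunk_ids is None:
                self._rows.pop(corpus_id, None)  # drop the whole corpus
            else:
                drop = set(chunk_ids)
                self._rows[corpus_id] = [r for r in self._rows.get(corpus_id, []) if r[0] not in drop]

        async def search(self, corpus_id, query_vec, k):
            def cosine(a, b):
                norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
                return sum(x * y for x, y in zip(a, b)) / norm if norm else 0.0

            ranked = sorted(self._rows.get(corpus_id, []), key=lambda r: cosine(r[1], query_vec), reverse=True)
            # Same {chunk_id, score, meta} shape that RAGFacade.search() consumes.
            return [{"chunk_id": cid, "score": cosine(vec, query_vec), "meta": meta} for cid, vec, meta in ranked[:k]]

        async def list_chunks(self, corpus_id):
            return [cid for cid, _, _ in self._rows.get(corpus_id, [])]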