aethergraph 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aethergraph/__init__.py +49 -0
- aethergraph/config/__init__.py +0 -0
- aethergraph/config/config.py +121 -0
- aethergraph/config/context.py +16 -0
- aethergraph/config/llm.py +26 -0
- aethergraph/config/loader.py +60 -0
- aethergraph/config/runtime.py +9 -0
- aethergraph/contracts/errors/errors.py +44 -0
- aethergraph/contracts/services/artifacts.py +142 -0
- aethergraph/contracts/services/channel.py +72 -0
- aethergraph/contracts/services/continuations.py +23 -0
- aethergraph/contracts/services/eventbus.py +12 -0
- aethergraph/contracts/services/kv.py +24 -0
- aethergraph/contracts/services/llm.py +17 -0
- aethergraph/contracts/services/mcp.py +22 -0
- aethergraph/contracts/services/memory.py +108 -0
- aethergraph/contracts/services/resume.py +28 -0
- aethergraph/contracts/services/state_stores.py +33 -0
- aethergraph/contracts/services/wakeup.py +28 -0
- aethergraph/core/execution/base_scheduler.py +77 -0
- aethergraph/core/execution/forward_scheduler.py +777 -0
- aethergraph/core/execution/global_scheduler.py +634 -0
- aethergraph/core/execution/retry_policy.py +22 -0
- aethergraph/core/execution/step_forward.py +411 -0
- aethergraph/core/execution/step_result.py +18 -0
- aethergraph/core/execution/wait_types.py +72 -0
- aethergraph/core/graph/graph_builder.py +192 -0
- aethergraph/core/graph/graph_fn.py +219 -0
- aethergraph/core/graph/graph_io.py +67 -0
- aethergraph/core/graph/graph_refs.py +154 -0
- aethergraph/core/graph/graph_spec.py +115 -0
- aethergraph/core/graph/graph_state.py +59 -0
- aethergraph/core/graph/graphify.py +128 -0
- aethergraph/core/graph/interpreter.py +145 -0
- aethergraph/core/graph/node_handle.py +33 -0
- aethergraph/core/graph/node_spec.py +46 -0
- aethergraph/core/graph/node_state.py +63 -0
- aethergraph/core/graph/task_graph.py +747 -0
- aethergraph/core/graph/task_node.py +82 -0
- aethergraph/core/graph/utils.py +37 -0
- aethergraph/core/graph/visualize.py +239 -0
- aethergraph/core/runtime/ad_hoc_context.py +61 -0
- aethergraph/core/runtime/base_service.py +153 -0
- aethergraph/core/runtime/bind_adapter.py +42 -0
- aethergraph/core/runtime/bound_memory.py +69 -0
- aethergraph/core/runtime/execution_context.py +220 -0
- aethergraph/core/runtime/graph_runner.py +349 -0
- aethergraph/core/runtime/lifecycle.py +26 -0
- aethergraph/core/runtime/node_context.py +203 -0
- aethergraph/core/runtime/node_services.py +30 -0
- aethergraph/core/runtime/recovery.py +159 -0
- aethergraph/core/runtime/run_registration.py +33 -0
- aethergraph/core/runtime/runtime_env.py +157 -0
- aethergraph/core/runtime/runtime_registry.py +32 -0
- aethergraph/core/runtime/runtime_services.py +224 -0
- aethergraph/core/runtime/wakeup_watcher.py +40 -0
- aethergraph/core/tools/__init__.py +10 -0
- aethergraph/core/tools/builtins/channel_tools.py +194 -0
- aethergraph/core/tools/builtins/toolset.py +134 -0
- aethergraph/core/tools/toolkit.py +510 -0
- aethergraph/core/tools/waitable.py +109 -0
- aethergraph/plugins/channel/__init__.py +0 -0
- aethergraph/plugins/channel/adapters/__init__.py +0 -0
- aethergraph/plugins/channel/adapters/console.py +106 -0
- aethergraph/plugins/channel/adapters/file.py +102 -0
- aethergraph/plugins/channel/adapters/slack.py +285 -0
- aethergraph/plugins/channel/adapters/telegram.py +302 -0
- aethergraph/plugins/channel/adapters/webhook.py +104 -0
- aethergraph/plugins/channel/adapters/webui.py +134 -0
- aethergraph/plugins/channel/routes/__init__.py +0 -0
- aethergraph/plugins/channel/routes/console_routes.py +86 -0
- aethergraph/plugins/channel/routes/slack_routes.py +49 -0
- aethergraph/plugins/channel/routes/telegram_routes.py +26 -0
- aethergraph/plugins/channel/routes/webui_routes.py +136 -0
- aethergraph/plugins/channel/utils/__init__.py +0 -0
- aethergraph/plugins/channel/utils/slack_utils.py +278 -0
- aethergraph/plugins/channel/utils/telegram_utils.py +324 -0
- aethergraph/plugins/channel/websockets/slack_ws.py +68 -0
- aethergraph/plugins/channel/websockets/telegram_polling.py +151 -0
- aethergraph/plugins/mcp/fs_server.py +128 -0
- aethergraph/plugins/mcp/http_server.py +101 -0
- aethergraph/plugins/mcp/ws_server.py +180 -0
- aethergraph/plugins/net/http.py +10 -0
- aethergraph/plugins/utils/data_io.py +359 -0
- aethergraph/runner/__init__.py +5 -0
- aethergraph/runtime/__init__.py +62 -0
- aethergraph/server/__init__.py +3 -0
- aethergraph/server/app_factory.py +84 -0
- aethergraph/server/start.py +122 -0
- aethergraph/services/__init__.py +10 -0
- aethergraph/services/artifacts/facade.py +284 -0
- aethergraph/services/artifacts/factory.py +35 -0
- aethergraph/services/artifacts/fs_store.py +656 -0
- aethergraph/services/artifacts/jsonl_index.py +123 -0
- aethergraph/services/artifacts/paths.py +23 -0
- aethergraph/services/artifacts/sqlite_index.py +209 -0
- aethergraph/services/artifacts/utils.py +124 -0
- aethergraph/services/auth/dev.py +16 -0
- aethergraph/services/channel/channel_bus.py +293 -0
- aethergraph/services/channel/factory.py +44 -0
- aethergraph/services/channel/session.py +511 -0
- aethergraph/services/channel/wait_helpers.py +57 -0
- aethergraph/services/clock/clock.py +9 -0
- aethergraph/services/container/default_container.py +320 -0
- aethergraph/services/continuations/continuation.py +56 -0
- aethergraph/services/continuations/factory.py +34 -0
- aethergraph/services/continuations/stores/fs_store.py +264 -0
- aethergraph/services/continuations/stores/inmem_store.py +95 -0
- aethergraph/services/eventbus/inmem.py +21 -0
- aethergraph/services/features/static.py +10 -0
- aethergraph/services/kv/ephemeral.py +90 -0
- aethergraph/services/kv/factory.py +27 -0
- aethergraph/services/kv/layered.py +41 -0
- aethergraph/services/kv/sqlite_kv.py +128 -0
- aethergraph/services/llm/factory.py +157 -0
- aethergraph/services/llm/generic_client.py +542 -0
- aethergraph/services/llm/providers.py +3 -0
- aethergraph/services/llm/service.py +105 -0
- aethergraph/services/logger/base.py +36 -0
- aethergraph/services/logger/compat.py +50 -0
- aethergraph/services/logger/formatters.py +106 -0
- aethergraph/services/logger/std.py +203 -0
- aethergraph/services/mcp/helpers.py +23 -0
- aethergraph/services/mcp/http_client.py +70 -0
- aethergraph/services/mcp/mcp_tools.py +21 -0
- aethergraph/services/mcp/registry.py +14 -0
- aethergraph/services/mcp/service.py +100 -0
- aethergraph/services/mcp/stdio_client.py +70 -0
- aethergraph/services/mcp/ws_client.py +115 -0
- aethergraph/services/memory/bound.py +106 -0
- aethergraph/services/memory/distillers/episode.py +116 -0
- aethergraph/services/memory/distillers/rolling.py +74 -0
- aethergraph/services/memory/facade.py +633 -0
- aethergraph/services/memory/factory.py +78 -0
- aethergraph/services/memory/hotlog_kv.py +27 -0
- aethergraph/services/memory/indices.py +74 -0
- aethergraph/services/memory/io_helpers.py +72 -0
- aethergraph/services/memory/persist_fs.py +40 -0
- aethergraph/services/memory/resolver.py +152 -0
- aethergraph/services/metering/noop.py +4 -0
- aethergraph/services/prompts/file_store.py +41 -0
- aethergraph/services/rag/chunker.py +29 -0
- aethergraph/services/rag/facade.py +593 -0
- aethergraph/services/rag/index/base.py +27 -0
- aethergraph/services/rag/index/faiss_index.py +121 -0
- aethergraph/services/rag/index/sqlite_index.py +134 -0
- aethergraph/services/rag/index_factory.py +52 -0
- aethergraph/services/rag/parsers/md.py +7 -0
- aethergraph/services/rag/parsers/pdf.py +14 -0
- aethergraph/services/rag/parsers/txt.py +7 -0
- aethergraph/services/rag/utils/hybrid.py +39 -0
- aethergraph/services/rag/utils/make_fs_key.py +62 -0
- aethergraph/services/redactor/simple.py +16 -0
- aethergraph/services/registry/key_parsing.py +44 -0
- aethergraph/services/registry/registry_key.py +19 -0
- aethergraph/services/registry/unified_registry.py +185 -0
- aethergraph/services/resume/multi_scheduler_resume_bus.py +65 -0
- aethergraph/services/resume/router.py +73 -0
- aethergraph/services/schedulers/registry.py +41 -0
- aethergraph/services/secrets/base.py +7 -0
- aethergraph/services/secrets/env.py +8 -0
- aethergraph/services/state_stores/externalize.py +135 -0
- aethergraph/services/state_stores/graph_observer.py +131 -0
- aethergraph/services/state_stores/json_store.py +67 -0
- aethergraph/services/state_stores/resume_policy.py +119 -0
- aethergraph/services/state_stores/serialize.py +249 -0
- aethergraph/services/state_stores/utils.py +91 -0
- aethergraph/services/state_stores/validate.py +78 -0
- aethergraph/services/tracing/noop.py +18 -0
- aethergraph/services/waits/wait_registry.py +91 -0
- aethergraph/services/wakeup/memory_queue.py +57 -0
- aethergraph/services/wakeup/scanner_producer.py +56 -0
- aethergraph/services/wakeup/worker.py +31 -0
- aethergraph/tools/__init__.py +25 -0
- aethergraph/utils/optdeps.py +8 -0
- aethergraph-0.1.0a1.dist-info/METADATA +410 -0
- aethergraph-0.1.0a1.dist-info/RECORD +182 -0
- aethergraph-0.1.0a1.dist-info/WHEEL +5 -0
- aethergraph-0.1.0a1.dist-info/entry_points.txt +2 -0
- aethergraph-0.1.0a1.dist-info/licenses/LICENSE +176 -0
- aethergraph-0.1.0a1.dist-info/licenses/NOTICE +31 -0
- aethergraph-0.1.0a1.dist-info/top_level.txt +1 -0

aethergraph/services/rag/index/faiss_index.py
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+import os
+import pickle
+from typing import Any
+
+import numpy as np
+
+try:
+    import faiss
+except Exception:
+    faiss = None
+
+from .base import VectorIndex
+
+"""A simple FAISS index per corpus (inner product on normalized vectors = cosine).
+Persists one FAISS index file per corpus, with chunk metadata pickled alongside it.
+"""
+
+
+class FAISSVectorIndex(VectorIndex):
+    """A simple FAISS index per corpus (inner product on normalized vectors = cosine)."""
+
+    def __init__(self, index_path: str, dim: int | None = None):
+        super().__init__(index_path)
+        self.dim = dim  # optional default; will infer on first add
+        os.makedirs(index_path, exist_ok=True)
+
+    def _paths(self, corpus_id: str):
+        base = os.path.join(self.index_path, corpus_id)
+        return base + ".index", base + ".meta.pkl"
+
+    def _load(self, corpus_id: str):
+        idx_path, meta_path = self._paths(corpus_id)
+        if not (os.path.exists(idx_path) and os.path.exists(meta_path)):
+            return None, []
+        if faiss is None:
+            raise RuntimeError("FAISS not installed")
+        index = faiss.read_index(idx_path)
+        with open(meta_path, "rb") as f:
+            metas = pickle.load(f)
+        return index, metas
+
+    def _save(self, corpus_id: str, index, metas):
+        idx_path, meta_path = self._paths(corpus_id)
+        if faiss is None:
+            raise RuntimeError("FAISS not installed")
+        faiss.write_index(index, idx_path)
+        with open(meta_path, "wb") as f:
+            pickle.dump(metas, f)
+
+    async def add(
+        self,
+        corpus_id: str,
+        chunk_ids: list[str],
+        vectors: list[list[float]],
+        metas: list[dict[str, Any]],
+    ):
+        if faiss is None:
+            raise RuntimeError("FAISS not installed")
+        vecs = np.asarray(vectors, dtype=np.float32)
+        # normalize for cosine
+        norms = np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-9
+        vecs = vecs / norms
+        d = vecs.shape[1]
+        index, old_metas = self._load(corpus_id)
+        if index is None:
+            index = faiss.IndexFlatIP(d)  # cosine via normalized dot
+            old_metas = []
+        index.add(vecs)
+        for cid, m in zip(chunk_ids, metas, strict=True):
+            old_metas.append({"chunk_id": cid, "meta": m})
+        self._save(corpus_id, index, old_metas)
+
+    async def delete(self, corpus_id: str, chunk_ids: list[str] | None = None):
+        # Simple approach: rebuild if filtering; or delete entire corpus.
+        if not chunk_ids:
+            idx_path, meta_path = self._paths(corpus_id)
+            for p in (idx_path, meta_path):
+                if os.path.exists(p):
+                    os.remove(p)
+        else:
+            index, metas = self._load(corpus_id)
+            if index is None:
+                return
+            # Rebuild without those ids
+            keep = [i for i, m in enumerate(metas) if m["chunk_id"] not in set(chunk_ids)]
+            if not keep:
+                await self.delete(corpus_id, None)
+                return
+            # Need stored vectors to rebuild — this simple implementation does not persist them.
+            # In production, persist vectors or recompute from text.
+            raise NotImplementedError(
+                "Selective delete requires stored vectors; not implemented here."
+            )
+
+    async def list_chunks(self, corpus_id: str) -> list[str]:
+        _, metas = self._load(corpus_id)
+        return [m["chunk_id"] for m in metas] if metas else []
+
+    async def search(self, corpus_id: str, query_vec: list[float], k: int):
+        if faiss is None:
+            raise RuntimeError("FAISS not installed")
+        index, metas = self._load(corpus_id)
+        if index is None:
+            return []
+        q = np.asarray([query_vec], dtype=np.float32)
+        q = q / (np.linalg.norm(q, axis=1, keepdims=True) + 1e-9)
+        D, I = index.search(q, k)  # noqa: E741
+        out = []
+        for score, idx in zip(D[0].tolist(), I[0].tolist(), strict=True):
+            if idx < 0 or idx >= len(metas):
+                continue
+            out.append(
+                {
+                    "chunk_id": metas[idx]["chunk_id"],
+                    "score": float(score),
+                    "meta": metas[idx]["meta"],
+                }
+            )
+        return out
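For orientation, a minimal usage sketch of the FAISSVectorIndex added above (toy 4-dimensional vectors and a hypothetical index directory; assumes numpy and faiss-cpu are installed, importing by the file path from the listing):

import asyncio

from aethergraph.services.rag.index.faiss_index import FAISSVectorIndex

async def demo():
    idx = FAISSVectorIndex("./rag_index/faiss")  # hypothetical directory
    # Vectors are L2-normalized inside add(), so only direction matters.
    await idx.add(
        "corpus-1",
        chunk_ids=["c1", "c2"],
        vectors=[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]],
        metas=[{"doc": "a"}, {"doc": "b"}],
    )
    hits = await idx.search("corpus-1", [1.0, 0.1, 0.0, 0.0], k=1)
    print(hits)  # [{"chunk_id": "c1", "score": ..., "meta": {"doc": "a"}}]

asyncio.run(demo())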

aethergraph/services/rag/index/sqlite_index.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import json
+import os
+import sqlite3
+from typing import Any
+
+import numpy as np
+
+from .base import VectorIndex
+
+"""A simple SQLite-based vector index per corpus (brute-force cosine similarity).
+Stores vectors as BLOBs along with metadata in a simple schema.
+"""
+
+
+SCHEMA = """
+CREATE TABLE IF NOT EXISTS chunks (
+  corpus_id TEXT,
+  chunk_id TEXT,
+  meta_json TEXT,
+  PRIMARY KEY (corpus_id, chunk_id)
+);
+CREATE TABLE IF NOT EXISTS embeddings (
+  corpus_id TEXT,
+  chunk_id TEXT,
+  vec BLOB,  -- np.float32 array bytes
+  norm REAL,
+  PRIMARY KEY (corpus_id, chunk_id)
+);
+"""
+
+
+def _ensure_db(path: str):
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    conn = sqlite3.connect(path)
+    try:
+        for stmt in SCHEMA.strip().split(";\n"):
+            s = stmt.strip()
+            if s:
+                conn.execute(s)
+        conn.commit()
+    finally:
+        conn.close()
+
+
+class SQLiteVectorIndex(VectorIndex):
+    def __init__(self, index_path: str):
+        super().__init__(index_path)
+        self.db_path = os.path.join(index_path, "index.sqlite")
+        _ensure_db(self.db_path)
+
+    def _connect(self):
+        return sqlite3.connect(self.db_path)
+
+    async def add(
+        self,
+        corpus_id: str,
+        chunk_ids: list[str],
+        vectors: list[list[float]],
+        metas: list[dict[str, Any]],
+    ):
+        conn = self._connect()
+        try:
+            cur = conn.cursor()
+            for cid, vec, meta in zip(chunk_ids, vectors, metas, strict=True):
+                v = np.asarray(vec, dtype=np.float32)
+                norm = float(np.linalg.norm(v) + 1e-9)
+                cur.execute(
+                    "REPLACE INTO chunks(corpus_id,chunk_id,meta_json) VALUES(?,?,?)",
+                    (corpus_id, cid, json.dumps(meta, ensure_ascii=False)),
+                )
+                cur.execute(
+                    "REPLACE INTO embeddings(corpus_id,chunk_id,vec,norm) VALUES(?,?,?,?)",
+                    (corpus_id, cid, v.tobytes(), norm),
+                )
+            conn.commit()
+        finally:
+            conn.close()
+
+    async def delete(self, corpus_id: str, chunk_ids: list[str] | None = None):
+        conn = self._connect()
+        try:
+            cur = conn.cursor()
+            if chunk_ids:
+                q = f"DELETE FROM chunks WHERE corpus_id=? AND chunk_id IN ({','.join(['?'] * len(chunk_ids))})"
+                cur.execute(q, [corpus_id, *chunk_ids])
+                q2 = f"DELETE FROM embeddings WHERE corpus_id=? AND chunk_id IN ({','.join(['?'] * len(chunk_ids))})"
+                cur.execute(q2, [corpus_id, *chunk_ids])
+            else:
+                cur.execute("DELETE FROM chunks WHERE corpus_id=?", (corpus_id,))
+                cur.execute("DELETE FROM embeddings WHERE corpus_id=?", (corpus_id,))
+            conn.commit()
+        finally:
+            conn.close()
+
+    async def list_chunks(self, corpus_id: str) -> list[str]:
+        conn = self._connect()
+        try:
+            cur = conn.cursor()
+            cur.execute("SELECT chunk_id FROM chunks WHERE corpus_id=?", (corpus_id,))
+            return [r[0] for r in cur.fetchall()]
+        finally:
+            conn.close()
+
+    async def search(
+        self, corpus_id: str, query_vec: list[float], k: int
+    ) -> list[dict[str, Any]]:  # Brute-force cosine similarity. Loads vectors for that corpus.
+        q = np.asarray(query_vec, dtype=np.float32)
+        qn = float(np.linalg.norm(q) + 1e-9)
+
+        conn = self._connect()
+        try:
+            cur = conn.cursor()
+            cur.execute(
+                "SELECT e.chunk_id, e.vec, e.norm, c.meta_json FROM embeddings e JOIN chunks c USING(corpus_id,chunk_id) WHERE e.corpus_id=?",
+                (corpus_id,),
+            )
+            rows = cur.fetchall()
+        finally:
+            conn.close()
+
+        scores = []
+        for chunk_id, vec_bytes, norm, meta_json in rows:
+            v = np.frombuffer(vec_bytes, dtype=np.float32)
+            score = float(np.dot(q, v) / (qn * norm))
+            scores.append((score, chunk_id, meta_json))
+
+        scores.sort(reverse=True, key=lambda x: x[0])
+        top = scores[:k]
+        out = []
+        for score, chunk_id, meta_json in top:
+            out.append({"chunk_id": chunk_id, "score": score, "meta": json.loads(meta_json)})
+        return out
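The BLOB round trip and brute-force scoring in SQLiteVectorIndex.search can be checked in isolation; a numpy-only sketch mirroring the stored norm column and the dot-product math (values chosen so the norm is exactly 5):

import numpy as np

v = np.asarray([3.0, 4.0], dtype=np.float32)
norm = float(np.linalg.norm(v) + 1e-9)            # 5.0, the stored `norm` column
blob = v.tobytes()                                # the `vec` BLOB payload

q = np.asarray([3.0, 4.0], dtype=np.float32)
qn = float(np.linalg.norm(q) + 1e-9)
restored = np.frombuffer(blob, dtype=np.float32)  # exact float32 round trip
score = float(np.dot(q, restored) / (qn * norm))  # cosine similarity, ~1.0 here
print(score)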

aethergraph/services/rag/index_factory.py
@@ -0,0 +1,52 @@
+import logging
+from pathlib import Path
+
+from aethergraph.utils.optdeps import require
+
+logger = logging.getLogger("aethergraph.rag.index_factory")
+
+
+def _default_index_path(root: str, backend: str) -> str:
+    base = Path(root) / "rag_index"
+    if backend == "faiss":
+        return str(base / "faiss.index")
+    return str(base / "sqlite.index")
+
+
+def create_vector_index(
+    *, backend: str, index_path: str | None, dim: int | None, root: str = "./aethergraph_data/rag"
+):
+    """
+    Create a vector index instance. Supported backends: 'sqlite', 'faiss'.
+    Falls back to 'sqlite' if FAISS is unavailable.
+    """
+    backend = (backend or "sqlite").lower()
+    if backend not in {"sqlite", "faiss"}:
+        logger.warning(f"Unknown RAG backend {backend!r}; falling back to sqlite.")
+        backend = "sqlite"
+
+    if backend == "faiss":
+        # try FAISS, fall back to sqlite with a warning
+        try:
+            require("faiss", "faiss")  # faiss-cpu exposes module 'faiss'
+            from .index.faiss_index import FAISSVectorIndex
+
+            path = (
+                str(Path(index_path) / "faiss")
+                if index_path is not None
+                else _default_index_path(root, "faiss")
+            )
+            return FAISSVectorIndex(path, dim=dim)
+        except Exception as e:
+            logger.warning(f"FAISS backend unavailable ({e}); falling back to sqlite.")
+            backend = "sqlite"
+
+    # sqlite (default)
+    from .index.sqlite_index import SQLiteVectorIndex
+
+    path = (
+        str(Path(index_path) / "sqlite")
+        if index_path is not None
+        else _default_index_path(root, "sqlite")
+    )
+    return SQLiteVectorIndex(path)
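A usage sketch for the factory (all arguments are keyword-only as defined above; the index directory is hypothetical):

from aethergraph.services.rag.index_factory import create_vector_index

# Default sqlite backend under the package's default root.
idx = create_vector_index(backend="sqlite", index_path=None, dim=None)

# Requesting faiss degrades to sqlite with a logged warning when faiss-cpu is missing.
idx2 = create_vector_index(backend="faiss", index_path="./my_index", dim=768)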

aethergraph/services/rag/parsers/pdf.py
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+from pypdf import PdfReader
+
+
+def extract_text(path: str) -> tuple[str, dict]:
+    reader = PdfReader(path)
+    texts = []
+    for page in reader.pages:
+        try:
+            texts.append(page.extract_text() or "")
+        except Exception:
+            continue
+    return "\n\n".join(texts), {"pages": len(reader.pages)}
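Usage is a single call (hypothetical "report.pdf"; requires the pypdf dependency):

from aethergraph.services.rag.parsers.pdf import extract_text

text, meta = extract_text("report.pdf")
print(meta["pages"], len(text))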

aethergraph/services/rag/utils/hybrid.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from collections import Counter
+import re
+from typing import Any
+
+
+def lexical_score(query: str, text: str) -> float:
+    # Extremely lightweight bag-of-words match score.
+    def ws(s: str) -> list[str]:
+        return re.findall(r"\w+", s.lower())
+
+    q = ws(query)
+    t = ws(text)
+    if not q or not t:
+        return 0.0
+    cq = Counter(q)
+    ct = Counter(t)
+    # normalized term overlap
+    overlap = sum(min(cq[w], ct.get(w, 0)) for w in cq)
+    return overlap / (sum(cq.values()) + 1e-9)
+
+
+def fuse_scores(dense_score: float, lexical: float, alpha: float = 0.8) -> float:
+    # Linear fusion; alpha favors dense similarity.
+    return alpha * dense_score + (1.0 - alpha) * lexical
+
+
+def topk_fuse(
+    query: str, dense_hits: list[dict[str, Any]], chunk_lookup: dict[str, str], k: int
+) -> list[dict[str, Any]]:
+    out = []
+    for h in dense_hits:
+        txt = chunk_lookup.get(h["chunk_id"], "")
+        lex = lexical_score(query, txt)
+        fused = fuse_scores(h.get("score", 0.0), lex)
+        out.append({**h, "score": fused})
+    out.sort(key=lambda x: x["score"], reverse=True)
+    return out[:k]
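Worked numbers for the fusion above (default alpha=0.8): both query terms occur in the text, so the lexical score is ~1.0, and a dense score of 0.9 fuses to 0.8 * 0.9 + 0.2 * 1.0 = 0.92:

from aethergraph.services.rag.utils.hybrid import fuse_scores, lexical_score

lex = lexical_score("vector search", "a vector index supports vector search")  # ~1.0
fused = fuse_scores(0.9, lex)  # ~0.92
print(lex, fused)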

aethergraph/services/rag/utils/make_fs_key.py
@@ -0,0 +1,62 @@
+import base64
+import hashlib
+import re
+
+# Windows forbidden characters and device names
+_INVALID_CHARS_RE = re.compile(r'[<>:"/\\|?\*\x00-\x1F]')
+_RESERVED_WIN = {
+    "CON",
+    "PRN",
+    "AUX",
+    "NUL",
+    "COM1",
+    "COM2",
+    "COM3",
+    "COM4",
+    "COM5",
+    "COM6",
+    "COM7",
+    "COM8",
+    "COM9",
+    "LPT1",
+    "LPT2",
+    "LPT3",
+    "LPT4",
+    "LPT5",
+    "LPT6",
+    "LPT7",
+    "LPT8",
+    "LPT9",
+}
+
+
+def make_fs_key(cid: str, max_len: int = 128) -> str:
+    """
+    Convert any logical corpus_id (may include ':', Unicode, etc.)
+    into a portable filename segment: [a-zA-Z0-9._-] only, no trailing space/dot,
+    not a reserved device name.
+    """
+    # 1) Keep a short human-friendly prefix if present (e.g., "proj", "sess", "run")
+    if ":" in cid:
+        prefix, rest = cid.split(":", 1)
+    else:
+        prefix, rest = "cid", cid
+
+    # 2) Encode the rest to a compact, portable token (urlsafe base64 without padding).
+    #    This avoids collisions from simple character replacement.
+    token = base64.urlsafe_b64encode(rest.encode("utf-8")).decode("ascii").rstrip("=")
+
+    # 3) Build candidate and sanitize any stray chars just in case
+    key = f"{prefix}-{token}"
+    key = _INVALID_CHARS_RE.sub("_", key).rstrip(" .")
+
+    # 4) Avoid Windows reserved device names
+    if key.upper() in _RESERVED_WIN:
+        key = f"_{key}_"
+
+    # 5) Enforce a reasonable max length (append a short hash if truncated)
+    if len(key) > max_len:
+        h = hashlib.sha1(key.encode("utf-8")).hexdigest()[:8]
+        key = key[: max_len - 9] + "-" + h
+
+    return key
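Two illustrative mappings (base64 of "abc" is "YWJj", so the first output is deterministic):

from aethergraph.services.rag.utils.make_fs_key import make_fs_key

print(make_fs_key("sess:abc"))   # "sess-YWJj"; prefix kept, rest base64-encoded
print(make_fs_key("no_prefix"))  # no ":" present, so it falls back to the "cid" prefix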

aethergraph/services/redactor/simple.py
@@ -0,0 +1,16 @@
+# services/redactor/simple.py
+# PII/secret scrubbing for logs/events/artifacts
+import re
+
+
+class RegexRedactor:
+    PATTERNS = [
+        (re.compile(r"sk-[A-Za-z0-9]{20,}"), "[REDACTED:APIKEY]"),
+        (re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"), "[REDACTED:EMAIL]"),
+        (re.compile(r"\b\d{16}\b"), "[REDACTED:NUM]"),
+    ]
+
+    def scrub(self, text: str) -> str:
+        for pat, repl in self.PATTERNS:
+            text = pat.sub(repl, text)
+        return text
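A scrub round trip (the key below is a made-up 22-character token, long enough to match the sk- pattern):

from aethergraph.services.redactor.simple import RegexRedactor

r = RegexRedactor()
print(r.scrub("key=sk-ABCDEF0123456789abcdef to ops@example.com"))
# -> "key=[REDACTED:APIKEY] to [REDACTED:EMAIL]"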

aethergraph/services/registry/key_parsing.py
@@ -0,0 +1,44 @@
+from .registry_key import _REG_PREFIX, NS, Key
+
+
+def parse_ref(ref: str) -> Key:
+    """
+    Parse "<nspace>:<name>[@<version>]" or "<name>[@<version>]" (defaults to tool).
+    Also accepts "registry:<...>" prefix.
+
+    Examples:
+        "tool:my_tool@0.1.0"
+        "graph:my_graph"
+        "agent:router@latest"
+        "my_tool@0.1.0"                # -> tool
+        "registry:tool:my_tool@0.1.0"
+        "registry:my_tool@0.1.0"       # -> tool
+    """
+    if not ref:
+        raise ValueError("Empty ref")
+    m = _REG_PREFIX.match(ref)
+    s = m.group(1) if m else ref
+
+    # If a namespace is present, it looks like "ns:name..."
+    if ":" in s:
+        nspace, rest = s.split(":", 1)
+        if nspace not in NS:
+            # If the left side is not a namespace, treat whole thing as name (default to tool)
+            nspace, rest = "tool", s
+    else:
+        nspace, rest = "tool", s  # default namespace
+
+    if "@" in rest:
+        name, ver = rest.split("@", 1)
+        ver = ver or None
+    else:
+        name, ver = rest, None
+
+    # normalize @latest → None (caller treats None as "pick latest")
+    if ver and ver.lower() == "latest":
+        ver = None
+
+    if not name:
+        raise ValueError(f"Invalid ref (missing name): {ref}")
+
+    return Key(nspace=nspace, name=name, version=ver)
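The docstring examples above translate to (Key is the frozen dataclass from registry_key.py, shown next):

from aethergraph.services.registry.key_parsing import parse_ref

parse_ref("tool:my_tool@0.1.0")   # Key(nspace="tool", name="my_tool", version="0.1.0")
parse_ref("my_graph")             # Key(nspace="tool", name="my_graph", version=None)
parse_ref("agent:router@latest")  # Key(nspace="agent", name="router", version=None)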

aethergraph/services/registry/registry_key.py
@@ -0,0 +1,19 @@
+from dataclasses import dataclass
+import re
+
+NS = {"tool", "graph", "graphfn", "agent"}
+
+# Simple ref regex to detect optional leading 'registry:'
+_REG_PREFIX = re.compile(r"^registry:(.+)$", re.I)
+
+
+@dataclass(frozen=True)
+class Key:
+    nspace: str
+    name: str
+    version: str | None = None  # None or "latest" means resolve latest
+
+    def canonical(self) -> str:
+        ver = self.version
+        # Normalize "latest" to omitted for display
+        return f"{self.nspace}:{self.name}" + (f"@{ver}" if ver and ver.lower() != "latest" else "")
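And canonical() renders keys back to display form, omitting None and "latest" versions:

from aethergraph.services.registry.registry_key import Key

print(Key("tool", "my_tool", "0.1.0").canonical())   # "tool:my_tool@0.1.0"
print(Key("agent", "router", "latest").canonical())  # "agent:router"
print(Key("graph", "my_graph").canonical())          # "graph:my_graph"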