aethergraph 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. aethergraph/__init__.py +49 -0
  2. aethergraph/config/__init__.py +0 -0
  3. aethergraph/config/config.py +121 -0
  4. aethergraph/config/context.py +16 -0
  5. aethergraph/config/llm.py +26 -0
  6. aethergraph/config/loader.py +60 -0
  7. aethergraph/config/runtime.py +9 -0
  8. aethergraph/contracts/errors/errors.py +44 -0
  9. aethergraph/contracts/services/artifacts.py +142 -0
  10. aethergraph/contracts/services/channel.py +72 -0
  11. aethergraph/contracts/services/continuations.py +23 -0
  12. aethergraph/contracts/services/eventbus.py +12 -0
  13. aethergraph/contracts/services/kv.py +24 -0
  14. aethergraph/contracts/services/llm.py +17 -0
  15. aethergraph/contracts/services/mcp.py +22 -0
  16. aethergraph/contracts/services/memory.py +108 -0
  17. aethergraph/contracts/services/resume.py +28 -0
  18. aethergraph/contracts/services/state_stores.py +33 -0
  19. aethergraph/contracts/services/wakeup.py +28 -0
  20. aethergraph/core/execution/base_scheduler.py +77 -0
  21. aethergraph/core/execution/forward_scheduler.py +777 -0
  22. aethergraph/core/execution/global_scheduler.py +634 -0
  23. aethergraph/core/execution/retry_policy.py +22 -0
  24. aethergraph/core/execution/step_forward.py +411 -0
  25. aethergraph/core/execution/step_result.py +18 -0
  26. aethergraph/core/execution/wait_types.py +72 -0
  27. aethergraph/core/graph/graph_builder.py +192 -0
  28. aethergraph/core/graph/graph_fn.py +219 -0
  29. aethergraph/core/graph/graph_io.py +67 -0
  30. aethergraph/core/graph/graph_refs.py +154 -0
  31. aethergraph/core/graph/graph_spec.py +115 -0
  32. aethergraph/core/graph/graph_state.py +59 -0
  33. aethergraph/core/graph/graphify.py +128 -0
  34. aethergraph/core/graph/interpreter.py +145 -0
  35. aethergraph/core/graph/node_handle.py +33 -0
  36. aethergraph/core/graph/node_spec.py +46 -0
  37. aethergraph/core/graph/node_state.py +63 -0
  38. aethergraph/core/graph/task_graph.py +747 -0
  39. aethergraph/core/graph/task_node.py +82 -0
  40. aethergraph/core/graph/utils.py +37 -0
  41. aethergraph/core/graph/visualize.py +239 -0
  42. aethergraph/core/runtime/ad_hoc_context.py +61 -0
  43. aethergraph/core/runtime/base_service.py +153 -0
  44. aethergraph/core/runtime/bind_adapter.py +42 -0
  45. aethergraph/core/runtime/bound_memory.py +69 -0
  46. aethergraph/core/runtime/execution_context.py +220 -0
  47. aethergraph/core/runtime/graph_runner.py +349 -0
  48. aethergraph/core/runtime/lifecycle.py +26 -0
  49. aethergraph/core/runtime/node_context.py +203 -0
  50. aethergraph/core/runtime/node_services.py +30 -0
  51. aethergraph/core/runtime/recovery.py +159 -0
  52. aethergraph/core/runtime/run_registration.py +33 -0
  53. aethergraph/core/runtime/runtime_env.py +157 -0
  54. aethergraph/core/runtime/runtime_registry.py +32 -0
  55. aethergraph/core/runtime/runtime_services.py +224 -0
  56. aethergraph/core/runtime/wakeup_watcher.py +40 -0
  57. aethergraph/core/tools/__init__.py +10 -0
  58. aethergraph/core/tools/builtins/channel_tools.py +194 -0
  59. aethergraph/core/tools/builtins/toolset.py +134 -0
  60. aethergraph/core/tools/toolkit.py +510 -0
  61. aethergraph/core/tools/waitable.py +109 -0
  62. aethergraph/plugins/channel/__init__.py +0 -0
  63. aethergraph/plugins/channel/adapters/__init__.py +0 -0
  64. aethergraph/plugins/channel/adapters/console.py +106 -0
  65. aethergraph/plugins/channel/adapters/file.py +102 -0
  66. aethergraph/plugins/channel/adapters/slack.py +285 -0
  67. aethergraph/plugins/channel/adapters/telegram.py +302 -0
  68. aethergraph/plugins/channel/adapters/webhook.py +104 -0
  69. aethergraph/plugins/channel/adapters/webui.py +134 -0
  70. aethergraph/plugins/channel/routes/__init__.py +0 -0
  71. aethergraph/plugins/channel/routes/console_routes.py +86 -0
  72. aethergraph/plugins/channel/routes/slack_routes.py +49 -0
  73. aethergraph/plugins/channel/routes/telegram_routes.py +26 -0
  74. aethergraph/plugins/channel/routes/webui_routes.py +136 -0
  75. aethergraph/plugins/channel/utils/__init__.py +0 -0
  76. aethergraph/plugins/channel/utils/slack_utils.py +278 -0
  77. aethergraph/plugins/channel/utils/telegram_utils.py +324 -0
  78. aethergraph/plugins/channel/websockets/slack_ws.py +68 -0
  79. aethergraph/plugins/channel/websockets/telegram_polling.py +151 -0
  80. aethergraph/plugins/mcp/fs_server.py +128 -0
  81. aethergraph/plugins/mcp/http_server.py +101 -0
  82. aethergraph/plugins/mcp/ws_server.py +180 -0
  83. aethergraph/plugins/net/http.py +10 -0
  84. aethergraph/plugins/utils/data_io.py +359 -0
  85. aethergraph/runner/__init__.py +5 -0
  86. aethergraph/runtime/__init__.py +62 -0
  87. aethergraph/server/__init__.py +3 -0
  88. aethergraph/server/app_factory.py +84 -0
  89. aethergraph/server/start.py +122 -0
  90. aethergraph/services/__init__.py +10 -0
  91. aethergraph/services/artifacts/facade.py +284 -0
  92. aethergraph/services/artifacts/factory.py +35 -0
  93. aethergraph/services/artifacts/fs_store.py +656 -0
  94. aethergraph/services/artifacts/jsonl_index.py +123 -0
  95. aethergraph/services/artifacts/paths.py +23 -0
  96. aethergraph/services/artifacts/sqlite_index.py +209 -0
  97. aethergraph/services/artifacts/utils.py +124 -0
  98. aethergraph/services/auth/dev.py +16 -0
  99. aethergraph/services/channel/channel_bus.py +293 -0
  100. aethergraph/services/channel/factory.py +44 -0
  101. aethergraph/services/channel/session.py +511 -0
  102. aethergraph/services/channel/wait_helpers.py +57 -0
  103. aethergraph/services/clock/clock.py +9 -0
  104. aethergraph/services/container/default_container.py +320 -0
  105. aethergraph/services/continuations/continuation.py +56 -0
  106. aethergraph/services/continuations/factory.py +34 -0
  107. aethergraph/services/continuations/stores/fs_store.py +264 -0
  108. aethergraph/services/continuations/stores/inmem_store.py +95 -0
  109. aethergraph/services/eventbus/inmem.py +21 -0
  110. aethergraph/services/features/static.py +10 -0
  111. aethergraph/services/kv/ephemeral.py +90 -0
  112. aethergraph/services/kv/factory.py +27 -0
  113. aethergraph/services/kv/layered.py +41 -0
  114. aethergraph/services/kv/sqlite_kv.py +128 -0
  115. aethergraph/services/llm/factory.py +157 -0
  116. aethergraph/services/llm/generic_client.py +542 -0
  117. aethergraph/services/llm/providers.py +3 -0
  118. aethergraph/services/llm/service.py +105 -0
  119. aethergraph/services/logger/base.py +36 -0
  120. aethergraph/services/logger/compat.py +50 -0
  121. aethergraph/services/logger/formatters.py +106 -0
  122. aethergraph/services/logger/std.py +203 -0
  123. aethergraph/services/mcp/helpers.py +23 -0
  124. aethergraph/services/mcp/http_client.py +70 -0
  125. aethergraph/services/mcp/mcp_tools.py +21 -0
  126. aethergraph/services/mcp/registry.py +14 -0
  127. aethergraph/services/mcp/service.py +100 -0
  128. aethergraph/services/mcp/stdio_client.py +70 -0
  129. aethergraph/services/mcp/ws_client.py +115 -0
  130. aethergraph/services/memory/bound.py +106 -0
  131. aethergraph/services/memory/distillers/episode.py +116 -0
  132. aethergraph/services/memory/distillers/rolling.py +74 -0
  133. aethergraph/services/memory/facade.py +633 -0
  134. aethergraph/services/memory/factory.py +78 -0
  135. aethergraph/services/memory/hotlog_kv.py +27 -0
  136. aethergraph/services/memory/indices.py +74 -0
  137. aethergraph/services/memory/io_helpers.py +72 -0
  138. aethergraph/services/memory/persist_fs.py +40 -0
  139. aethergraph/services/memory/resolver.py +152 -0
  140. aethergraph/services/metering/noop.py +4 -0
  141. aethergraph/services/prompts/file_store.py +41 -0
  142. aethergraph/services/rag/chunker.py +29 -0
  143. aethergraph/services/rag/facade.py +593 -0
  144. aethergraph/services/rag/index/base.py +27 -0
  145. aethergraph/services/rag/index/faiss_index.py +121 -0
  146. aethergraph/services/rag/index/sqlite_index.py +134 -0
  147. aethergraph/services/rag/index_factory.py +52 -0
  148. aethergraph/services/rag/parsers/md.py +7 -0
  149. aethergraph/services/rag/parsers/pdf.py +14 -0
  150. aethergraph/services/rag/parsers/txt.py +7 -0
  151. aethergraph/services/rag/utils/hybrid.py +39 -0
  152. aethergraph/services/rag/utils/make_fs_key.py +62 -0
  153. aethergraph/services/redactor/simple.py +16 -0
  154. aethergraph/services/registry/key_parsing.py +44 -0
  155. aethergraph/services/registry/registry_key.py +19 -0
  156. aethergraph/services/registry/unified_registry.py +185 -0
  157. aethergraph/services/resume/multi_scheduler_resume_bus.py +65 -0
  158. aethergraph/services/resume/router.py +73 -0
  159. aethergraph/services/schedulers/registry.py +41 -0
  160. aethergraph/services/secrets/base.py +7 -0
  161. aethergraph/services/secrets/env.py +8 -0
  162. aethergraph/services/state_stores/externalize.py +135 -0
  163. aethergraph/services/state_stores/graph_observer.py +131 -0
  164. aethergraph/services/state_stores/json_store.py +67 -0
  165. aethergraph/services/state_stores/resume_policy.py +119 -0
  166. aethergraph/services/state_stores/serialize.py +249 -0
  167. aethergraph/services/state_stores/utils.py +91 -0
  168. aethergraph/services/state_stores/validate.py +78 -0
  169. aethergraph/services/tracing/noop.py +18 -0
  170. aethergraph/services/waits/wait_registry.py +91 -0
  171. aethergraph/services/wakeup/memory_queue.py +57 -0
  172. aethergraph/services/wakeup/scanner_producer.py +56 -0
  173. aethergraph/services/wakeup/worker.py +31 -0
  174. aethergraph/tools/__init__.py +25 -0
  175. aethergraph/utils/optdeps.py +8 -0
  176. aethergraph-0.1.0a1.dist-info/METADATA +410 -0
  177. aethergraph-0.1.0a1.dist-info/RECORD +182 -0
  178. aethergraph-0.1.0a1.dist-info/WHEEL +5 -0
  179. aethergraph-0.1.0a1.dist-info/entry_points.txt +2 -0
  180. aethergraph-0.1.0a1.dist-info/licenses/LICENSE +176 -0
  181. aethergraph-0.1.0a1.dist-info/licenses/NOTICE +31 -0
  182. aethergraph-0.1.0a1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import pickle
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+
9
+ try:
10
+ import faiss
11
+ except Exception:
12
+ faiss = None
13
+
14
+ from .base import VectorIndex
15
+
16
+ """A simple FAISS index per corpus (L2 on normalized vectors ~ cosine).
17
+ Persists one FAISS index file plus a pickled metadata list per corpus.
18
+ """
19
+
20
+
21
class FAISSVectorIndex(VectorIndex):
    """A simple FAISS index per corpus (L2 on normalized vectors ~ cosine)."""

    def __init__(self, index_path: str, dim: int | None = None):
        super().__init__(index_path)
        # Optional default dimensionality; inferred from the first add() call.
        self.dim = dim
        os.makedirs(index_path, exist_ok=True)

    def _paths(self, corpus_id: str):
        # One (faiss index, pickled metadata) file pair per corpus.
        base = os.path.join(self.index_path, corpus_id)
        return f"{base}.index", f"{base}.meta.pkl"

    def _load(self, corpus_id: str):
        # Returns (None, []) for a corpus that was never persisted.
        idx_path, meta_path = self._paths(corpus_id)
        if not os.path.exists(idx_path) or not os.path.exists(meta_path):
            return None, []
        if faiss is None:
            raise RuntimeError("FAISS not installed")
        index = faiss.read_index(idx_path)
        with open(meta_path, "rb") as fh:
            metadata = pickle.load(fh)
        return index, metadata

    def _save(self, corpus_id: str, index, metas):
        idx_path, meta_path = self._paths(corpus_id)
        if faiss is None:
            raise RuntimeError("FAISS not installed")
        faiss.write_index(index, idx_path)
        with open(meta_path, "wb") as fh:
            pickle.dump(metas, fh)

    async def add(
        self,
        corpus_id: str,
        chunk_ids: list[str],
        vectors: list[list[float]],
        metas: list[dict[str, Any]],
    ):
        """Append vectors (row-normalized so inner product == cosine) plus metadata."""
        if faiss is None:
            raise RuntimeError("FAISS not installed")
        matrix = np.asarray(vectors, dtype=np.float32)
        # Normalize each row; the epsilon avoids division by zero.
        matrix = matrix / (np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-9)
        index, all_metas = self._load(corpus_id)
        if index is None:
            index = faiss.IndexFlatIP(matrix.shape[1])  # cosine via normalized dot
            all_metas = []
        index.add(matrix)
        all_metas.extend(
            {"chunk_id": cid, "meta": m} for cid, m in zip(chunk_ids, metas, strict=True)
        )
        self._save(corpus_id, index, all_metas)

    async def delete(self, corpus_id: str, chunk_ids: list[str] | None = None):
        """Drop a whole corpus; selective chunk deletion is unsupported here."""
        if not chunk_ids:
            # Whole-corpus delete: remove both files if present.
            for path in self._paths(corpus_id):
                if os.path.exists(path):
                    os.remove(path)
            return
        index, metadata = self._load(corpus_id)
        if index is None:
            return
        removing = set(chunk_ids)
        survivors = [i for i, m in enumerate(metadata) if m["chunk_id"] not in removing]
        if not survivors:
            # Every chunk is being removed -> same as deleting the corpus.
            await self.delete(corpus_id, None)
            return
        # Rebuilding a partial index needs the raw vectors, which this simple
        # implementation does not persist (recompute from text in production).
        raise NotImplementedError(
            "Selective delete requires stored vectors; not implemented here."
        )

    async def list_chunks(self, corpus_id: str) -> list[str]:
        """All chunk ids recorded for *corpus_id* (empty if never persisted)."""
        _, metadata = self._load(corpus_id)
        return [entry["chunk_id"] for entry in metadata] if metadata else []

    async def search(self, corpus_id: str, query_vec: list[float], k: int):
        """Top-*k* chunks by cosine similarity; [] for an unknown corpus."""
        if faiss is None:
            raise RuntimeError("FAISS not installed")
        index, metadata = self._load(corpus_id)
        if index is None:
            return []
        query = np.asarray([query_vec], dtype=np.float32)
        query = query / (np.linalg.norm(query, axis=1, keepdims=True) + 1e-9)
        scores, positions = index.search(query, k)
        hits = []
        for score, pos in zip(scores[0].tolist(), positions[0].tolist(), strict=True):
            # FAISS pads short result sets with -1; also guard against stale metadata.
            if 0 <= pos < len(metadata):
                entry = metadata[pos]
                hits.append(
                    {
                        "chunk_id": entry["chunk_id"],
                        "score": float(score),
                        "meta": entry["meta"],
                    }
                )
        return hits
@@ -0,0 +1,134 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import sqlite3
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+
10
+ from .base import VectorIndex
11
+
12
+ """A simple SQLite-based vector index per corpus (brute-force cosine similarity).
13
+ Stores vectors as BLOBs along with metadata in a simple schema.
14
+ """
15
+
16
+
17
+ SCHEMA = """
18
+ CREATE TABLE IF NOT EXISTS chunks (
19
+ corpus_id TEXT,
20
+ chunk_id TEXT,
21
+ meta_json TEXT,
22
+ PRIMARY KEY (corpus_id, chunk_id)
23
+ );
24
+ CREATE TABLE IF NOT EXISTS embeddings (
25
+ corpus_id TEXT,
26
+ chunk_id TEXT,
27
+ vec BLOB, -- np.float32 array bytes
28
+ norm REAL,
29
+ PRIMARY KEY (corpus_id, chunk_id)
30
+ );
31
+ """
32
+
33
+
34
+ def _ensure_db(path: str):
35
+ os.makedirs(os.path.dirname(path), exist_ok=True)
36
+ conn = sqlite3.connect(path)
37
+ try:
38
+ for stmt in SCHEMA.strip().split(";\n"):
39
+ s = stmt.strip()
40
+ if s:
41
+ conn.execute(s)
42
+ conn.commit()
43
+ finally:
44
+ conn.close()
45
+
46
+
47
class SQLiteVectorIndex(VectorIndex):
    """Brute-force cosine-similarity vector index stored in one SQLite file."""

    def __init__(self, index_path: str):
        super().__init__(index_path)
        self.db_path = os.path.join(index_path, "index.sqlite")
        _ensure_db(self.db_path)

    def _connect(self):
        # A fresh connection per operation keeps the async methods independent.
        return sqlite3.connect(self.db_path)

    async def add(
        self,
        corpus_id: str,
        chunk_ids: list[str],
        vectors: list[list[float]],
        metas: list[dict[str, Any]],
    ):
        """Upsert chunk metadata and float32 embeddings (with precomputed norms)."""
        conn = self._connect()
        try:
            cur = conn.cursor()
            for cid, vec, meta in zip(chunk_ids, vectors, metas, strict=True):
                arr = np.asarray(vec, dtype=np.float32)
                length = float(np.linalg.norm(arr) + 1e-9)
                cur.execute(
                    "REPLACE INTO chunks(corpus_id,chunk_id,meta_json) VALUES(?,?,?)",
                    (corpus_id, cid, json.dumps(meta, ensure_ascii=False)),
                )
                cur.execute(
                    "REPLACE INTO embeddings(corpus_id,chunk_id,vec,norm) VALUES(?,?,?,?)",
                    (corpus_id, cid, arr.tobytes(), length),
                )
            conn.commit()
        finally:
            conn.close()

    async def delete(self, corpus_id: str, chunk_ids: list[str] | None = None):
        """Remove the listed chunks, or the entire corpus when no ids are given."""
        conn = self._connect()
        try:
            cur = conn.cursor()
            if chunk_ids:
                placeholders = ",".join(["?"] * len(chunk_ids))
                params = [corpus_id, *chunk_ids]
                cur.execute(
                    f"DELETE FROM chunks WHERE corpus_id=? AND chunk_id IN ({placeholders})",
                    params,
                )
                cur.execute(
                    f"DELETE FROM embeddings WHERE corpus_id=? AND chunk_id IN ({placeholders})",
                    params,
                )
            else:
                cur.execute("DELETE FROM chunks WHERE corpus_id=?", (corpus_id,))
                cur.execute("DELETE FROM embeddings WHERE corpus_id=?", (corpus_id,))
            conn.commit()
        finally:
            conn.close()

    async def list_chunks(self, corpus_id: str) -> list[str]:
        """All chunk ids stored for *corpus_id*."""
        conn = self._connect()
        try:
            cur = conn.cursor()
            cur.execute("SELECT chunk_id FROM chunks WHERE corpus_id=?", (corpus_id,))
            return [row[0] for row in cur.fetchall()]
        finally:
            conn.close()

    async def search(
        self, corpus_id: str, query_vec: list[float], k: int
    ) -> list[dict[str, Any]]:
        """Brute-force cosine similarity over every embedding in the corpus."""
        query = np.asarray(query_vec, dtype=np.float32)
        query_norm = float(np.linalg.norm(query) + 1e-9)

        conn = self._connect()
        try:
            cur = conn.cursor()
            cur.execute(
                "SELECT e.chunk_id, e.vec, e.norm, c.meta_json FROM embeddings e JOIN chunks c USING(corpus_id,chunk_id) WHERE e.corpus_id=?",
                (corpus_id,),
            )
            rows = cur.fetchall()
        finally:
            conn.close()

        scored = []
        for chunk_id, vec_bytes, norm, meta_json in rows:
            stored = np.frombuffer(vec_bytes, dtype=np.float32)
            similarity = float(np.dot(query, stored) / (query_norm * norm))
            scored.append((similarity, chunk_id, meta_json))

        # Stable descending sort by score, then keep the best k.
        ranked = sorted(scored, key=lambda item: item[0], reverse=True)[:k]
        return [
            {"chunk_id": chunk_id, "score": score, "meta": json.loads(meta_json)}
            for score, chunk_id, meta_json in ranked
        ]
@@ -0,0 +1,52 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ from aethergraph.utils.optdeps import require
5
+
6
+ logger = logging.getLogger("aethergraph.rag.index_factory")
7
+
8
+
9
+ def _default_index_path(root: str, backend: str) -> str:
10
+ base = Path(root) / "rag_index"
11
+ if backend == "faiss":
12
+ return str(base / "faiss.index")
13
+ return str(base / "sqlite.index")
14
+
15
+
16
def create_vector_index(
    *, backend: str, index_path: str | None, dim: int | None, root: str = "./aethergraph_data/rag"
):
    """
    Create a vector index instance. Supported backends: 'sqlite', 'faiss'.
    Falls back to 'sqlite' if FAISS is unavailable.
    """
    backend = (backend or "sqlite").lower()
    if backend not in {"sqlite", "faiss"}:
        logger.warning(f"Unknown RAG backend {backend!r}; falling back to sqlite.")
        backend = "sqlite"

    def _resolve_path(kind: str) -> str:
        # An explicit index_path gets a per-backend subdirectory; otherwise
        # fall back to the conventional location under *root*.
        if index_path is not None:
            return str(Path(index_path) / kind)
        return _default_index_path(root, kind)

    if backend == "faiss":
        # Try FAISS; any failure (missing dep, bad path) degrades to sqlite.
        try:
            require("faiss", "faiss")  # faiss-cpu exposes module 'faiss'
            from .index.faiss_index import FAISSVectorIndex

            return FAISSVectorIndex(_resolve_path("faiss"), dim=dim)
        except Exception as e:
            logger.warning(f"FAISS backend unavailable ({e}); falling back to sqlite.")

    # sqlite (default)
    from .index.sqlite_index import SQLiteVectorIndex

    return SQLiteVectorIndex(_resolve_path("sqlite"))
@@ -0,0 +1,7 @@
1
+ from __future__ import annotations
2
+
3
+
4
def extract_text(path: str) -> tuple[str, dict]:
    """Read *path* as UTF-8 text (undecodable bytes ignored); no extra metadata."""
    with open(path, encoding="utf-8", errors="ignore") as handle:
        return handle.read(), {}
@@ -0,0 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ from pypdf import PdfReader
4
+
5
+
6
def extract_text(path: str) -> tuple[str, dict]:
    """Extract all page text from a PDF; metadata reports the page count."""
    reader = PdfReader(path)
    pages: list[str] = []
    for page in reader.pages:
        try:
            pages.append(page.extract_text() or "")
        except Exception:
            # A single corrupt page should not abort the whole document.
            pass
    return "\n\n".join(pages), {"pages": len(reader.pages)}
@@ -0,0 +1,7 @@
1
+ from __future__ import annotations
2
+
3
+
4
def extract_text(path: str) -> tuple[str, dict]:
    """Read *path* as UTF-8 text (undecodable bytes ignored); no extra metadata."""
    with open(path, encoding="utf-8", errors="ignore") as handle:
        return handle.read(), {}
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter
4
+ import re
5
+ from typing import Any
6
+
7
+
8
def lexical_score(query: str, text: str) -> float:
    """Lightweight bag-of-words overlap of *query* against *text*, in [0, 1]."""
    query_tokens = re.findall(r"\w+", query.lower())
    text_tokens = re.findall(r"\w+", text.lower())
    if not query_tokens or not text_tokens:
        return 0.0
    query_counts = Counter(query_tokens)
    text_counts = Counter(text_tokens)
    # Normalized term overlap: shared occurrences / total query tokens.
    shared = sum(min(count, text_counts[word]) for word, count in query_counts.items())
    return shared / (sum(query_counts.values()) + 1e-9)
22
+
23
+
24
def fuse_scores(dense_score: float, lexical: float, alpha: float = 0.8) -> float:
    """Linear fusion of dense and lexical scores; *alpha* weights the dense side."""
    lexical_weight = 1.0 - alpha
    return alpha * dense_score + lexical_weight * lexical
27
+
28
+
29
def topk_fuse(
    query: str, dense_hits: list[dict[str, Any]], chunk_lookup: dict[str, str], k: int
) -> list[dict[str, Any]]:
    """Re-rank dense hits by fusing in a lexical score, returning the top *k*."""
    fused_hits = []
    for hit in dense_hits:
        chunk_text = chunk_lookup.get(hit["chunk_id"], "")
        lexical = lexical_score(query, chunk_text)
        fused = fuse_scores(hit.get("score", 0.0), lexical)
        fused_hits.append({**hit, "score": fused})
    fused_hits.sort(key=lambda h: h["score"], reverse=True)
    return fused_hits[:k]
@@ -0,0 +1,62 @@
1
+ import base64
2
+ import hashlib
3
+ import re
4
+
5
# Windows forbidden characters and device names
_INVALID_CHARS_RE = re.compile(r'[<>:"/\\|?\*\x00-\x1F]')
_RESERVED_WIN = {"CON", "PRN", "AUX", "NUL"} | {
    f"{device}{n}" for device in ("COM", "LPT") for n in range(1, 10)
}


def make_fs_key(cid: str, max_len: int = 128) -> str:
    """
    Convert any logical corpus_id (may include ':', Unicode, etc.)
    into a portable filename segment: [a-zA-Z0-9._-] only, no trailing space/dot,
    not a reserved device name.
    """
    # Keep a short human-friendly prefix when present (e.g., "proj", "sess", "run").
    if ":" in cid:
        prefix, _, rest = cid.partition(":")
    else:
        prefix, rest = "cid", cid

    # Encode the remainder as padding-free urlsafe base64 — collision-safe,
    # unlike naive character substitution.
    token = base64.urlsafe_b64encode(rest.encode("utf-8")).decode("ascii").rstrip("=")

    # Sanitize any stray characters and trailing space/dot, just in case.
    candidate = _INVALID_CHARS_RE.sub("_", f"{prefix}-{token}").rstrip(" .")

    # Avoid Windows reserved device names (CON, COM1, ...).
    if candidate.upper() in _RESERVED_WIN:
        candidate = f"_{candidate}_"

    # Cap the length, appending a short hash so truncated keys stay distinct.
    if len(candidate) > max_len:
        digest = hashlib.sha1(candidate.encode("utf-8")).hexdigest()[:8]
        candidate = f"{candidate[: max_len - 9]}-{digest}"

    return candidate
@@ -0,0 +1,16 @@
1
+ # services/redactor/simple.py
2
+ # PII/secret scrubbing for logs/events/artifacts
3
+ import re
4
+
5
+
6
class RegexRedactor:
    """Scrubs likely secrets/PII (API keys, emails, 16-digit numbers) from text."""

    # (compiled pattern, replacement placeholder) pairs, applied in order.
    PATTERNS = [
        (re.compile(r"sk-[A-Za-z0-9]{20,}"), "[REDACTED:APIKEY]"),
        (re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"), "[REDACTED:EMAIL]"),
        (re.compile(r"\b\d{16}\b"), "[REDACTED:NUM]"),
    ]

    def scrub(self, text: str) -> str:
        """Return *text* with every pattern match replaced by its placeholder."""
        scrubbed = text
        for pattern, placeholder in self.PATTERNS:
            scrubbed = pattern.sub(placeholder, scrubbed)
        return scrubbed
@@ -0,0 +1,44 @@
1
+ from .registry_key import _REG_PREFIX, NS, Key
2
+
3
+
4
def parse_ref(ref: str) -> Key:
    """
    Parse "<nspace>:<name>[@<version>]" or "<name>[@<version>]" (defaults to tool).
    Also accepts "registry:<...>" prefix.

    Examples:
        "tool:my_tool@0.1.0"
        "graph:my_graph"
        "agent:router@latest"
        "my_tool@0.1.0"               # -> tool
        "registry:tool:my_tool@0.1.0"
        "registry:my_tool@0.1.0"      # -> tool
    """
    if not ref:
        raise ValueError("Empty ref")

    # Strip an optional leading "registry:" wrapper.
    prefixed = _REG_PREFIX.match(ref)
    body = prefixed.group(1) if prefixed else ref

    # A namespace, when present, is the text before the first ':'.
    nspace, sep, remainder = body.partition(":")
    if not sep or nspace not in NS:
        # No colon, or the left side is not a known namespace: default to
        # "tool" and treat the whole string as the name part.
        nspace, remainder = "tool", body

    # Split off an optional "@version" suffix.
    name, at_sign, raw_ver = remainder.partition("@")
    version = raw_ver or None if at_sign else None

    # normalize @latest → None (caller treats None as "pick latest")
    if version and version.lower() == "latest":
        version = None

    if not name:
        raise ValueError(f"Invalid ref (missing name): {ref}")

    return Key(nspace=nspace, name=name, version=version)
@@ -0,0 +1,19 @@
1
+ from dataclasses import dataclass
2
+ import re
3
+
4
+ NS = {"tool", "graph", "graphfn", "agent"}
5
+
6
+ # Simple ref regex to detect optional leading 'registry:'
7
+ _REG_PREFIX = re.compile(r"^registry:(.+)$", re.I)
8
+
9
+
10
+ @dataclass(frozen=True)
11
+ class Key:
12
+ nspace: str
13
+ name: str
14
+ version: str | None = None # None or "latest" means resolve latest
15
+
16
+ def canonical(self) -> str:
17
+ ver = self.version
18
+ # Normalize "latest" to omitted for display
19
+ return f"{self.nspace}:{self.name}" + (f"@{ver}" if ver and ver.lower() != "latest" else "")