@agentikos/omega-os 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +25 -13
  2. package/bootstrap/lib/steps.sh +214 -9
  3. package/bootstrap/manifest.example.yaml +6 -1
  4. package/docs/COMPLETION-PLAN.md +48 -0
  5. package/omega/Agentik_Engine/README.md +25 -10
  6. package/omega/Agentik_Engine/omega_engine/__init__.py +66 -2
  7. package/omega/Agentik_Engine/omega_engine/account.py +505 -0
  8. package/omega/Agentik_Engine/omega_engine/autonomous.py +538 -0
  9. package/omega/Agentik_Engine/omega_engine/cli.py +467 -29
  10. package/omega/Agentik_Engine/omega_engine/daemons/__init__.py +14 -0
  11. package/omega/Agentik_Engine/omega_engine/daemons/autonomous.py +56 -0
  12. package/omega/Agentik_Engine/omega_engine/daemons/engine.py +187 -0
  13. package/omega/Agentik_Engine/omega_engine/daemons/telegram.py +231 -0
  14. package/omega/Agentik_Engine/omega_engine/educators/__init__.py +51 -0
  15. package/omega/Agentik_Engine/omega_engine/educators/artifact.py +65 -0
  16. package/omega/Agentik_Engine/omega_engine/educators/automation.py +76 -0
  17. package/omega/Agentik_Engine/omega_engine/educators/base.py +327 -0
  18. package/omega/Agentik_Engine/omega_engine/educators/claudecode.py +71 -0
  19. package/omega/Agentik_Engine/omega_engine/educators/connection.py +75 -0
  20. package/omega/Agentik_Engine/omega_engine/educators/coworker.py +68 -0
  21. package/omega/Agentik_Engine/omega_engine/educators/loop.py +82 -0
  22. package/omega/Agentik_Engine/omega_engine/educators/prompt.py +68 -0
  23. package/omega/Agentik_Engine/omega_engine/educators/skill.py +69 -0
  24. package/omega/Agentik_Engine/omega_engine/executor.py +46 -6
  25. package/omega/Agentik_Engine/omega_engine/mission.py +13 -1
  26. package/omega/Agentik_Engine/omega_engine/provider.py +247 -1
  27. package/omega/Agentik_Engine/omega_engine/rag/__init__.py +21 -0
  28. package/omega/Agentik_Engine/omega_engine/rag/agentic.py +83 -0
  29. package/omega/Agentik_Engine/omega_engine/rag/base.py +42 -0
  30. package/omega/Agentik_Engine/omega_engine/rag/corrective.py +119 -0
  31. package/omega/Agentik_Engine/omega_engine/rag/graph.py +169 -0
  32. package/omega/Agentik_Engine/omega_engine/rag/hybrid.py +205 -0
  33. package/omega/Agentik_Engine/omega_engine/rag/multimodal.py +136 -0
  34. package/omega/Agentik_Engine/omega_engine/rag/router.py +110 -0
  35. package/omega/Agentik_Engine/omega_engine/reducer.py +21 -3
  36. package/omega/Agentik_Engine/omega_engine/store.py +65 -5
  37. package/omega/Agentik_Engine/omega_engine/sync.py +304 -0
  38. package/omega/Agentik_Engine/omega_engine/tools.py +272 -0
  39. package/omega/Agentik_Engine/pyproject.toml +1 -1
  40. package/omega/Agentik_Engine/tests/test_account.py +333 -0
  41. package/omega/Agentik_Engine/tests/test_autonomous.py +361 -0
  42. package/omega/Agentik_Engine/tests/test_educators.py +233 -0
  43. package/omega/Agentik_Engine/tests/test_rag.py +287 -0
  44. package/omega/Agentik_Engine/tests/test_snapshot_partial.py +172 -0
  45. package/omega/Agentik_Engine/tests/test_tools_and_sync.py +312 -0
  46. package/omega/Agentik_SSOT/skills/rag-route.md +73 -0
  47. package/package.json +1 -1
  48. package/omega/Agentik_Engine/omega_engine/__pycache__/__init__.cpython-313.pyc +0 -0
  49. package/omega/Agentik_Engine/omega_engine/__pycache__/audit.cpython-313.pyc +0 -0
  50. package/omega/Agentik_Engine/omega_engine/__pycache__/audit_arsenal.cpython-313.pyc +0 -0
  51. package/omega/Agentik_Engine/omega_engine/__pycache__/barrier.cpython-313.pyc +0 -0
  52. package/omega/Agentik_Engine/omega_engine/__pycache__/bus.cpython-313.pyc +0 -0
  53. package/omega/Agentik_Engine/omega_engine/__pycache__/cli.cpython-313.pyc +0 -0
  54. package/omega/Agentik_Engine/omega_engine/__pycache__/events.cpython-313.pyc +0 -0
  55. package/omega/Agentik_Engine/omega_engine/__pycache__/executor.cpython-313.pyc +0 -0
  56. package/omega/Agentik_Engine/omega_engine/__pycache__/mission.cpython-313.pyc +0 -0
  57. package/omega/Agentik_Engine/omega_engine/__pycache__/progress.cpython-313.pyc +0 -0
  58. package/omega/Agentik_Engine/omega_engine/__pycache__/project.cpython-313.pyc +0 -0
  59. package/omega/Agentik_Engine/omega_engine/__pycache__/provider.cpython-313.pyc +0 -0
  60. package/omega/Agentik_Engine/omega_engine/__pycache__/reducer.cpython-313.pyc +0 -0
  61. package/omega/Agentik_Engine/omega_engine/__pycache__/report.cpython-313.pyc +0 -0
  62. package/omega/Agentik_Engine/omega_engine/__pycache__/router.cpython-313.pyc +0 -0
  63. package/omega/Agentik_Engine/omega_engine/__pycache__/store.cpython-313.pyc +0 -0
  64. package/omega/Agentik_Engine/omega_engine/__pycache__/supervisor.cpython-313.pyc +0 -0
  65. package/omega/Agentik_Engine/omega_engine/__pycache__/task.cpython-313.pyc +0 -0
  66. package/omega/Agentik_Engine/omega_engine/__pycache__/telegram.cpython-313.pyc +0 -0
  67. package/omega/Agentik_Engine/tests/__pycache__/test_audit_arsenal.cpython-313.pyc +0 -0
  68. package/omega/Agentik_Engine/tests/__pycache__/test_executor.cpython-313.pyc +0 -0
  69. package/omega/Agentik_Engine/tests/__pycache__/test_mission.cpython-313.pyc +0 -0
  70. package/omega/Agentik_Engine/tests/__pycache__/test_progress.cpython-313.pyc +0 -0
  71. package/omega/Agentik_Engine/tests/__pycache__/test_project.cpython-313.pyc +0 -0
  72. package/omega/Agentik_Engine/tests/__pycache__/test_reducer.cpython-313.pyc +0 -0
  73. package/omega/Agentik_Engine/tests/__pycache__/test_report.cpython-313.pyc +0 -0
@@ -0,0 +1,119 @@
1
+ """Corrective RAG (CRAG) — the envelope every strategy wears.
2
+
3
+ For each retrieval, the provider (role "rag-grader") scores every doc 0-100.
4
+ If the average is below `threshold`, the provider (role "rag-agent") produces
5
+ a refined query and we re-retrieve. Bounded retries — never an infinite loop.
6
+
7
+ This is the user's explicit design: corrective wraps the chosen strategy.
8
+ The router resolves WHICH retriever runs; the corrective layer ensures the
9
+ RESULT meets a quality floor.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ from omega_engine.provider import AgentProvider, AgentRequest
14
+ from omega_engine.rag.base import Document, RetrievalResult, Retriever
15
+
16
+
17
class CorrectiveRetriever:
    """Grade → if low, refine query and retry. Bounded by `max_retries`.

    Wraps another retriever (`inner`). After every retrieval the provider is
    asked (role "rag-grader") to score the returned documents; when the
    average falls below `threshold`, the provider is asked (role "rag-agent")
    for a refined query and `inner` runs again — at most `max_retries` extra
    times, so the loop always terminates.
    """

    strategy = "corrective"

    def __init__(
        self,
        inner: Retriever,
        provider: AgentProvider,
        *,
        threshold: float = 70.0,
        max_retries: int = 2,
    ) -> None:
        """Store the wrapped retriever, the grading provider and the limits.

        Args:
            inner: retriever whose results are graded.
            provider: object whose `run()` serves the grader/refiner roles.
            threshold: minimum average grade (0-100) accepted without retry.
            max_retries: number of refinement rounds allowed after the first
                retrieval.
        """
        self.inner = inner
        self.provider = provider
        self.threshold = threshold
        self.max_retries = max_retries

    def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
        """Retrieve with grading and bounded query refinement.

        Every graded document gets `metadata["grader_avg"]` stamped with its
        attempt's average grade, and the attempt's score is raised to at
        least `grade / 100`.

        Returns:
            A new `RetrievalResult` carrying the ORIGINAL `query`, the
            documents of the best-graded attempt (the latest one on ties) and
            the strategy label "corrective+<inner strategy>". If no attempt
            produced documents, the last (empty) inner result is surfaced;
            if the loop never ran, the label is "corrective+empty".
        """
        current = query
        last: RetrievalResult | None = None
        best: RetrievalResult | None = None
        best_grade = float("-inf")

        for attempt in range(self.max_retries + 1):
            result = self.inner.retrieve(current, k=k)
            last = result
            if not result.documents:
                # Nothing to grade. Stop here, but keep any earlier graded
                # attempt instead of throwing it away for an empty result.
                break

            grade_avg = self._grade(current, result.documents)
            # Stamp the grade onto every document so the caller can see it.
            for d in result.documents:
                d.metadata["grader_avg"] = grade_avg
            result.score = max(result.score, grade_avg / 100.0)

            # `>=` prefers the most recent attempt when grades tie.
            if grade_avg >= best_grade:
                best, best_grade = result, grade_avg

            if grade_avg >= self.threshold or attempt == self.max_retries:
                # Either good enough OR we've used up our retries.
                break

            # Below threshold AND retries remain — ask the provider for a
            # refined query and try again.
            current = self._refine(query, current, result.documents)

        chosen = best if best is not None else last
        if chosen is None:
            # The loop body never ran (negative `max_retries`).
            return RetrievalResult(query=query, documents=[], score=0.0,
                                   strategy="corrective+empty")
        return RetrievalResult(
            query=query,
            documents=chosen.documents,
            score=chosen.score,
            strategy=f"corrective+{chosen.strategy}",
        )

    # ----- internals -----

    def _grade(self, query: str, docs: list[Document]) -> float:
        """Ask the provider to score `docs` and return the average in 0-100.

        Only the first 300 characters of each document are sent. The scores
        are read from the `"scores"` entry of the provider's artifacts.
        """
        result = self.provider.run(AgentRequest(
            role="rag-grader",
            prompt=f"Score relevance 0-100 for query: {query}",
            context={
                "query": query,
                "documents": [
                    {"id": d.id, "text": d.text[:300]} for d in docs
                ],
            },
        ))
        return self._average_score((result.artifacts or {}).get("scores"))

    @staticmethod
    def _average_score(raw: object) -> float:
        """Average the usable numeric entries of `raw`, clamped to 0-100.

        Entries that are not real numbers (strings, `None`, booleans, NaN,
        ±inf) are ignored; a non-iterable `raw` or no usable entry yields 0.0.
        """
        # Local import so this block needs no new module-level import.
        import math

        try:
            candidates = list(raw)  # type: ignore[call-overload]
        except TypeError:
            return 0.0
        scores = [
            float(s)
            for s in candidates
            if isinstance(s, (int, float))
            and not isinstance(s, bool)
            # NaN would otherwise survive the clamp below as 100.0.
            and math.isfinite(s)
        ]
        if not scores:
            return 0.0
        avg = sum(scores) / len(scores)
        return max(0.0, min(100.0, avg))

    def _refine(self, original: str, current: str, docs: list[Document]) -> str:
        """Ask the provider for a refined query. Fallback: append context.

        The first 200 characters of up to three documents are sent as
        snippets. The suggestion is read from the `"next_query"` artifact; a
        missing, blank or unchanged suggestion triggers the fallback.
        """
        result = self.provider.run(AgentRequest(
            role="rag-agent",
            prompt=(
                f"Original query: {original}\n"
                f"Current query: {current}\n"
                "These docs scored below the relevance threshold. "
                "Suggest a better query."
            ),
            context={
                "original_query": original,
                "current_query": current,
                "snippets": [d.text[:200] for d in docs[:3]],
            },
        ))
        next_q = (result.artifacts or {}).get("next_query")
        refined = str(next_q).strip() if next_q else ""
        if refined and refined != current:
            return refined
        # Provider had nothing useful — perturb the query so the inner
        # retriever sees something different.
        return f"{current} context"
@@ -0,0 +1,169 @@
1
+ """Graph retriever — entity-relation graph with depth-limited expansion.
2
+
3
+ Plain Python: dict-of-sets adjacency, typed edges. Persists to JSON. The
4
+ retrieval contract picks seed nodes from query tokens, expands `depth` hops,
5
+ and returns each visited node as a Document.
6
+
7
+ This is the "second strategy" in the multi-RAG router: when a query is
8
+ relational ("who works on X", "what depends on Y"), the graph wins over
9
+ hybrid.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import re
15
+ from collections import defaultdict, deque
16
+ from pathlib import Path
17
+ from typing import Any, Iterable
18
+
19
+ from omega_engine.rag.base import Document, RetrievalResult
20
+
21
+
22
+ _TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
23
+
24
+
25
+ def _tokens(text: str) -> list[str]:
26
+ return [t.lower() for t in _TOKEN_RE.findall(text)]
27
+
28
+
29
class GraphRetriever:
    """In-memory entity-relation graph with JSON persistence.

    Edge tuples are `(neighbour, edge_type)`. The graph is undirected for
    expansion — adding `(a, b, "depends_on")` makes `b` a neighbour of `a`
    AND `a` a neighbour of `b`. The edge type travels with the neighbour so
    the caller can filter on it.

    When `json_path` is set, every mutation (`add_node`, `add_edge`) rewrites
    the JSON file, and an existing file is loaded on construction.
    """

    strategy = "graph"

    def __init__(self, json_path: str | Path | None = None) -> None:
        """Create an empty graph, loading `json_path` if that file exists."""
        self.json_path = str(json_path) if json_path is not None else None
        self._adj: dict[str, set[tuple[str, str]]] = defaultdict(set)
        self._node_text: dict[str, str] = {}  # optional node text body
        self._node_meta: dict[str, dict[str, Any]] = {}
        if self.json_path and Path(self.json_path).exists():
            self._load()

    # ----- mutation -----

    def add_node(
        self,
        node: str,
        text: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Register `node`; optionally (re)set its text body and metadata.

        Persists immediately when a `json_path` is configured, so nodes that
        never receive an edge still survive a reload.
        """
        self._touch(node, text, metadata)
        if self.json_path:
            self._save()

    def add_edge(self, a: str, b: str, edge_type: str = "rel") -> None:
        """Add an undirected typed edge between two nodes. Idempotent."""
        # `_touch` instead of `add_node` so the file is written once, below.
        self._touch(a)
        self._touch(b)
        self._adj[a].add((b, edge_type))
        self._adj[b].add((a, edge_type))
        if self.json_path:
            self._save()

    def _touch(
        self,
        node: str,
        text: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """In-memory part of `add_node`: no persistence side effect."""
        self._adj.setdefault(node, set())
        if text is not None:
            self._node_text[node] = text
        if metadata is not None:
            # Copy so later changes to the caller's dict don't leak in.
            self._node_meta[node] = dict(metadata)

    def neighbors(self, node: str, depth: int = 1) -> list[str]:
        """BFS up to `depth` hops. Returns nodes in discovery order, excluding
        the seed itself. Unknown nodes and `depth <= 0` give an empty list."""
        if depth <= 0 or node not in self._adj:
            return []
        seen: set[str] = {node}
        out: list[str] = []
        frontier: deque[tuple[str, int]] = deque([(node, 0)])
        while frontier:
            cur, hops = frontier.popleft()
            if hops >= depth:
                continue
            # Sorted so the order does not depend on set iteration order.
            for nbr, _etype in sorted(self._adj.get(cur, ())):
                if nbr in seen:
                    continue
                seen.add(nbr)
                out.append(nbr)
                frontier.append((nbr, hops + 1))
        return out

    # ----- retrieval -----

    def retrieve(self, query: str, k: int = 5, depth: int = 2) -> RetrievalResult:
        """Pick seeds whose name contains a query token; expand `depth` hops.

        Seeds score 1.0; a node reached after `h` hops scores `1 / (1 + h)`,
        keeping the best score over all seeds. When no node name matches, the
        first three nodes are used as seeds so the graph still surfaces
        something. Each returned document's metadata carries `score` and the
        node's sorted `edges`.

        Returns:
            The top `k` nodes as documents, with the mean of their scores as
            the result score. Empty when `k <= 0` or the graph has no nodes.
        """
        if k <= 0 or not self._adj:
            return RetrievalResult(query=query, documents=[], score=0.0,
                                   strategy=self.strategy)

        q_tokens = set(_tokens(query))
        seeds = [
            node for node in self._adj
            if any(t in node.lower() for t in q_tokens)
        ]
        if not seeds:
            # No direct hit — fall back to seeding from every node, capped.
            # Lets the graph still surface SOMETHING for ambient queries.
            seeds = list(self._adj)[:3]

        scored = self._score_from_seeds(seeds, depth)
        ranked = sorted(scored.items(), key=lambda p: p[1], reverse=True)[:k]

        docs: list[Document] = []
        for node, s in ranked:
            meta = dict(self._node_meta.get(node, {}))
            meta["score"] = s
            meta["edges"] = sorted(self._adj.get(node, ()))
            # Nodes without a text body are represented by their own name.
            docs.append(Document(id=node, text=self._node_text.get(node, node),
                                 metadata=meta))
        agg = sum(s for _, s in ranked) / len(ranked) if ranked else 0.0
        return RetrievalResult(query=query, documents=docs, score=agg,
                               strategy=self.strategy)

    def _score_from_seeds(self, seeds: list[str], depth: int) -> dict[str, float]:
        """Return the best closeness score per node reachable from `seeds`.

        A non-positive `depth` scores the seeds only.
        """
        scored: dict[str, float] = {}
        for seed in seeds:
            # Always 1.0 — also when an earlier seed already reached this
            # node as a neighbour and gave it a lower score.
            scored[seed] = 1.0
            seen: set[str] = {seed}
            frontier: deque[tuple[str, int]] = deque([(seed, 0)])
            while frontier:
                cur, hops = frontier.popleft()
                # `>=` so a negative depth cannot expand without bound.
                if hops >= depth:
                    continue
                for nbr, _etype in sorted(self._adj.get(cur, ())):
                    if nbr in seen:
                        continue
                    seen.add(nbr)
                    closeness = 1.0 / (1.0 + hops + 1)
                    scored[nbr] = max(scored.get(nbr, 0.0), closeness)
                    frontier.append((nbr, hops + 1))
        return scored

    # ----- persistence -----

    def _save(self) -> None:
        """Write adjacency, node text and node metadata to `json_path`."""
        if self.json_path is None:
            return
        target = Path(self.json_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Sets aren't JSON-serialisable — flatten to sorted lists.
        data = {
            "adj": {
                k: sorted([list(t) for t in v]) for k, v in self._adj.items()
            },
            "node_text": self._node_text,
            "node_meta": self._node_meta,
        }
        target.write_text(json.dumps(data, indent=2), encoding="utf-8")

    def _load(self) -> None:
        """Replace the in-memory graph with the contents of `json_path`."""
        if self.json_path is None:
            return
        data = json.loads(Path(self.json_path).read_text(encoding="utf-8"))
        # JSON stores each edge as a 2-item list; restore the tuple form.
        self._adj = defaultdict(set, {
            k: {tuple(t) for t in v} for k, v in data.get("adj", {}).items()
        })
        self._node_text = data.get("node_text", {})
        self._node_meta = data.get("node_meta", {})

    def nodes(self) -> Iterable[str]:
        """Return a view over all node names."""
        return self._adj.keys()
@@ -0,0 +1,205 @@
1
+ """Hybrid retriever — sparse FTS5 (BM25) blended with dense cosine.
2
+
3
+ SQLite FTS5 gives a real BM25 score via its `bm25()` ranking function.
4
+ For the dense leg, we use a *hashing trick* — every token is hashed into one
5
+ of `dim` buckets, weighted by sublinear TF, then ℓ2-normalised. Cosine
6
+ similarity is the dot product of two normalised vectors. Pure stdlib.
7
+
8
+ The combined score is `alpha * dense + (1 - alpha) * sparse`, both legs
9
+ normalised to [0, 1] across the candidate pool.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import hashlib
14
+ import json
15
+ import math
16
+ import re
17
+ import sqlite3
18
+ from collections import Counter
19
+ from pathlib import Path
20
+
21
+ from omega_engine.rag.base import Document, RetrievalResult
22
+
23
+
24
+ _TOKEN_RE = re.compile(r"[A-Za-z0-9]+")
25
+
26
+
27
+ def _tokens(text: str) -> list[str]:
28
+ return [t.lower() for t in _TOKEN_RE.findall(text)]
29
+
30
+
31
+ def _hash_to_bucket(token: str, dim: int) -> int:
32
+ # md5 is fast and stable — we only need a uniform-ish bucket index, not
33
+ # cryptographic security.
34
+ h = hashlib.md5(token.encode("utf-8")).digest()
35
+ return int.from_bytes(h[:4], "big") % dim
36
+
37
+
38
def _vectorise(text: str, dim: int) -> list[float]:
    """Build a `dim`-long hashed bag-of-words vector for `text`.

    Each token is hashed to a bucket; a bucket hit `tf` times gets weight
    `1 + log(tf)`. The vector is then scaled to unit ℓ2 length; a text with
    no tokens yields the all-zero vector unchanged.
    """
    bucket_counts: Counter[int] = Counter(
        _hash_to_bucket(tok, dim) for tok in _tokens(text)
    )
    weights = [0.0] * dim
    for idx, tf in bucket_counts.items():
        # Sublinear term frequency: repeated tokens add less and less.
        weights[idx] = 1.0 + math.log(tf)
    length = math.sqrt(sum(w * w for w in weights))
    if length == 0.0:
        return weights
    return [w / length for w in weights]
51
+
52
+
53
+ def _cosine(a: list[float], b: list[float]) -> float:
54
+ # Both vectors are already ℓ2-normalised → cosine == dot product.
55
+ return sum(x * y for x, y in zip(a, b))
56
+
57
+
58
+ def _normalise(values: list[float]) -> list[float]:
59
+ """Min-max scale a list into [0, 1]. Empty / flat → all zeros."""
60
+ if not values:
61
+ return values
62
+ lo, hi = min(values), max(values)
63
+ if hi - lo < 1e-12:
64
+ return [0.0 for _ in values]
65
+ return [(v - lo) / (hi - lo) for v in values]
66
+
67
+
68
# Two tables keyed by the same document id: an FTS5 index for the sparse
# leg, and a plain table holding text, JSON metadata and the JSON-encoded
# dense vector.
_SCHEMA = """
CREATE VIRTUAL TABLE IF NOT EXISTS docs_fts USING fts5(
    id UNINDEXED, text, tokenize = 'unicode61'
);
CREATE TABLE IF NOT EXISTS docs_meta (
    id TEXT PRIMARY KEY,
    text TEXT NOT NULL,
    meta TEXT NOT NULL DEFAULT '{}',
    vec TEXT NOT NULL
);
"""


class HybridRetriever:
    """Sparse FTS5 + dense hashed-cosine, combined per `alpha`.

    `alpha = 1` → pure dense. `alpha = 0` → pure sparse. Default 0.5 blends.

    The instance owns an open SQLite connection; call `close()` or use it as
    a context manager (`with HybridRetriever(path) as r: ...`).
    """

    strategy = "hybrid"

    def __init__(
        self,
        db_path: str | Path,
        *,
        dim: int = 256,
        alpha: float = 0.5,
    ) -> None:
        """Open (creating if needed) the SQLite database at `db_path`.

        Args:
            db_path: database file; its parent directory is created.
            dim: number of hash buckets of the dense vectors.
            alpha: weight of the dense leg in the combined score.

        Raises:
            ValueError: if `dim` is smaller than 1.
        """
        if dim < 1:
            # Fail here rather than with a ZeroDivisionError on first use.
            raise ValueError("dim must be a positive integer")
        self.db_path = str(db_path)
        self.dim = dim
        self.alpha = alpha
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
        # isolation_level=None → autocommit; transactions are explicit.
        self._conn = sqlite3.connect(self.db_path, isolation_level=None)
        self._conn.row_factory = sqlite3.Row
        self._conn.execute("PRAGMA journal_mode=WAL;")
        self._conn.execute("PRAGMA synchronous=NORMAL;")
        self._conn.executescript(_SCHEMA)

    def __enter__(self) -> "HybridRetriever":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    # ----- ingest -----

    def index(self, documents: list[Document]) -> int:
        """Index a batch. Returns the number of docs written. Idempotent on id.

        The whole batch runs in one explicit transaction: either every
        document lands in both tables, or none does.
        """
        written = 0
        # In autocommit mode `with self._conn:` opens no transaction, so the
        # boundaries are issued by hand.
        self._conn.execute("BEGIN")
        try:
            for d in documents:
                vec = _vectorise(d.text, self.dim)
                self._conn.execute(
                    "INSERT OR REPLACE INTO docs_meta (id, text, meta, vec) "
                    "VALUES (?, ?, ?, ?)",
                    (d.id, d.text, json.dumps(d.metadata), json.dumps(vec)),
                )
                # FTS5 has no native upsert — delete-then-insert is the idiom.
                self._conn.execute("DELETE FROM docs_fts WHERE id = ?", (d.id,))
                self._conn.execute(
                    "INSERT INTO docs_fts (id, text) VALUES (?, ?)", (d.id, d.text)
                )
                written += 1
            self._conn.execute("COMMIT")
        except BaseException:
            if self._conn.in_transaction:
                self._conn.execute("ROLLBACK")
            raise
        return written

    def count(self) -> int:
        """Return the number of indexed documents."""
        cur = self._conn.execute("SELECT COUNT(*) AS c FROM docs_meta")
        return int(cur.fetchone()["c"])

    # ----- retrieval -----

    def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
        """Return the top `k` documents by blended sparse/dense score.

        Both legs are min-max normalised over the candidate pool and mixed as
        `alpha * dense + (1 - alpha) * sparse`. Each document's metadata gets
        the combined value under `"score"`; the result score is the mean over
        the ranked candidates. Empty when `k <= 0` or nothing is indexed.
        """
        if k <= 0 or self.count() == 0:
            return RetrievalResult(query=query, documents=[], score=0.0,
                                   strategy=self.strategy)

        sparse_rows = self._sparse_scores(query, limit=k * 4)
        dense_rows = self._dense_scores(query)

        # --- combine: union of candidates, min-max each leg, alpha blend.
        candidates = set(sparse_rows) | set(dense_rows)
        if not candidates:
            return RetrievalResult(query=query, documents=[], score=0.0,
                                   strategy=self.strategy)

        # Sorted ids + stable sort below → ties resolve the same on every run.
        ids = sorted(candidates)
        sparse_n = _normalise([sparse_rows.get(i, 0.0) for i in ids])
        dense_n = _normalise([dense_rows.get(i, 0.0) for i in ids])
        combined = [
            (self.alpha * d) + ((1 - self.alpha) * s)
            for d, s in zip(dense_n, sparse_n)
        ]

        ranked = sorted(zip(ids, combined), key=lambda p: p[1], reverse=True)[:k]
        docs: list[Document] = []
        for doc_id, score in ranked:
            row = self._conn.execute(
                "SELECT text, meta FROM docs_meta WHERE id = ?", (doc_id,)
            ).fetchone()
            if not row:
                # Present in the FTS table only — nothing to return for it.
                continue
            meta = json.loads(row["meta"])
            meta["score"] = score
            docs.append(Document(id=doc_id, text=row["text"], metadata=meta))

        agg = sum(s for _, s in ranked) / len(ranked) if ranked else 0.0
        return RetrievalResult(query=query, documents=docs, score=agg,
                               strategy=self.strategy)

    def _sparse_scores(self, query: str, limit: int) -> dict[str, float]:
        """Sparse leg: id → negated bm25 for up to `limit` FTS5 matches."""
        fts_query = _fts5_safe(query)
        if not fts_query:
            return {}
        try:
            cur = self._conn.execute(
                "SELECT id, bm25(docs_fts) AS score FROM docs_fts "
                "WHERE docs_fts MATCH ? ORDER BY score LIMIT ?",
                (fts_query, limit),
            )
            # bm25() returns a NEGATIVE score (lower is better), so negate it
            # to get "higher is better" before normalising.
            return {row["id"]: -float(row["score"]) for row in cur.fetchall()}
        except sqlite3.OperationalError:
            # malformed FTS5 query — fall back to dense-only
            return {}

    def _dense_scores(self, query: str) -> dict[str, float]:
        """Dense leg: id → cosine against every indexed document.

        A full scan, meant for small corpora; for larger corpora a clustered
        ANN goes here, contract identical.
        """
        qvec = _vectorise(query, self.dim)
        cur = self._conn.execute("SELECT id, vec FROM docs_meta")
        return {
            row["id"]: _cosine(qvec, json.loads(row["vec"]))
            for row in cur.fetchall()
        }

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()
200
+
201
def _fts5_safe(query: str) -> str:
    """Turn a freeform query into an FTS5 expression of OR-ed tokens.

    Only the tokens found by `_tokens` survive, so FTS5 syntax characters are
    dropped. Returns "" when no token remains.
    """
    terms = _tokens(query)
    if not terms:
        return ""
    return " OR ".join(terms)
@@ -0,0 +1,136 @@
1
+ """Multimodal retriever — PDFs and images on top of a text retriever.
2
+
3
+ v1 keeps the surface honest:
4
+ * PDFs are converted to text via `pdftotext` if the binary is on PATH,
5
+ otherwise the doc is registered with its path + an empty body (no fake
6
+ text, no hallucinated extraction).
7
+ * Images get a caption from the provider (role "rag-caption") and are
8
+ indexed by that caption + their path. If the provider returns nothing,
9
+ the image still indexes by filename.
10
+
11
+ The underlying text index is any text-mode Retriever (typically
12
+ HybridRetriever) — so a query "the architecture diagram explaining the
13
+ event log" can surface an image whose caption matches.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import shutil
18
+ import subprocess
19
+ from pathlib import Path
20
+
21
+ from omega_engine.provider import AgentProvider, AgentRequest
22
+ from omega_engine.rag.base import Document, RetrievalResult, Retriever
23
+
24
+
25
+ def _pdftotext_available() -> bool:
26
+ return shutil.which("pdftotext") is not None
27
+
28
+
29
def _extract_pdf_text(path: Path) -> str:
    """Best-effort PDF → text via the `pdftotext` command-line tool.

    Returns "" when `pdftotext` is missing, exits non-zero, cannot be
    started, or runs longer than 15 seconds. Never fabricates output.
    """
    if not _pdftotext_available():
        return ""
    try:
        out = subprocess.run(
            # Request UTF-8 explicitly and decode it as UTF-8, replacing
            # undecodable bytes: with the locale codec a stray byte would
            # raise UnicodeDecodeError, which is not caught below.
            ["pdftotext", "-q", "-enc", "UTF-8", str(path), "-"],
            capture_output=True, encoding="utf-8", errors="replace",
            timeout=15, check=False,
        )
    except (subprocess.TimeoutExpired, OSError):
        return ""
    return out.stdout if out.returncode == 0 else ""
42
+
43
+
44
class MultimodalRetriever:
    """Wraps a text retriever; understands `add_pdf` and `add_image`.

    Documents are handed to the inner retriever's `index` method when it has
    one; retrieval is delegated to `inner.retrieve` and relabelled with this
    class's strategy.
    """

    strategy = "multimodal"

    # `inner` is typed as `Retriever`, but only `retrieve` is required. An
    # inner object that *also* has a callable `index` (HybridRetriever does)
    # receives every document added here.
    def __init__(
        self,
        inner: Retriever,
        provider: AgentProvider | None = None,
    ) -> None:
        """Store the inner retriever and the optional captioning provider."""
        self.inner = inner
        self.provider = provider
        # id → the metadata dict of each document added through this wrapper
        # ("modality" plus e.g. "source", "caption", "extracted").
        self._registered: dict[str, dict] = {}

    # ----- ingest -----

    def add_pdf(self, doc_id: str, path: str | Path) -> Document:
        """Extract text via pdftotext if available; index whatever survives.

        When nothing could be extracted the document body is the placeholder
        `[pdf:<file name>]` and `metadata["extracted"]` is False.
        """
        p = Path(path)
        text = _extract_pdf_text(p)
        meta = {
            "modality": "pdf",
            "source": str(p),
            "extracted": bool(text),
        }
        doc = Document(id=doc_id, text=text or f"[pdf:{p.name}]", metadata=meta)
        self._register(doc)
        return doc

    def add_image(
        self,
        doc_id: str,
        path: str | Path,
        *,
        caption: str | None = None,
    ) -> Document:
        """Index an image by its caption. Caption is provider-generated
        unless one is passed in. With no usable caption, we fall back to
        the filename so the image is still discoverable."""
        p = Path(path)
        final_caption = self._resolve_caption(p, caption)
        meta = {
            "modality": "image",
            "source": str(p),
            "caption": final_caption,
        }
        doc = Document(id=doc_id, text=final_caption, metadata=meta)
        self._register(doc)
        return doc

    def _resolve_caption(self, path: Path, caption: str | None) -> str:
        """Pick the caption text for an image.

        Order: explicit `caption`, else the provider's `"caption"` artifact
        (role "rag-caption"), else the file stem with `_` and `-` turned into
        spaces. The result is always a stripped string; a blank candidate
        counts as missing.
        """
        chosen = caption
        if chosen is None and self.provider is not None:
            result = self.provider.run(AgentRequest(
                role="rag-caption",
                prompt=f"Caption the image at {path}",
                context={"path": str(path)},
            ))
            chosen = (result.artifacts or {}).get("caption")
        # Coerce to text: the provider's artifact is not guaranteed to be a
        # string, and the caption becomes the indexed document body.
        text = str(chosen).strip() if chosen else ""
        if not text:
            text = path.stem.replace("_", " ").replace("-", " ")
        return text

    def add_text(self, doc_id: str, text: str, **meta) -> Document:
        """Plain text passthrough — keeps the retriever uniformly usable.

        Extra keyword arguments become metadata and may override the default
        `"modality": "text"`.
        """
        m = {"modality": "text", **meta}
        doc = Document(id=doc_id, text=text, metadata=m)
        self._register(doc)
        return doc

    def _register(self, doc: Document) -> None:
        """Remember the document's metadata and index it if `inner` can."""
        self._registered[doc.id] = doc.metadata
        index = getattr(self.inner, "index", None)
        if callable(index):
            index([doc])
        # If the inner retriever has no `index`, the caller is responsible
        # for populating it elsewhere — we still track the modality metadata.

    # ----- retrieval -----

    def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
        """Delegate to the inner retriever and tag known docs with modality.

        A document registered through this wrapper gets `metadata["modality"]`
        filled in when it is not already set. Documents and score come from
        the inner result; the strategy is "multimodal".
        """
        result = self.inner.retrieve(query, k=k)
        for d in result.documents:
            known = self._registered.get(d.id)
            if known is not None:
                d.metadata.setdefault("modality", known.get("modality", "text"))
        return RetrievalResult(
            query=query,
            documents=result.documents,
            score=result.score,
            strategy=self.strategy,
        )