@agentikos/omega-os 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -13
- package/bootstrap/lib/steps.sh +214 -9
- package/bootstrap/manifest.example.yaml +6 -1
- package/docs/COMPLETION-PLAN.md +48 -0
- package/omega/Agentik_Engine/README.md +25 -10
- package/omega/Agentik_Engine/omega_engine/__init__.py +66 -2
- package/omega/Agentik_Engine/omega_engine/account.py +505 -0
- package/omega/Agentik_Engine/omega_engine/autonomous.py +538 -0
- package/omega/Agentik_Engine/omega_engine/cli.py +467 -29
- package/omega/Agentik_Engine/omega_engine/daemons/__init__.py +14 -0
- package/omega/Agentik_Engine/omega_engine/daemons/autonomous.py +56 -0
- package/omega/Agentik_Engine/omega_engine/daemons/engine.py +187 -0
- package/omega/Agentik_Engine/omega_engine/daemons/telegram.py +231 -0
- package/omega/Agentik_Engine/omega_engine/educators/__init__.py +51 -0
- package/omega/Agentik_Engine/omega_engine/educators/artifact.py +65 -0
- package/omega/Agentik_Engine/omega_engine/educators/automation.py +76 -0
- package/omega/Agentik_Engine/omega_engine/educators/base.py +327 -0
- package/omega/Agentik_Engine/omega_engine/educators/claudecode.py +71 -0
- package/omega/Agentik_Engine/omega_engine/educators/connection.py +75 -0
- package/omega/Agentik_Engine/omega_engine/educators/coworker.py +68 -0
- package/omega/Agentik_Engine/omega_engine/educators/loop.py +82 -0
- package/omega/Agentik_Engine/omega_engine/educators/prompt.py +68 -0
- package/omega/Agentik_Engine/omega_engine/educators/skill.py +69 -0
- package/omega/Agentik_Engine/omega_engine/executor.py +46 -6
- package/omega/Agentik_Engine/omega_engine/mission.py +13 -1
- package/omega/Agentik_Engine/omega_engine/provider.py +247 -1
- package/omega/Agentik_Engine/omega_engine/rag/__init__.py +21 -0
- package/omega/Agentik_Engine/omega_engine/rag/agentic.py +83 -0
- package/omega/Agentik_Engine/omega_engine/rag/base.py +42 -0
- package/omega/Agentik_Engine/omega_engine/rag/corrective.py +119 -0
- package/omega/Agentik_Engine/omega_engine/rag/graph.py +169 -0
- package/omega/Agentik_Engine/omega_engine/rag/hybrid.py +205 -0
- package/omega/Agentik_Engine/omega_engine/rag/multimodal.py +136 -0
- package/omega/Agentik_Engine/omega_engine/rag/router.py +110 -0
- package/omega/Agentik_Engine/omega_engine/reducer.py +21 -3
- package/omega/Agentik_Engine/omega_engine/store.py +65 -5
- package/omega/Agentik_Engine/omega_engine/sync.py +304 -0
- package/omega/Agentik_Engine/omega_engine/tools.py +272 -0
- package/omega/Agentik_Engine/pyproject.toml +1 -1
- package/omega/Agentik_Engine/tests/test_account.py +333 -0
- package/omega/Agentik_Engine/tests/test_autonomous.py +361 -0
- package/omega/Agentik_Engine/tests/test_educators.py +233 -0
- package/omega/Agentik_Engine/tests/test_rag.py +287 -0
- package/omega/Agentik_Engine/tests/test_snapshot_partial.py +172 -0
- package/omega/Agentik_Engine/tests/test_tools_and_sync.py +312 -0
- package/omega/Agentik_SSOT/skills/rag-route.md +73 -0
- package/package.json +1 -1
- package/omega/Agentik_Engine/omega_engine/__pycache__/__init__.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/audit.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/audit_arsenal.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/barrier.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/bus.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/cli.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/events.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/executor.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/mission.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/progress.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/project.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/provider.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/reducer.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/report.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/router.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/store.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/supervisor.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/task.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/omega_engine/__pycache__/telegram.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/tests/__pycache__/test_audit_arsenal.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/tests/__pycache__/test_executor.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/tests/__pycache__/test_mission.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/tests/__pycache__/test_progress.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/tests/__pycache__/test_project.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/tests/__pycache__/test_reducer.cpython-313.pyc +0 -0
- package/omega/Agentik_Engine/tests/__pycache__/test_report.cpython-313.pyc +0 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Corrective RAG (CRAG) — the envelope every strategy wears.
|
|
2
|
+
|
|
3
|
+
For each retrieval, the provider (role "rag-grader") scores every doc 0-100.
|
|
4
|
+
If the average is below `threshold`, the provider (role "rag-agent") produces
|
|
5
|
+
a refined query and we re-retrieve. Bounded retries — never an infinite loop.
|
|
6
|
+
|
|
7
|
+
This is the user's explicit design: corrective wraps the chosen strategy.
|
|
8
|
+
The router resolves WHICH retriever runs; the corrective layer ensures the
|
|
9
|
+
RESULT meets a quality floor.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from omega_engine.provider import AgentProvider, AgentRequest
|
|
14
|
+
from omega_engine.rag.base import Document, RetrievalResult, Retriever
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CorrectiveRetriever:
    """Grade → if low, refine query and retry. Bounded by `max_retries`.

    Wraps another retriever (`inner`). After each retrieval the provider is
    asked (role "rag-grader") to score the returned documents; when the
    average falls below `threshold` the provider is asked (role "rag-agent")
    for a refined query and the inner retriever runs again.

    Args:
        inner: The retriever whose results are graded.
        provider: Provider used for both grading and query refinement.
        threshold: Minimum acceptable average grade, on the 0-100 scale.
        max_retries: Number of refinement rounds allowed after the first
            attempt. Negative values are treated as 0.
    """

    strategy = "corrective"

    def __init__(
        self,
        inner: Retriever,
        provider: AgentProvider,
        *,
        threshold: float = 70.0,
        max_retries: int = 2,
    ) -> None:
        self.inner = inner
        self.provider = provider
        self.threshold = threshold
        self.max_retries = max_retries

    def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
        """Retrieve with the inner strategy, grading and retrying as needed.

        The returned result always carries the caller's original `query`,
        even when the documents came from a refined query, and its strategy
        is reported as ``corrective+<inner strategy>``.
        """
        # A negative budget still gets one attempt instead of silently
        # returning an empty result without ever querying `inner`.
        retries = max(0, self.max_retries)
        current = query
        last: RetrievalResult | None = None

        for attempt in range(retries + 1):
            result = self.inner.retrieve(current, k=k)
            last = result
            if not result.documents:
                # Nothing to grade — fall through to the fallback below.
                break

            grade_avg = self._grade(current, result.documents)
            # Stamp the grade onto every document so the caller can see it.
            for d in result.documents:
                d.metadata["grader_avg"] = grade_avg
            result.score = max(result.score, grade_avg / 100.0)

            if grade_avg >= self.threshold or attempt == retries:
                # Either good enough OR we've used up our retries.
                return RetrievalResult(
                    query=query,
                    documents=result.documents,
                    score=result.score,
                    strategy=f"corrective+{result.strategy}",
                )

            # Below threshold AND retries remain — ask provider for a refined
            # query and try again.
            current = self._refine(query, current, result.documents)

        # Defensive fallback — the loop ended without returning (e.g. zero
        # docs from inner). Surface what we have, untouched.
        if last is None:
            return RetrievalResult(query=query, documents=[], score=0.0,
                                   strategy="corrective+empty")
        return RetrievalResult(
            query=query,
            documents=last.documents,
            score=last.score,
            strategy=f"corrective+{last.strategy}",
        )

    # ----- internals -----

    def _grade(self, query: str, docs: list[Document]) -> float:
        """Ask the grader for per-document scores; return their 0-100 average.

        Only the first 300 characters of each document are sent to the
        provider. An absent or unusable ``scores`` artifact grades as 0.0.
        """
        result = self.provider.run(AgentRequest(
            role="rag-grader",
            prompt=f"Score relevance 0-100 for query: {query}",
            context={
                "query": query,
                "documents": [
                    {"id": d.id, "text": d.text[:300]} for d in docs
                ],
            },
        ))
        raw = (result.artifacts or {}).get("scores", [])
        return self._average_score(raw)

    @staticmethod
    def _average_score(raw: object) -> float:
        """Average the usable numeric entries of `raw`, clamped to [0, 100].

        Non-numeric entries are ignored, and so are NaN and ±inf: a NaN
        average would otherwise slip through ``min``/``max`` and come out as
        a perfect 100. Returns 0.0 when nothing usable remains or when `raw`
        is not iterable.
        """
        try:
            items = list(raw)  # type: ignore[call-overload]
        except TypeError:
            return 0.0

        scores: list[float] = []
        for item in items:
            if not isinstance(item, (int, float)):
                continue
            try:
                value = float(item)
            except OverflowError:
                continue
            # Chained comparison is False for NaN and for either infinity.
            if float("-inf") < value < float("inf"):
                scores.append(value)

        if not scores:
            return 0.0
        avg = sum(scores) / len(scores)
        return max(0.0, min(100.0, avg))

    def _refine(self, original: str, current: str, docs: list[Document]) -> str:
        """Ask the provider for a refined query. Fallback: append context.

        At most three documents, truncated to 200 characters each, are sent
        as snippets. A missing, blank, or unchanged suggestion triggers the
        fallback so the next attempt never repeats the same query.
        """
        result = self.provider.run(AgentRequest(
            role="rag-agent",
            prompt=(
                f"Original query: {original}\n"
                f"Current query: {current}\n"
                "These docs scored below the relevance threshold. "
                "Suggest a better query."
            ),
            context={
                "original_query": original,
                "current_query": current,
                "snippets": [d.text[:200] for d in docs[:3]],
            },
        ))
        next_q = (result.artifacts or {}).get("next_query")
        candidate = str(next_q).strip() if next_q else ""
        if candidate and candidate != current:
            return candidate
        # Provider had nothing useful — perturb the query so the inner
        # retriever sees something different.
        return f"{current} context"
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Graph retriever — entity-relation graph with depth-limited expansion.
|
|
2
|
+
|
|
3
|
+
Plain Python: dict-of-sets adjacency, typed edges. Persists to JSON. The
|
|
4
|
+
retrieval contract picks seed nodes from query tokens, expands `depth` hops,
|
|
5
|
+
and returns each visited node as a Document.
|
|
6
|
+
|
|
7
|
+
This is the "second strategy" in the multi-RAG router: when a query is
|
|
8
|
+
relational ("who works on X", "what depends on Y"), the graph wins over
|
|
9
|
+
hybrid.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import re
|
|
15
|
+
from collections import defaultdict, deque
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Iterable
|
|
18
|
+
|
|
19
|
+
from omega_engine.rag.base import Document, RetrievalResult
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _tokens(text: str) -> list[str]:
|
|
26
|
+
return [t.lower() for t in _TOKEN_RE.findall(text)]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class GraphRetriever:
    """In-memory entity-relation graph with JSON persistence.

    Edge tuples are `(neighbour, edge_type)`. The graph is undirected for
    expansion — adding `(a, b, "depends_on")` makes `b` a neighbour of `a`
    AND `a` a neighbour of `b`. The edge type travels with the neighbour so
    the caller can filter on it.

    When `json_path` is given, the graph is loaded from it on construction
    (if the file exists) and rewritten after every `add_node` / `add_edge`.
    """

    strategy = "graph"

    def __init__(self, json_path: str | Path | None = None) -> None:
        self.json_path = str(json_path) if json_path is not None else None
        self._adj: dict[str, set[tuple[str, str]]] = defaultdict(set)
        self._node_text: dict[str, str] = {}  # optional node text body
        self._node_meta: dict[str, dict[str, Any]] = {}
        if self.json_path and Path(self.json_path).exists():
            self._load()

    # ----- mutation -----

    def add_node(
        self,
        node: str,
        text: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Register `node`, optionally attaching a text body and metadata.

        Existing text/metadata is kept when the corresponding argument is
        None. The change is persisted immediately when a JSON path is set,
        so node bodies survive even if no edge is ever added.
        """
        self._adj.setdefault(node, set())
        if text is not None:
            self._node_text[node] = text
        if metadata is not None:
            self._node_meta[node] = dict(metadata)
        if self.json_path:
            self._save()

    def add_edge(self, a: str, b: str, edge_type: str = "rel") -> None:
        """Add an undirected typed edge between two nodes. Idempotent."""
        # Touch the adjacency map directly (not via add_node) so a single
        # edge costs one write to disk rather than three.
        self._adj.setdefault(a, set()).add((b, edge_type))
        self._adj.setdefault(b, set()).add((a, edge_type))
        if self.json_path:
            self._save()

    def neighbors(self, node: str, depth: int = 1) -> list[str]:
        """BFS up to `depth` hops. Returns nodes in discovery order, excluding
        the seed itself. Neighbours within a hop are visited in sorted order
        so the result is deterministic."""
        if depth <= 0 or node not in self._adj:
            return []
        seen: set[str] = {node}
        out: list[str] = []
        q: deque[tuple[str, int]] = deque([(node, 0)])
        while q:
            cur, d = q.popleft()
            if d == depth:
                continue
            for nbr, _etype in sorted(self._adj.get(cur, ())):
                if nbr in seen:
                    continue
                seen.add(nbr)
                out.append(nbr)
                q.append((nbr, d + 1))
        return out

    # ----- retrieval -----

    def retrieve(self, query: str, k: int = 5, depth: int = 2) -> RetrievalResult:
        """Pick seeds whose name overlaps a query token; expand `depth` hops.

        A node is a seed when any query token is a substring of its
        lower-cased name. Seeds score 1.0; a node first reached after `h`
        hops scores ``1 / (1 + h)``; each node keeps its best score across
        all seeds. The result score is the mean over the returned documents.
        Returns an empty result when the graph is empty or `k` is not
        positive.
        """
        if k <= 0 or not self._adj:
            return RetrievalResult(query=query, documents=[], score=0.0,
                                   strategy=self.strategy)

        q_tokens = set(_tokens(query))
        seeds = [
            node for node in self._adj
            if any(t in node.lower() for t in q_tokens)
        ]
        if not seeds:
            # No direct hit — fall back to seeding from the first few nodes.
            # Lets the graph still surface SOMETHING for ambient queries.
            seeds = list(self._adj)[:3]

        scored = self._score_nodes(seeds, depth)

        ranked = sorted(scored.items(), key=lambda p: p[1], reverse=True)[:k]
        docs: list[Document] = []
        for node, s in ranked:
            text = self._node_text.get(node, node)
            meta = dict(self._node_meta.get(node, {}))
            meta["score"] = s
            meta["edges"] = sorted(self._adj.get(node, ()))
            docs.append(Document(id=node, text=text, metadata=meta))
        agg = sum(s for _, s in ranked) / len(ranked) if ranked else 0.0
        return RetrievalResult(query=query, documents=docs, score=agg,
                               strategy=self.strategy)

    def _score_nodes(self, seeds: list[str], depth: int) -> dict[str, float]:
        """Score every node within `depth` hops of any seed by closeness."""
        scored: dict[str, float] = {}
        for seed in seeds:
            # Assign, don't setdefault: a seed already reached as a neighbour
            # of an earlier seed must still be raised to the full seed score.
            scored[seed] = 1.0
            seen: set[str] = {seed}
            q: deque[tuple[str, int]] = deque([(seed, 0)])
            while q:
                cur, d = q.popleft()
                # `>=` (not `==`) keeps a negative depth from expanding the
                # whole connected component.
                if d >= depth:
                    continue
                # Sorted so tie order does not depend on set iteration order.
                for nbr, _etype in sorted(self._adj.get(cur, ())):
                    if nbr in seen:
                        continue
                    seen.add(nbr)
                    s = 1.0 / (1.0 + d + 1)
                    scored[nbr] = max(scored.get(nbr, 0.0), s)
                    q.append((nbr, d + 1))
        return scored

    # ----- persistence -----

    def _save(self) -> None:
        """Write the whole graph to `json_path`, creating parent directories."""
        assert self.json_path is not None
        target = Path(self.json_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Sets aren't JSON-serialisable — flatten to sorted lists.
        data = {
            "adj": {
                k: sorted([list(t) for t in v]) for k, v in self._adj.items()
            },
            "node_text": self._node_text,
            "node_meta": self._node_meta,
        }
        # json.dumps escapes non-ASCII by default, so the payload is ASCII
        # and pinning UTF-8 stays compatible with previously written files.
        target.write_text(json.dumps(data, indent=2), encoding="utf-8")

    def _load(self) -> None:
        """Replace the in-memory graph with the contents of `json_path`."""
        assert self.json_path is not None
        data = json.loads(Path(self.json_path).read_text(encoding="utf-8"))
        self._adj = defaultdict(set, {
            k: {tuple(t) for t in v} for k, v in data.get("adj", {}).items()
        })
        self._node_text = data.get("node_text", {})
        self._node_meta = data.get("node_meta", {})

    def nodes(self) -> Iterable[str]:
        """Return a live view of all node names."""
        return self._adj.keys()
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""Hybrid retriever — sparse FTS5 (BM25) blended with dense cosine.
|
|
2
|
+
|
|
3
|
+
SQLite FTS5 gives a real BM25 score via its `bm25()` ranking function.
|
|
4
|
+
For the dense leg, we use a *hashing trick* — every token is hashed into one
|
|
5
|
+
of `dim` buckets, weighted by sublinear TF, then ℓ2-normalised. Cosine
|
|
6
|
+
similarity is the dot product of two normalised vectors. Pure stdlib.
|
|
7
|
+
|
|
8
|
+
The combined score is `alpha * dense + (1 - alpha) * sparse`, both legs
|
|
9
|
+
normalised to [0, 1] across the candidate pool.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import hashlib
|
|
14
|
+
import json
|
|
15
|
+
import math
|
|
16
|
+
import re
|
|
17
|
+
import sqlite3
|
|
18
|
+
from collections import Counter
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from omega_engine.rag.base import Document, RetrievalResult
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
_TOKEN_RE = re.compile(r"[A-Za-z0-9]+")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _tokens(text: str) -> list[str]:
|
|
28
|
+
return [t.lower() for t in _TOKEN_RE.findall(text)]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _hash_to_bucket(token: str, dim: int) -> int:
|
|
32
|
+
# md5 is fast and stable — we only need a uniform-ish bucket index, not
|
|
33
|
+
# cryptographic security.
|
|
34
|
+
h = hashlib.md5(token.encode("utf-8")).digest()
|
|
35
|
+
return int.from_bytes(h[:4], "big") % dim
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _vectorise(text: str, dim: int) -> list[float]:
    """Hashed-feature vector with sublinear TF, ℓ2-normalised.

    Every token of `text` is hashed into one of `dim` buckets. A bucket hit
    `c` times gets weight ``1 + log(c)``. Text with no tokens yields the
    all-zero vector unchanged.
    """
    bucket_counts: Counter[int] = Counter(
        _hash_to_bucket(tok, dim) for tok in _tokens(text)
    )
    weights = [0.0] * dim
    for idx, tf in bucket_counts.items():
        # 1 + log(tf) damps buckets that are hit many times (sublinear TF).
        weights[idx] = 1.0 + math.log(tf)
    length = math.sqrt(sum(w * w for w in weights))
    if length == 0.0:
        return weights
    return [w / length for w in weights]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _cosine(a: list[float], b: list[float]) -> float:
|
|
54
|
+
# Both vectors are already ℓ2-normalised → cosine == dot product.
|
|
55
|
+
return sum(x * y for x, y in zip(a, b))
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _normalise(values: list[float]) -> list[float]:
|
|
59
|
+
"""Min-max scale a list into [0, 1]. Empty / flat → all zeros."""
|
|
60
|
+
if not values:
|
|
61
|
+
return values
|
|
62
|
+
lo, hi = min(values), max(values)
|
|
63
|
+
if hi - lo < 1e-12:
|
|
64
|
+
return [0.0 for _ in values]
|
|
65
|
+
return [(v - lo) / (hi - lo) for v in values]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# docs_fts is the FTS5 index used for the sparse (BM25) leg; docs_meta holds
# the document body, its JSON metadata, and the JSON-encoded dense vector.
_SCHEMA = """
CREATE VIRTUAL TABLE IF NOT EXISTS docs_fts USING fts5(
    id UNINDEXED, text, tokenize = 'unicode61'
);
CREATE TABLE IF NOT EXISTS docs_meta (
    id TEXT PRIMARY KEY,
    text TEXT NOT NULL,
    meta TEXT NOT NULL DEFAULT '{}',
    vec TEXT NOT NULL
);
"""


class HybridRetriever:
    """Sparse FTS5 + dense hashed-cosine, combined per `alpha`.

    `alpha = 1` → pure dense. `alpha = 0` → pure sparse. Default 0.5 blends.

    Args:
        db_path: SQLite database file; parent directories are created.
        dim: Number of hash buckets in the dense vectors.
        alpha: Weight of the dense leg in the blended score.
    """

    strategy = "hybrid"

    def __init__(
        self,
        db_path: str | Path,
        *,
        dim: int = 256,
        alpha: float = 0.5,
    ) -> None:
        self.db_path = str(db_path)
        self.dim = dim
        self.alpha = alpha
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
        # isolation_level=None puts the connection in autocommit mode:
        # sqlite3 never opens a transaction implicitly, so write paths that
        # need atomicity must issue BEGIN/COMMIT themselves.
        self._conn = sqlite3.connect(self.db_path, isolation_level=None)
        self._conn.row_factory = sqlite3.Row
        self._conn.execute("PRAGMA journal_mode=WAL;")
        self._conn.execute("PRAGMA synchronous=NORMAL;")
        self._conn.executescript(_SCHEMA)

    # ----- ingest -----

    def index(self, documents: list[Document]) -> int:
        """Index a batch. Returns the number of docs written. Idempotent on id.

        The whole batch is written in one explicit transaction: either every
        document lands in both tables or, if anything raises, none do.
        """
        if not documents:
            return 0
        conn = self._conn
        n = 0
        # Explicit BEGIN: in autocommit mode `with conn:` opens no
        # transaction, which would commit each statement on its own and
        # could leave docs_meta and docs_fts out of sync on failure.
        conn.execute("BEGIN")
        try:
            for d in documents:
                vec = _vectorise(d.text, self.dim)
                conn.execute(
                    "INSERT OR REPLACE INTO docs_meta (id, text, meta, vec) "
                    "VALUES (?, ?, ?, ?)",
                    (d.id, d.text, json.dumps(d.metadata), json.dumps(vec)),
                )
                # FTS5 has no native upsert — delete-then-insert is the idiom.
                conn.execute("DELETE FROM docs_fts WHERE id = ?", (d.id,))
                conn.execute(
                    "INSERT INTO docs_fts (id, text) VALUES (?, ?)",
                    (d.id, d.text),
                )
                n += 1
        except BaseException:
            conn.execute("ROLLBACK")
            raise
        conn.execute("COMMIT")
        return n

    def count(self) -> int:
        """Return the number of documents in `docs_meta`."""
        cur = self._conn.execute("SELECT COUNT(*) AS c FROM docs_meta")
        return int(cur.fetchone()["c"])

    # ----- retrieval -----

    def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
        """Return the top-`k` documents by blended sparse/dense score.

        Each leg is min-max normalised across the candidate pool before
        blending, so scores are relative to the pool: when a leg is flat
        (e.g. a single candidate) it contributes 0 for every document.
        Returns an empty result when `k` is not positive or nothing is
        indexed.
        """
        if k <= 0 or self.count() == 0:
            return RetrievalResult(query=query, documents=[], score=0.0,
                                   strategy=self.strategy)

        # --- sparse leg: FTS5 + bm25. bm25() returns a NEGATIVE score (lower
        # is better), so we negate it before normalising.
        sparse_rows: dict[str, float] = {}
        fts_query = _fts5_safe(query)
        if fts_query:
            try:
                cur = self._conn.execute(
                    "SELECT id, bm25(docs_fts) AS score FROM docs_fts "
                    "WHERE docs_fts MATCH ? ORDER BY score LIMIT ?",
                    (fts_query, k * 4),
                )
                for row in cur.fetchall():
                    sparse_rows[row["id"]] = -float(row["score"])
            except sqlite3.OperationalError:
                # malformed FTS5 query — fall back to dense-only
                sparse_rows = {}

        # --- dense leg: cosine over every indexed doc (small corpora; for
        # larger corpora a clustered ANN goes here, contract identical).
        qvec = _vectorise(query, self.dim)
        dense_rows: dict[str, float] = {}
        cur = self._conn.execute("SELECT id, vec FROM docs_meta")
        for row in cur.fetchall():
            dense_rows[row["id"]] = _cosine(qvec, json.loads(row["vec"]))

        # --- combine: union of candidates, min-max each leg, alpha blend.
        candidates = set(sparse_rows) | set(dense_rows)
        if not candidates:
            return RetrievalResult(query=query, documents=[], score=0.0,
                                   strategy=self.strategy)

        ids = list(candidates)
        # A doc missing from one leg counts as 0.0 there before scaling.
        sparse_n = _normalise([sparse_rows.get(i, 0.0) for i in ids])
        dense_n = _normalise([dense_rows.get(i, 0.0) for i in ids])
        combined = [
            (self.alpha * d) + ((1 - self.alpha) * s)
            for d, s in zip(dense_n, sparse_n)
        ]

        ranked = sorted(zip(ids, combined), key=lambda p: p[1], reverse=True)[:k]
        docs: list[Document] = []
        for doc_id, score in ranked:
            cur = self._conn.execute(
                "SELECT text, meta FROM docs_meta WHERE id = ?", (doc_id,)
            )
            row = cur.fetchone()
            if not row:
                # Present in the FTS index but not in docs_meta — skip.
                continue
            meta = json.loads(row["meta"])
            meta["score"] = score
            docs.append(Document(id=doc_id, text=row["text"], metadata=meta))

        agg = sum(s for _, s in ranked) / len(ranked) if ranked else 0.0
        return RetrievalResult(query=query, documents=docs, score=agg,
                               strategy=self.strategy)

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _fts5_safe(query: str) -> str:
    """Turn a freeform query into an FTS5 expression.

    Only the alphanumeric tokens survive (so FTS5 syntax characters are
    dropped); they are joined with ``OR``. Returns "" when no token remains.
    """
    terms = _tokens(query)
    if not terms:
        return ""
    return " OR ".join(terms)
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Multimodal retriever — PDFs and images on top of a text retriever.
|
|
2
|
+
|
|
3
|
+
v1 keeps the surface honest:
|
|
4
|
+
* PDFs are converted to text via `pdftotext` if the binary is on PATH,
|
|
5
|
+
otherwise the doc is registered with its path + an empty body (no fake
|
|
6
|
+
text, no hallucinated extraction).
|
|
7
|
+
* Images get a caption from the provider (role "rag-caption") and are
|
|
8
|
+
indexed by that caption + their path. If the provider returns nothing,
|
|
9
|
+
the image still indexes by filename.
|
|
10
|
+
|
|
11
|
+
The underlying text index is any text-mode Retriever (typically
|
|
12
|
+
HybridRetriever) — so a query "the architecture diagram explaining the
|
|
13
|
+
event log" can surface an image whose caption matches.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import shutil
|
|
18
|
+
import subprocess
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from omega_engine.provider import AgentProvider, AgentRequest
|
|
22
|
+
from omega_engine.rag.base import Document, RetrievalResult, Retriever
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _pdftotext_available() -> bool:
|
|
26
|
+
return shutil.which("pdftotext") is not None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _extract_pdf_text(path: Path) -> str:
    """Best-effort PDF → text. Returns "" when pdftotext is missing or fails.

    NEVER fakes output (Karpathy: think before coding — no hallucination).
    "Fails" covers a missing binary, a non-zero exit status, a timeout
    (15 s), an OS-level error, and output with no non-whitespace content
    (e.g. only page-break form feeds).
    """
    if not _pdftotext_available():
        return ""

    target = str(path)
    if target.startswith("-"):
        # Keep a relative file name such as "-x.pdf" from being parsed as a
        # command-line option.
        target = f"./{target}"

    try:
        out = subprocess.run(
            ["pdftotext", "-q", "-enc", "UTF-8", target, "-"],
            capture_output=True,
            text=True,
            # Decode explicitly: the locale default could raise
            # UnicodeDecodeError, which is not in the handled set below.
            encoding="utf-8",
            errors="replace",
            timeout=15,
            check=False,
        )
    except (subprocess.TimeoutExpired, OSError):
        return ""

    if out.returncode != 0:
        return ""
    return out.stdout if out.stdout.strip() else ""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class MultimodalRetriever:
    """Wraps a text retriever; understands `add_pdf` and `add_image`.

    Every added item becomes a text `Document` (extracted PDF text, an image
    caption, or plain text) and is handed to the inner retriever's `index`
    method when it has one. Retrieval is delegated to the inner retriever.
    """

    strategy = "multimodal"

    # `Retriever` (Protocol) needs an `index` method when we use HybridRetriever.
    # We type-hint loosely so any object with `retrieve` works AND any
    # object that *also* has `index` accepts new docs.
    def __init__(
        self,
        inner: Retriever,
        provider: AgentProvider | None = None,
    ) -> None:
        self.inner = inner
        self.provider = provider
        # id → snapshot of the metadata the document was registered with
        # (always includes "modality").
        self._registered: dict[str, dict] = {}

    # ----- ingest -----

    def add_pdf(self, doc_id: str, path: str | Path) -> Document:
        """Extract text via pdftotext if available; index whatever survives.

        When nothing could be extracted the document body is the placeholder
        ``[pdf:<filename>]`` and ``metadata["extracted"]`` is False.
        """
        p = Path(path)
        text = _extract_pdf_text(p)
        meta = {
            "modality": "pdf",
            "source": str(p),
            "extracted": bool(text),
        }
        doc = Document(id=doc_id, text=text or f"[pdf:{p.name}]", metadata=meta)
        self._register(doc)
        return doc

    def add_image(
        self,
        doc_id: str,
        path: str | Path,
        *,
        caption: str | None = None,
    ) -> Document:
        """Index an image by its caption. Caption is provider-generated
        (role "rag-caption") unless one is passed in. With no usable caption
        — none given, no provider, or a blank / empty provider answer — we
        fall back to the filename so the image is still discoverable."""
        p = Path(path)
        final_caption = caption
        if final_caption is None and self.provider is not None:
            result = self.provider.run(AgentRequest(
                role="rag-caption",
                prompt=f"Caption the image at {p}",
                context={"path": str(p)},
            ))
            final_caption = (result.artifacts or {}).get("caption")
        # Provider output is free-form: coerce to text and treat a blank
        # caption like a missing one so the document body is never empty.
        final_caption = str(final_caption).strip() if final_caption else ""
        if not final_caption:
            final_caption = p.stem.replace("_", " ").replace("-", " ")
        meta = {
            "modality": "image",
            "source": str(p),
            "caption": final_caption,
        }
        doc = Document(id=doc_id, text=final_caption, metadata=meta)
        self._register(doc)
        return doc

    def add_text(self, doc_id: str, text: str, **meta) -> Document:
        """Plain text passthrough — keeps the retriever uniformly usable.

        Extra keyword arguments become metadata; "modality" defaults to
        "text" but can be overridden by the caller.
        """
        m = {"modality": "text", **meta}
        doc = Document(id=doc_id, text=text, metadata=m)
        self._register(doc)
        return doc

    def _register(self, doc: Document) -> None:
        """Remember the doc's metadata and index it if the inner retriever can."""
        # Store a copy so later mutation of the caller's Document does not
        # change what we report at retrieval time.
        self._registered[doc.id] = dict(doc.metadata)
        index = getattr(self.inner, "index", None)
        if callable(index):
            index([doc])
        # If the inner retriever has no `index`, the caller is responsible
        # for populating it elsewhere — we still track the modality metadata.

    # ----- retrieval -----

    def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
        """Delegate to the inner retriever and tag results with their modality.

        Documents registered through this wrapper get a "modality" metadata
        key if the inner retriever did not already provide one. The result
        is re-labelled with this wrapper's strategy.
        """
        result = self.inner.retrieve(query, k=k)
        # Inject modality metadata for any doc we registered.
        for d in result.documents:
            registered = self._registered.get(d.id)
            if registered is not None:
                d.metadata.setdefault(
                    "modality", registered.get("modality", "text"),
                )
        return RetrievalResult(
            query=query,
            documents=result.documents,
            score=result.score,
            strategy=self.strategy,
        )
|