memuron 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memuron/__init__.py +3 -0
- memuron/actions/__init__.py +12 -0
- memuron/actions/context.py +63 -0
- memuron/actions/helpers.py +88 -0
- memuron/actions/memory.py +340 -0
- memuron/actions/memory_write.py +290 -0
- memuron/actions/nodes.py +340 -0
- memuron/actions/registry.py +5 -0
- memuron/actions/runtime.py +37 -0
- memuron/actions/spaces_documents.py +720 -0
- memuron/actions/sync.py +155 -0
- memuron/application/__init__.py +1 -0
- memuron/application/api.py +206 -0
- memuron/application/app.py +103 -0
- memuron/application/capabilities.py +82 -0
- memuron/application/cli.py +35 -0
- memuron/application/config.py +176 -0
- memuron/application/mcp.py +44 -0
- memuron/application/mcp_oauth.py +290 -0
- memuron/application/registry.py +52 -0
- memuron/context.py +532 -0
- memuron/documents/__init__.py +1 -0
- memuron/documents/link_guardian.py +192 -0
- memuron/documents/linking.py +292 -0
- memuron/documents/parser.py +1152 -0
- memuron/documents/storage.py +151 -0
- memuron/documents/url_ingest.py +375 -0
- memuron/domain/__init__.py +1 -0
- memuron/domain/decoders.py +1 -0
- memuron/domain/encoders.py +185 -0
- memuron/domain/lifecycles.py +8 -0
- memuron/domain/limits.py +6 -0
- memuron/domain/representations.py +56 -0
- memuron/domain/schemas.py +581 -0
- memuron/domain/scope_filter.py +104 -0
- memuron/graphfs/__init__.py +1 -0
- memuron/graphfs/manual.py +635 -0
- memuron/graphfs/projection.py +578 -0
- memuron/graphfs/query.py +1782 -0
- memuron/graphfs/read_model.py +574 -0
- memuron/ingest/__init__.py +1 -0
- memuron/ingest/guardian.py +213 -0
- memuron/ingest/jobs.py +424 -0
- memuron/ingest/prompts.py +147 -0
- memuron/memory/__init__.py +1 -0
- memuron/memory/engine.py +35 -0
- memuron/memory/projections.py +452 -0
- memuron/memory/recipes.py +3247 -0
- memuron/persistence/__init__.py +1 -0
- memuron/persistence/db_pool.py +57 -0
- memuron/persistence/identity_store.py +918 -0
- memuron/persistence/store_helpers.py +16 -0
- memuron/search/__init__.py +1 -0
- memuron/search/fulltext.py +110 -0
- memuron/search/hybrid.py +284 -0
- memuron/search/pgvector.py +252 -0
- memuron/security/__init__.py +1 -0
- memuron/security/auth.py +143 -0
- memuron/security/auth_provider.py +119 -0
- memuron/security/authorization.py +53 -0
- memuron/security/clerk_scopes.py +94 -0
- memuron/security/clerk_webhooks.py +61 -0
- memuron/security/jwt_tokens.py +53 -0
- memuron/security/passwords.py +38 -0
- memuron/security/tenant.py +58 -0
- memuron/spaces/__init__.py +1 -0
- memuron/spaces/model.py +35 -0
- memuron/spaces/service.py +155 -0
- memuron/sync/__init__.py +25 -0
- memuron/sync/folder.py +828 -0
- memuron-0.1.1.dist-info/METADATA +242 -0
- memuron-0.1.1.dist-info/RECORD +74 -0
- memuron-0.1.1.dist-info/WHEEL +4 -0
- memuron-0.1.1.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Store helpers that stay in userland (no engine fork required)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from artha_engine.store.projection_sql import sql_store_execute, sql_store_has_tables
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def purge_arthaanu(store: object, artha_id: str) -> bool:
    """Drop the current Arthaanu projection row for ``artha_id``.

    The row is removed directly, so no ``store.delete`` event is appended.

    Returns:
        ``True`` when the store is SQL-backed (the DELETE is issued and
        ``True`` is returned regardless of whether a row matched), or when an
        entry was popped from a dict held on the store's ``_items``
        attribute; ``False`` in every other case.
    """
    if not sql_store_has_tables(store):
        # Non-SQL store: fall back to a plain dict on ``_items`` if present.
        in_memory = getattr(store, "_items", None)
        if not isinstance(in_memory, dict):
            return False
        removed = in_memory.pop(artha_id, None)
        return removed is not None

    sql_store_execute(store, "DELETE FROM arthaanu WHERE artha_id = ?", (artha_id,))
    return True
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Vector, full-text, and hybrid retrieval backends."""
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""PostgreSQL full-text (tsvector + GIN) for hybrid keyword retrieval."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from artha_engine.store.projection_sql import sql_store_execute, sql_store_fetchall, sql_store_has_tables
|
|
9
|
+
|
|
10
|
+
from memuron.domain.scope_filter import scope_sql_clauses
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
_FULLTEXT_FLAG = "_memuron_fulltext_ready"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def fulltext_is_ready(store: object) -> bool:
    """Report whether the full-text readiness flag on ``store`` is truthy.

    A store without the flag attribute counts as not ready.
    """
    flag = getattr(store, _FULLTEXT_FLAG, False)
    return True if flag else False
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def ensure_fulltext_schema(store: object) -> bool:
    """Prepare ``memuron_memories`` for tsvector-based keyword retrieval.

    Adds the ``content_tsv`` column if missing, fills it for rows where it is
    still NULL, and creates a GIN index over it. On success the readiness
    flag is set on ``store`` so later calls return immediately.

    Returns:
        ``True`` when the schema is (or already was) ready; ``False`` when the
        store has no SQL tables or when any statement raised. Failures are
        logged as a warning rather than propagated.
    """
    if not sql_store_has_tables(store):
        return False
    if fulltext_is_ready(store):
        return True

    add_column_sql = "ALTER TABLE memuron_memories ADD COLUMN IF NOT EXISTS content_tsv tsvector"
    backfill_sql = """
        UPDATE memuron_memories
        SET content_tsv = to_tsvector('english', coalesce(content, ''))
        WHERE content_tsv IS NULL
        """
    create_index_sql = """
        CREATE INDEX IF NOT EXISTS idx_memuron_memories_content_tsv_gin
        ON memuron_memories USING GIN (content_tsv)
        """

    try:
        # Order matters: the column must exist before backfill and indexing.
        for statement in (add_column_sql, backfill_sql, create_index_sql):
            sql_store_execute(store, statement)
        setattr(store, _FULLTEXT_FLAG, True)
        logger.info("memuron full-text (tsvector GIN) ready")
    except Exception as error:
        # Best-effort setup: report and let callers fall back.
        logger.warning("full-text setup skipped: %s", error)
        return False
    return True
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def sync_memory_fulltext(store: object, artha_id: str, content: str) -> None:
    """Recompute ``content_tsv`` for one memory row from ``content``.

    Does nothing unless the full-text readiness flag is set on ``store``.
    A NULL ``content`` is treated as the empty string by the SQL itself.
    """
    if fulltext_is_ready(store):
        update_sql = """
            UPDATE memuron_memories
            SET content_tsv = to_tsvector('english', coalesce(?, ''))
            WHERE artha_id = ?
            """
        sql_store_execute(store, update_sql, (content, artha_id))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _scope_where(scope: list[str] | None) -> tuple[str, list[str]]:
    """Turn a scope filter into a SQL fragment ready to append to a WHERE.

    Returns:
        ``(" AND <clause>", params)`` when ``scope_sql_clauses`` yields a
        clause for the ``scope_json`` column, otherwise ``("", [])``.
    """
    clause, params = scope_sql_clauses("scope_json", scope)
    if not clause:
        return "", []
    return f" AND {clause}", params
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def fulltext_memory_search(
    store: object,
    query: str,
    *,
    top_k: int,
    scope: list[str] | None = None,
) -> list[dict[str, Any]]:
    """Rank memories against ``query`` using ``ts_rank_cd`` over ``content_tsv``.

    Args:
        store: SQL-backed store to query.
        query: Free-text query, passed to ``plainto_tsquery``.
        top_k: Maximum number of rows to return (SQL ``LIMIT``).
        scope: Optional scope filter, translated by ``_scope_where``.

    Returns:
        One dict per matching row with keys ``artha_id``, ``content``,
        ``scope_json``, ``metadata_json`` and ``fts_score``, best score first.
        Empty when full-text is not ready or the query is blank.
    """
    if not (fulltext_is_ready(store) and query.strip()):
        return []

    scope_clause, scope_params = _scope_where(scope)
    sql = f"""
        SELECT artha_id, content, scope_json, metadata_json,
        ts_rank_cd(content_tsv, plainto_tsquery('english', ?)) AS fts_score
        FROM memuron_memories
        WHERE content_tsv IS NOT NULL
        AND content_tsv @@ plainto_tsquery('english', ?){scope_clause}
        ORDER BY fts_score DESC
        LIMIT ?
        """
    # The query text is bound twice: once for ranking, once for matching.
    params = (query, query, *scope_params, top_k)

    results: list[dict[str, Any]] = []
    for row in sql_store_fetchall(store, sql, params):
        results.append(
            {
                "artha_id": str(row["artha_id"]),
                "content": str(row.get("content") or ""),
                "scope_json": row.get("scope_json"),
                "metadata_json": row.get("metadata_json"),
                "fts_score": float(row["fts_score"]),
            }
        )
    return results
|
memuron/search/hybrid.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""Hybrid memory retrieval: pgvector + Postgres FTS fused with RRF."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from artha_engine import ArthaEngine, TextArthaanu
|
|
8
|
+
from artha_engine.decoders.search import Bm25SearchDecoder, Bm25SearchParams, TextCandidate
|
|
9
|
+
from artha_engine.retrieval.rrf import reciprocal_rank_fusion
|
|
10
|
+
|
|
11
|
+
from memuron.application.config import settings
|
|
12
|
+
from memuron.search.fulltext import fulltext_is_ready, fulltext_memory_search
|
|
13
|
+
from memuron.search.pgvector import pgvector_is_ready, pgvector_memory_search
|
|
14
|
+
from memuron.domain.representations import MemoryArthaanu, MemoryValue
|
|
15
|
+
from memuron.domain.scope_filter import scope_matches_filter
|
|
16
|
+
from memuron.domain.schemas import source_identity_from_metadata
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def retrieve_pool_size(k: int, multiplier: int | None = None) -> int:
    """Return how many candidates each retrieval leg should fetch for ``k`` results.

    The pool is ``k * multiplier`` with a floor of 40. When ``multiplier`` is
    ``None`` the value of ``settings.search_retrieve_multiplier`` is used.
    """
    if multiplier is None:
        multiplier = settings.search_retrieve_multiplier
    scaled = k * multiplier
    return 40 if scaled < 40 else scaled
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _memory_item(memory_id: str, content: str, scope: list[str]) -> MemoryArthaanu:
    """Wrap raw memory fields in a ``MemoryArthaanu`` named ``"memory"``.

    The embedding is left as an empty list.
    """
    payload = MemoryValue(content=content, scope=scope, embedding=[])
    return MemoryArthaanu(artha_id=memory_id, name="memory", value=payload)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _bm25_hits_in_memory(
    engine: ArthaEngine,
    query: str,
    corpus: list[tuple[str, str, list[str], dict[str, Any]]],
    *,
    top_k: int,
) -> list[tuple[str, float]]:
    """Score ``corpus`` against ``query`` with ``Bm25SearchDecoder``.

    Args:
        engine: Accepted for signature parity; not used by this function.
        query: Query text.
        corpus: ``(memory_id, content, scope, metadata)`` tuples to rank.
        top_k: Maximum number of hits requested from the decoder.

    Returns:
        ``(memory_id, score)`` pairs in the order the decoder reports them;
        empty when ``corpus`` is empty.
    """
    if not corpus:
        return []

    decoder = Bm25SearchDecoder()
    candidates = []
    for memory_id, content, scope, _metadata in corpus:
        candidates.append(
            TextCandidate(
                item=_memory_item(memory_id, content, scope),
                text=TextArthaanu(name="content", value=content),
            )
        )

    query_text = TextArthaanu(name="query", value=query)
    search_params = Bm25SearchParams(candidates=candidates, top_k=top_k)
    result = decoder.decode(query_text, search_params)

    scored: list[tuple[str, float]] = []
    for hit in result.hits:
        scored.append((hit.item.artha_id, float(hit.score)))
    return scored
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _sqlite_corpus(
    engine: ArthaEngine,
    scope: list[str] | None,
) -> list[tuple[str, str, list[str], dict[str, Any]]]:
    """Load the searchable memories as an in-memory corpus for BM25 scoring.

    Rows whose scope does not match ``scope`` (when one is given) and rows
    with blank content are left out.

    Returns:
        ``(memory_id, content, scope_tokens, metadata)`` tuples, where scope
        tokens are stringified and non-dict metadata is replaced by ``{}``.
    """
    from memuron.memory.recipes import _fetch_memory_rows_by_ids, _list_memory_search_rows, _parse_json_field

    search_rows = _list_memory_search_rows(engine)
    if not search_rows:
        return []

    wanted_ids = [str(entry["artha_id"]) for entry in search_rows]
    rows_by_id = _fetch_memory_rows_by_ids(engine, wanted_ids)

    corpus: list[tuple[str, str, list[str], dict[str, Any]]] = []
    for memory_id, record in rows_by_id.items():
        parsed_scope = _parse_json_field(record.get("scope_json"), [])
        if not isinstance(parsed_scope, list):
            parsed_scope = []
        tokens = [str(token) for token in parsed_scope]

        if scope and not scope_matches_filter(tokens, scope):
            continue

        text = str(record.get("content") or "")
        parsed_metadata = _parse_json_field(record.get("metadata_json"), {})
        if not text.strip():
            continue

        if not isinstance(parsed_metadata, dict):
            parsed_metadata = {}
        corpus.append((memory_id, text, tokens, parsed_metadata))
    return corpus
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _vector_memory_hits(
    engine: ArthaEngine,
    query_vector: list[float],
    *,
    top_k: int,
    scope: list[str] | None,
    min_semantic_score: float,
) -> list[tuple[str, float, dict[str, Any]]]:
    """Run the vector leg of retrieval and shape hits as result payloads.

    Uses pgvector when the store reports it ready; otherwise falls back to
    ``_memory_similarity_hits`` plus a row lookup. ``min_semantic_score`` is
    enforced here only (it is not applied to the FTS leg or to fused scores).

    Returns:
        ``(memory_id, score, payload)`` tuples for hits whose score is at
        least ``min_semantic_score``; ``payload`` is a ``memory_node`` dict.
    """
    from memuron.memory.recipes import (
        _fetch_memory_rows_by_ids,
        _memory_similarity_hits,
        _parse_json_field,
        _row_to_memory_dict,
    )

    store = engine.store
    hits: list[tuple[str, float, dict[str, Any]]] = []

    if not pgvector_is_ready(store):
        # Fallback path: similarity computed by the recipes module.
        candidates = _memory_similarity_hits(engine, query_vector, scope=scope, top_k=top_k)
        rows_by_id = _fetch_memory_rows_by_ids(
            engine, [candidate_id for candidate_id, _ in candidates]
        )
        for candidate_id, similarity in candidates:
            if similarity < min_semantic_score:
                continue
            record = rows_by_id.get(candidate_id)
            if not record:
                continue
            memory = _row_to_memory_dict(record)
            raw_metadata = memory.get("metadata")
            payload = {
                "type": "memory_node",
                "id": candidate_id,
                "content": memory["content"],
                "scope": memory.get("scope") or [],
                "source_metadata": source_identity_from_metadata(
                    raw_metadata if isinstance(raw_metadata, dict) else {}
                ),
                "matched_via": "memory",
                "matched_via_link_id": None,
            }
            hits.append((candidate_id, similarity, payload))
        return hits

    found = pgvector_memory_search(
        store,
        query_vector,
        top_k=top_k,
        scope=scope,
        include_content=True,
    )
    for row in found:
        similarity = float(row["semantic_score"])
        if similarity < min_semantic_score:
            continue
        memory_id = str(row["artha_id"])
        tokens = _parse_json_field(row.get("scope_json"), [])
        payload = {
            "type": "memory_node",
            "id": memory_id,
            "content": str(row.get("content") or ""),
            "scope": tokens if isinstance(tokens, list) else [],
            "source_metadata": source_identity_from_metadata(
                _parse_json_field(row.get("metadata_json"), {})
            ),
            "matched_via": "memory",
            "matched_via_link_id": None,
        }
        hits.append((memory_id, similarity, payload))
    return hits
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _fts_memory_hits(
    engine: ArthaEngine,
    query: str,
    *,
    top_k: int,
    scope: list[str] | None,
) -> list[tuple[str, float, dict[str, Any]]]:
    """Run the keyword leg of retrieval and shape hits as result payloads.

    Uses Postgres full-text search when the store reports it ready; otherwise
    builds an in-memory corpus and ranks it with BM25.

    Returns:
        ``(memory_id, score, payload)`` tuples in ranked order; ``payload`` is
        a ``memory_node`` dict.
    """
    from memuron.memory.recipes import _parse_json_field

    store = engine.store

    if not fulltext_is_ready(store):
        # Fallback path: rank the whole (scope-filtered) corpus in Python.
        corpus = _sqlite_corpus(engine, scope)
        ranked = _bm25_hits_in_memory(engine, query, corpus, top_k=top_k)
        lookup = {entry[0]: (entry[1], entry[2], entry[3]) for entry in corpus}

        fallback_hits: list[tuple[str, float, dict[str, Any]]] = []
        for memory_id, score in ranked:
            if memory_id not in lookup:
                continue
            text, tokens, metadata = lookup[memory_id]
            fallback_hits.append(
                (
                    memory_id,
                    score,
                    {
                        "type": "memory_node",
                        "id": memory_id,
                        "content": text,
                        "scope": tokens,
                        "source_metadata": source_identity_from_metadata(metadata),
                        "matched_via": "memory",
                        "matched_via_link_id": None,
                    },
                )
            )
        return fallback_hits

    hits: list[tuple[str, float, dict[str, Any]]] = []
    for row in fulltext_memory_search(store, query, top_k=top_k, scope=scope):
        tokens = _parse_json_field(row.get("scope_json"), [])
        memory_id = str(row["artha_id"])
        payload = {
            "type": "memory_node",
            "id": memory_id,
            "content": str(row.get("content") or ""),
            "scope": tokens if isinstance(tokens, list) else [],
            "source_metadata": source_identity_from_metadata(
                _parse_json_field(row.get("metadata_json"), {})
            ),
            "matched_via": "memory",
            "matched_via_link_id": None,
        }
        hits.append((memory_id, float(row["fts_score"]), payload))
    return hits
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def hybrid_memory_search(
    engine: ArthaEngine,
    query: str,
    query_vector: list[float],
    *,
    k: int,
    scope: list[str] | None,
    min_semantic_score: float | None = None,
) -> list[tuple[str, dict[str, Any], float]]:
    """Fuse the vector and keyword legs with reciprocal rank fusion (RRF).

    Args:
        engine: Engine whose store is searched.
        query: Query text for the keyword (FTS/BM25) leg.
        query_vector: Query embedding for the vector leg.
        k: Number of results to return.
        scope: Optional scope filter applied to both legs.
        min_semantic_score: Threshold for the vector leg only; defaults to
            ``settings.search_min_semantic_score`` when ``None``.

    Returns:
        ``(kind, payload, score)`` tuples with ``kind == "memory_node"``.
        The score is the vector score when only the vector leg is used, the
        keyword score when only the keyword leg produced hits, and the RRF
        score when both legs are fused.
    """
    min_score = (
        min_semantic_score
        if min_semantic_score is not None
        else settings.search_min_semantic_score
    )
    pool = retrieve_pool_size(k)

    vector_rows = _vector_memory_hits(
        engine,
        query_vector,
        top_k=pool,
        scope=scope,
        min_semantic_score=min_score,
    )

    if not settings.search_hybrid:
        # Hybrid disabled: only vector order is ever returned, so the keyword
        # leg (a full corpus scan on the non-Postgres fallback) is skipped
        # instead of being computed and thrown away.
        vector_payloads = {memory_id: payload for memory_id, _score, payload in vector_rows}
        return [
            ("memory_node", vector_payloads[memory_id], score)
            for memory_id, score, _payload in vector_rows[:k]
        ]

    fts_rows = _fts_memory_hits(engine, query, top_k=pool, scope=scope)

    # When an id appears in both legs, the keyword payload wins (last write).
    payload_by_id: dict[str, dict[str, Any]] = {}
    for memory_id, _score, payload in vector_rows + fts_rows:
        payload_by_id[memory_id] = payload

    vector_ranked = [memory_id for memory_id, _score, _payload in vector_rows]
    fts_ranked = [memory_id for memory_id, _score, _payload in fts_rows]

    if not vector_ranked and not fts_ranked:
        return []

    if not fts_ranked:
        # Only the vector leg produced hits — return vector order.
        return [
            ("memory_node", payload_by_id[memory_id], score)
            for memory_id, score, _payload in vector_rows[:k]
        ]

    if not vector_ranked:
        # Only the keyword leg produced hits — return keyword order.
        return [
            ("memory_node", payload_by_id[memory_id], score)
            for memory_id, score, _payload in fts_rows[:k]
        ]

    fused = reciprocal_rank_fusion(
        [vector_ranked, fts_ranked],
        top_k=k,
        k=settings.search_rrf_k,
        key=lambda item: item,
    )
    return [
        ("memory_node", payload_by_id[memory_id], float(score))
        for memory_id, score in fused
        if memory_id in payload_by_id
    ]
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""PostgreSQL pgvector + HNSW helpers for memuron similarity search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from artha_engine.store.projection_sql import sql_store_execute, sql_store_fetchall, sql_store_has_tables
|
|
10
|
+
|
|
11
|
+
from memuron.application.config import settings
|
|
12
|
+
from memuron.domain.scope_filter import scope_sql_clauses
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
_PGVECTOR_FLAG = "_memuron_pgvector_ready"
|
|
17
|
+
_VECTOR_DIMS_FLAG = "_memuron_vector_dims"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def vector_literal(values: list[float]) -> str:
    """Render ``values`` as a bracketed, comma-separated vector literal.

    Each component is converted with ``float()`` and formatted with ``.8g``
    (eight significant digits); an empty input yields ``"[]"``.
    """
    components = []
    for value in values:
        components.append(format(float(value), ".8g"))
    return "[" + ",".join(components) + "]"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def pgvector_is_ready(store: object) -> bool:
    """Report whether the pgvector readiness flag on ``store`` is truthy.

    A store without the flag attribute counts as not ready.
    """
    flag = getattr(store, _PGVECTOR_FLAG, False)
    return True if flag else False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def ensure_pgvector_schema(store: object, dimensions: int | None = None) -> bool:
    """Set up pgvector columns and HNSW indexes for memories and links.

    Steps, in order: enable the ``vector`` extension, add an ``embedding``
    column to ``memuron_memories`` and ``memuron_links``, backfill both from
    ``embedding_json``, create cosine HNSW indexes, record readiness and the
    dimension count on ``store``, then attempt full-text setup as well.

    Args:
        store: SQL-backed store to prepare.
        dimensions: Vector width; a falsy value falls back to
            ``settings.vector_dimensions``.

    Returns:
        ``True`` when pgvector is (or already was) ready; ``False`` when the
        store has no SQL tables or any step raised. Failures are logged as a
        warning rather than propagated.
    """
    if not sql_store_has_tables(store):
        return False
    if pgvector_is_ready(store):
        return True

    dims = dimensions or settings.vector_dimensions
    targets = (("memuron_memories", "artha_id"), ("memuron_links", "link_id"))

    try:
        sql_store_execute(store, "CREATE EXTENSION IF NOT EXISTS vector")
        # Both columns are added before either table is backfilled.
        for table_name, _id_column in targets:
            sql_store_execute(
                store,
                f"ALTER TABLE {table_name} ADD COLUMN IF NOT EXISTS embedding vector({dims})",
            )
        for table_name, id_column in targets:
            _backfill_table_embeddings(store, table_name, id_column, dims)

        sql_store_execute(
            store,
            """
            CREATE INDEX IF NOT EXISTS idx_memuron_memories_embedding_hnsw
            ON memuron_memories USING hnsw (embedding vector_cosine_ops)
            """,
        )
        sql_store_execute(
            store,
            """
            CREATE INDEX IF NOT EXISTS idx_memuron_links_embedding_hnsw
            ON memuron_links USING hnsw (embedding vector_cosine_ops)
            """,
        )

        setattr(store, _PGVECTOR_FLAG, True)
        setattr(store, _VECTOR_DIMS_FLAG, dims)

        # Imported here rather than at module top (as in the original code).
        from memuron.search.fulltext import ensure_fulltext_schema

        ensure_fulltext_schema(store)
        logger.info("pgvector HNSW ready (dims=%s)", dims)
    except Exception as error:
        # Best-effort setup: report and let callers fall back.
        logger.warning("pgvector setup skipped: %s", error)
        return False
    return True
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _backfill_table_embeddings(
    store: object,
    table: str,
    id_column: str,
    dimensions: int,
) -> None:
    """Populate the ``embedding`` column of ``table`` from ``embedding_json``.

    Only rows whose ``embedding`` is NULL and whose ``embedding_json`` is
    neither NULL nor ``'[]'`` are considered. A row is skipped when its JSON
    parses to an empty vector or to one whose length differs from
    ``dimensions``.

    ``table`` and ``id_column`` are interpolated into the SQL text, so they
    must be trusted identifiers, never user input.
    """
    select_sql = f"""
        SELECT {id_column} AS row_id, embedding_json
        FROM {table}
        WHERE embedding IS NULL AND embedding_json IS NOT NULL AND embedding_json != '[]'
        """
    update_sql = f"UPDATE {table} SET embedding = ?::vector WHERE {id_column} = ?"

    for record in sql_store_fetchall(store, select_sql):
        vector = _parse_embedding(record.get("embedding_json"))
        if vector and len(vector) == dimensions:
            sql_store_execute(
                store,
                update_sql,
                (vector_literal(vector), record["row_id"]),
            )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _parse_embedding(value: Any) -> list[float]:
|
|
100
|
+
if isinstance(value, list):
|
|
101
|
+
return [float(item) for item in value]
|
|
102
|
+
if isinstance(value, str):
|
|
103
|
+
try:
|
|
104
|
+
parsed = json.loads(value)
|
|
105
|
+
except json.JSONDecodeError:
|
|
106
|
+
return []
|
|
107
|
+
if isinstance(parsed, list):
|
|
108
|
+
return [float(item) for item in parsed]
|
|
109
|
+
return []
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _vector_dims(store: object) -> int:
    """Return the vector width recorded on ``store``.

    Falls back to ``settings.vector_dimensions`` when the store carries no
    recorded width; the result is coerced with ``int()``.
    """
    recorded = getattr(store, _VECTOR_DIMS_FLAG, settings.vector_dimensions)
    return int(recorded)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def sync_memory_embedding(store: object, artha_id: str, embedding: list[float]) -> None:
    """Write ``embedding`` into the pgvector column of one memory row.

    Does nothing when pgvector is not ready or ``embedding`` is empty. An
    embedding whose length differs from the store's vector width is skipped
    as well, but a warning is logged so the dropped write is visible: without
    it the memory would silently never appear in vector search.
    """
    if not pgvector_is_ready(store) or not embedding:
        return
    dims = _vector_dims(store)
    if len(embedding) != dims:
        logger.warning(
            "skipping embedding sync for memory %s: got %d dimensions, expected %d",
            artha_id,
            len(embedding),
            dims,
        )
        return
    sql_store_execute(
        store,
        "UPDATE memuron_memories SET embedding = ?::vector WHERE artha_id = ?",
        (vector_literal(embedding), artha_id),
    )
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def sync_link_embedding(store: object, link_id: str, embedding: list[float]) -> None:
    """Write ``embedding`` into the pgvector column of one link row.

    Does nothing when pgvector is not ready or ``embedding`` is empty. An
    embedding whose length differs from the store's vector width is skipped
    as well, but a warning is logged so the dropped write is visible: without
    it the link would silently never appear in vector search.
    """
    if not pgvector_is_ready(store) or not embedding:
        return
    dims = _vector_dims(store)
    if len(embedding) != dims:
        logger.warning(
            "skipping embedding sync for link %s: got %d dimensions, expected %d",
            link_id,
            len(embedding),
            dims,
        )
        return
    sql_store_execute(
        store,
        "UPDATE memuron_links SET embedding = ?::vector WHERE link_id = ?",
        (vector_literal(embedding), link_id),
    )
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _scope_sql_clauses(column: str, scope: list[str] | None) -> tuple[str, list[str]]:
    """Build the scope filter SQL for ``column`` by delegating to ``scope_sql_clauses``.

    Returns the ``(sql_fragment, params)`` pair unchanged; callers in this
    module place the fragment in the WHERE clause of their vector queries.
    """
    clause_and_params = scope_sql_clauses(column, scope)
    return clause_and_params
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def pgvector_memory_search(
    store: object,
    query_vector: list[float],
    *,
    top_k: int,
    scope: list[str] | None = None,
    include_content: bool = False,
) -> list[dict[str, Any]]:
    """Find the memories nearest to ``query_vector`` using the ``<=>`` operator.

    Args:
        store: SQL-backed store to query.
        query_vector: Query embedding; must match the store's vector width.
        top_k: Maximum number of rows to return (SQL ``LIMIT``).
        scope: Optional scope filter on ``scope_json``.
        include_content: When true, each hit also carries ``content``,
            ``scope_json`` and ``metadata_json``.

    Returns:
        Hit dicts with ``artha_id`` and ``semantic_score`` (computed as
        ``1 - distance``), nearest first. Empty when pgvector is not ready,
        the query vector is empty, or its length does not match.
    """
    if not (pgvector_is_ready(store) and query_vector):
        return []
    if len(query_vector) != _vector_dims(store):
        return []

    literal = vector_literal(query_vector)
    extra_columns = ", content, metadata_json" if include_content else ""
    scope_sql, scope_params = _scope_sql_clauses("scope_json", scope)
    scope_filter = f" AND {scope_sql}" if scope_sql else ""

    sql = f"""
        SELECT artha_id, scope_json{extra_columns},
        1 - (embedding <=> ?::vector) AS semantic_score
        FROM memuron_memories
        WHERE embedding IS NOT NULL{scope_filter}
        ORDER BY embedding <=> ?::vector
        LIMIT ?
        """
    # The vector literal is bound twice: once for the score, once for ordering.
    params = (literal, *scope_params, literal, top_k)

    hits: list[dict[str, Any]] = []
    for row in sql_store_fetchall(store, sql, params):
        hit: dict[str, Any] = {
            "artha_id": str(row["artha_id"]),
            "semantic_score": float(row["semantic_score"]),
        }
        if include_content:
            hit["content"] = row.get("content")
            hit["scope_json"] = row.get("scope_json")
            hit["metadata_json"] = row.get("metadata_json")
        hits.append(hit)
    return hits
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def pgvector_memory_search_ids(
    store: object,
    query_vector: list[float],
    *,
    top_k: int,
    scope: list[str] | None = None,
) -> list[tuple[str, float]]:
    """Return only ``(artha_id, semantic_score)`` pairs for the nearest memories.

    Thin wrapper over ``pgvector_memory_search`` with content columns
    disabled; ordering and emptiness rules are those of that function.
    """
    found = pgvector_memory_search(
        store,
        query_vector,
        top_k=top_k,
        scope=scope,
        include_content=False,
    )
    pairs: list[tuple[str, float]] = []
    for hit in found:
        pairs.append((hit["artha_id"], hit["semantic_score"]))
    return pairs
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def pgvector_link_search(
    store: object,
    query_vector: list[float],
    *,
    top_k: int,
    scope: list[str] | None = None,
) -> list[dict[str, Any]]:
    """Find the links nearest to ``query_vector`` using the ``<=>`` operator.

    Links are joined to their source and target memories; when a scope filter
    applies, both endpoints must satisfy it.

    Returns:
        Dicts with ``link_id``, ``source_id``, ``target_id``, ``description``,
        ``metadata_json`` and ``semantic_score`` (``1 - distance``), nearest
        first. Empty when pgvector is not ready, the query vector is empty,
        or its length does not match the store's vector width.
    """
    if not (pgvector_is_ready(store) and query_vector):
        return []
    if len(query_vector) != _vector_dims(store):
        return []

    literal = vector_literal(query_vector)
    source_sql, source_params = _scope_sql_clauses("src.scope_json", scope)
    target_sql, target_params = _scope_sql_clauses("tgt.scope_json", scope)

    if source_sql:
        scope_filter = f" AND {source_sql} AND {target_sql}"
        scope_params: list[str] = [*source_params, *target_params]
    else:
        scope_filter = ""
        scope_params = []

    sql = f"""
        SELECT l.link_id, l.source_id, l.target_id, l.description, l.metadata_json,
        1 - (l.embedding <=> ?::vector) AS semantic_score
        FROM memuron_links l
        JOIN memuron_memories src ON src.artha_id = l.source_id
        JOIN memuron_memories tgt ON tgt.artha_id = l.target_id
        WHERE l.embedding IS NOT NULL{scope_filter}
        ORDER BY l.embedding <=> ?::vector
        LIMIT ?
        """
    # The vector literal is bound twice: once for the score, once for ordering.
    params = (literal, *scope_params, literal, top_k)

    links: list[dict[str, Any]] = []
    for row in sql_store_fetchall(store, sql, params):
        links.append(
            {
                "link_id": str(row["link_id"]),
                "source_id": str(row["source_id"]),
                "target_id": str(row["target_id"]),
                "description": str(row.get("description") or ""),
                "metadata_json": row.get("metadata_json"),
                "semantic_score": float(row["semantic_score"]),
            }
        )
    return links
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Authentication, authorization, tenant, and Clerk integration."""
|