memuron 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memuron/__init__.py +3 -0
- memuron/actions/__init__.py +12 -0
- memuron/actions/context.py +63 -0
- memuron/actions/helpers.py +88 -0
- memuron/actions/memory.py +340 -0
- memuron/actions/memory_write.py +290 -0
- memuron/actions/nodes.py +340 -0
- memuron/actions/registry.py +5 -0
- memuron/actions/runtime.py +37 -0
- memuron/actions/spaces_documents.py +720 -0
- memuron/actions/sync.py +155 -0
- memuron/application/__init__.py +1 -0
- memuron/application/api.py +206 -0
- memuron/application/app.py +103 -0
- memuron/application/capabilities.py +82 -0
- memuron/application/cli.py +35 -0
- memuron/application/config.py +176 -0
- memuron/application/mcp.py +44 -0
- memuron/application/mcp_oauth.py +290 -0
- memuron/application/registry.py +52 -0
- memuron/context.py +532 -0
- memuron/documents/__init__.py +1 -0
- memuron/documents/link_guardian.py +192 -0
- memuron/documents/linking.py +292 -0
- memuron/documents/parser.py +1152 -0
- memuron/documents/storage.py +151 -0
- memuron/documents/url_ingest.py +375 -0
- memuron/domain/__init__.py +1 -0
- memuron/domain/decoders.py +1 -0
- memuron/domain/encoders.py +185 -0
- memuron/domain/lifecycles.py +8 -0
- memuron/domain/limits.py +6 -0
- memuron/domain/representations.py +56 -0
- memuron/domain/schemas.py +581 -0
- memuron/domain/scope_filter.py +104 -0
- memuron/graphfs/__init__.py +1 -0
- memuron/graphfs/manual.py +635 -0
- memuron/graphfs/projection.py +578 -0
- memuron/graphfs/query.py +1782 -0
- memuron/graphfs/read_model.py +574 -0
- memuron/ingest/__init__.py +1 -0
- memuron/ingest/guardian.py +213 -0
- memuron/ingest/jobs.py +424 -0
- memuron/ingest/prompts.py +147 -0
- memuron/memory/__init__.py +1 -0
- memuron/memory/engine.py +35 -0
- memuron/memory/projections.py +452 -0
- memuron/memory/recipes.py +3247 -0
- memuron/persistence/__init__.py +1 -0
- memuron/persistence/db_pool.py +57 -0
- memuron/persistence/identity_store.py +918 -0
- memuron/persistence/store_helpers.py +16 -0
- memuron/search/__init__.py +1 -0
- memuron/search/fulltext.py +110 -0
- memuron/search/hybrid.py +284 -0
- memuron/search/pgvector.py +252 -0
- memuron/security/__init__.py +1 -0
- memuron/security/auth.py +143 -0
- memuron/security/auth_provider.py +119 -0
- memuron/security/authorization.py +53 -0
- memuron/security/clerk_scopes.py +94 -0
- memuron/security/clerk_webhooks.py +61 -0
- memuron/security/jwt_tokens.py +53 -0
- memuron/security/passwords.py +38 -0
- memuron/security/tenant.py +58 -0
- memuron/spaces/__init__.py +1 -0
- memuron/spaces/model.py +35 -0
- memuron/spaces/service.py +155 -0
- memuron/sync/__init__.py +25 -0
- memuron/sync/folder.py +828 -0
- memuron-0.1.1.dist-info/METADATA +242 -0
- memuron-0.1.1.dist-info/RECORD +74 -0
- memuron-0.1.1.dist-info/WHEEL +4 -0
- memuron-0.1.1.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Guardian for batch semantic linking after document ingest."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import re
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import requests
|
|
12
|
+
|
|
13
|
+
from memuron.application.config import settings
|
|
14
|
+
from memuron.documents.parser import _image_data_url
|
|
15
|
+
from memuron.ingest.prompts import (
|
|
16
|
+
DOCUMENT_LINK_GUARDIAN_PROMPT,
|
|
17
|
+
DocumentLinkGuardianPlan,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
# OpenRouter chat-completions endpoint; DocumentLinkGuardian._call_guardian POSTs to it.
OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"
|
|
23
|
+
# Upper bound on image attachments sent in one Guardian request; the request
# builder slices the attachment list to this length, dropping any extras.
_MAX_VISION_ATTACHMENTS = 12
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DocumentLinkGuardianError(RuntimeError):
    """Signal that the document-link Guardian could not produce a plan.

    Used for a missing API key, transport failures, HTTP error statuses,
    unparseable responses, and exhaustion of all retry attempts.
    """
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DocumentLinkGuardian:
    """Plan semantic links for all chunks of one document ingest in a single call.

    The Guardian renders the ingested chunks, the document's images and the
    external candidate memories into one prompt, optionally attaches image
    bytes, POSTs the request to ``OPENROUTER_CHAT_URL`` and validates the JSON
    reply as a ``DocumentLinkGuardianPlan``.
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        model_id: str | None = None,
        max_retries: int | None = None,
        timeout_seconds: int = 120,
    ) -> None:
        """Store call configuration, falling back to application settings.

        Args:
            api_key: OpenRouter API key; a falsy value falls back to
                ``settings.openrouter_api_key``.
            model_id: Model identifier; a falsy value falls back to
                ``settings.document_link_model``.
            max_retries: Number of attempts ``plan_document_links`` makes;
                ``None`` falls back to ``settings.document_link_retries``.
            timeout_seconds: Timeout handed to ``requests.post``.
        """
        self.api_key = api_key or settings.openrouter_api_key
        self.model_id = model_id or settings.document_link_model
        self.max_retries = (
            max_retries
            if max_retries is not None
            else settings.document_link_retries
        )
        self.timeout_seconds = timeout_seconds

    @staticmethod
    def _format_memories(
        items: list[tuple[str, dict[str, Any], float | None]],
        *,
        title: str,
    ) -> str:
        """Render ``(memory_id, memory, score)`` triples as prompt text.

        Entries whose memory is not a dict are skipped; their position still
        counts towards the displayed index. Content is truncated to 500
        characters. When nothing can be rendered (empty input, or every entry
        skipped) the placeholder ``"(no <title>)"`` is returned so the prompt
        section is never left blank.
        """
        placeholder = f"(no {title})"
        if not items:
            return placeholder
        sections: list[str] = []
        for index, (memory_id, memory, score) in enumerate(items, start=1):
            if not isinstance(memory, dict):
                continue
            content = str(memory.get("content", ""))[:500]
            node_type = memory.get("node_type", "text")
            # The similarity line is only emitted for numeric scores.
            score_text = (
                f"Similarity: {score:.3f}\n"
                if isinstance(score, (int, float))
                else ""
            )
            sections.append(
                f"\n--- {title} {index} ---\n"
                f"ID: {memory_id}\n"
                f"Node type: {node_type}\n"
                f"{score_text}"
                f"Content: {content}\n"
                f"Scope: {memory.get('scope') or []}\n"
            )
        return "".join(sections) or placeholder

    def _build_prompt(
        self,
        *,
        external: list[tuple[str, dict[str, Any], float | None]],
        in_doc_chunks: list[tuple[str, dict[str, Any], float | None]],
        in_doc_images: list[tuple[str, dict[str, Any], float | None]],
    ) -> str:
        """Fill ``DOCUMENT_LINK_GUARDIAN_PROMPT`` with the three rendered sections."""
        return DOCUMENT_LINK_GUARDIAN_PROMPT.format(
            chunks_text=self._format_memories(in_doc_chunks, title="Chunk"),
            images_text=self._format_memories(in_doc_images, title="Image"),
            external_text=self._format_memories(external, title="External"),
        )

    @staticmethod
    def _parse_json_content(raw: str) -> DocumentLinkGuardianPlan:
        """Decode the model's text reply into a ``DocumentLinkGuardianPlan``.

        A leading Markdown code fence (optionally tagged ``json``) and a
        trailing fence are stripped before decoding. ``json.loads`` and
        ``model_validate`` errors propagate to the caller.
        """
        text = raw.strip()
        if text.startswith("```"):
            text = re.sub(r"^```(?:json)?\s*", "", text)
            text = re.sub(r"\s*```$", "", text)
        payload = json.loads(text)
        return DocumentLinkGuardianPlan.model_validate(payload)

    def _call_guardian(
        self,
        prompt: str,
        *,
        vision_attachments: list[tuple[str, bytes]],
    ) -> DocumentLinkGuardianPlan:
        """Perform one blocking Guardian request and parse its reply.

        Args:
            prompt: Fully rendered prompt text.
            vision_attachments: ``(media_type, image_bytes)`` pairs; only the
                first ``_MAX_VISION_ATTACHMENTS`` are sent.

        Raises:
            DocumentLinkGuardianError: If the API key is missing, the request
                fails, the HTTP status is >= 400, or the response envelope is
                not the expected shape.
        """
        if not self.api_key:
            raise DocumentLinkGuardianError(
                "OPENROUTER_API_KEY is required for document link Guardian"
            )
        content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
        for media_type, image_bytes in vision_attachments[:_MAX_VISION_ATTACHMENTS]:
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": _image_data_url(image_bytes, media_type)},
                }
            )
        payload = {
            "model": self.model_id,
            "messages": [{"role": "user", "content": content}],
            "response_format": {"type": "json_object"},
        }
        try:
            response = requests.post(
                OPENROUTER_CHAT_URL,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json=payload,
                timeout=self.timeout_seconds,
            )
        except requests.RequestException as exc:
            raise DocumentLinkGuardianError(
                f"Document link Guardian request failed: {exc}"
            ) from exc
        if response.status_code >= 400:
            raise DocumentLinkGuardianError(
                f"Document link Guardian HTTP {response.status_code}: "
                f"{response.text[:500]}"
            )
        try:
            body = response.json()
            message = body["choices"][0]["message"]["content"]
        # ValueError covers json.JSONDecodeError as well as the simplejson-based
        # decode error that requests raises when simplejson is installed.
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            raise DocumentLinkGuardianError(
                f"Document link Guardian response parse error: {exc}"
            ) from exc
        if not isinstance(message, str):
            raise DocumentLinkGuardianError("Document link Guardian returned non-text content")
        return self._parse_json_content(message)

    async def plan_document_links(
        self,
        *,
        external: list[tuple[str, dict[str, Any], float | None]],
        in_doc_chunks: list[tuple[str, dict[str, Any], float | None]],
        in_doc_images: list[tuple[str, dict[str, Any], float | None]],
        vision_attachments: list[tuple[str, bytes]] | None = None,
    ) -> DocumentLinkGuardianPlan:
        """Ask the Guardian for a link plan, retrying with exponential backoff.

        The blocking HTTP call runs in a worker thread via
        ``asyncio.to_thread``. Between failed attempts the coroutine sleeps
        1, 2, 4, ... seconds. At least one attempt is always made, even when
        ``max_retries`` is zero or negative.

        Raises:
            DocumentLinkGuardianError: Immediately when no API key is
                configured (retrying cannot help), or after every attempt has
                failed; the last failure is chained as the cause.
        """
        if not self.api_key:
            # Fail fast: a missing key is not transient, so skip the backoff.
            raise DocumentLinkGuardianError(
                "OPENROUTER_API_KEY is required for document link Guardian"
            )
        prompt = self._build_prompt(
            external=external,
            in_doc_chunks=in_doc_chunks,
            in_doc_images=in_doc_images,
        )
        attachments = list(vision_attachments or [])
        attempts = max(1, self.max_retries)
        last_error: Exception | None = None
        for attempt in range(1, attempts + 1):
            try:
                return await asyncio.to_thread(
                    self._call_guardian,
                    prompt,
                    vision_attachments=attachments,
                )
            # Retry boundary: any failure (transport, parsing, validation) is
            # logged and retried; the last one is re-raised wrapped below.
            except Exception as exc:
                last_error = exc
                logger.warning(
                    "Document link Guardian attempt %s/%s failed: %s",
                    attempt,
                    attempts,
                    exc,
                )
                if attempt < attempts:
                    await asyncio.sleep(2 ** (attempt - 1))
        raise DocumentLinkGuardianError(
            f"Document link Guardian failed after {attempts} attempts: {last_error}"
        ) from last_error
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def create_document_link_guardian() -> DocumentLinkGuardian:
    """Build a ``DocumentLinkGuardian`` using only its default constructor arguments."""
    guardian = DocumentLinkGuardian()
    return guardian
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
"""Post-ingest semantic linking for document chunks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from artha_engine import ArthaEngine
|
|
10
|
+
|
|
11
|
+
from memuron.application.config import settings
|
|
12
|
+
from memuron.documents.link_guardian import (
|
|
13
|
+
DocumentLinkGuardian,
|
|
14
|
+
DocumentLinkGuardianError,
|
|
15
|
+
create_document_link_guardian,
|
|
16
|
+
)
|
|
17
|
+
from memuron.ingest.prompts import ConnectionSpec, DocumentLinkGuardianPlan, GuardianWritePlan
|
|
18
|
+
from memuron.memory.recipes import (
|
|
19
|
+
MAX_GUARDIAN_LINKS,
|
|
20
|
+
_apply_guardian_links,
|
|
21
|
+
_find_link_candidates,
|
|
22
|
+
_ingest_scope_context,
|
|
23
|
+
_require_memory_projections,
|
|
24
|
+
get_memory,
|
|
25
|
+
refresh_memory_projections,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _document_ingest_memory_ids(payload: dict[str, Any]) -> set[str]:
|
|
32
|
+
ids = {
|
|
33
|
+
str(payload["collection"]["id"]),
|
|
34
|
+
str(payload["document"]["id"]),
|
|
35
|
+
}
|
|
36
|
+
ids.update(str(item) for item in payload.get("chunk_ids") or [])
|
|
37
|
+
ids.update(str(item) for item in payload.get("image_ids") or [])
|
|
38
|
+
return ids
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def collect_document_external_candidates(
    engine: ArthaEngine,
    *,
    chunk_ids: list[str],
    exclude_ids: set[str],
    tenant_id: str | None,
    top_k_per_chunk: int | None = None,
    budget: int | None = None,
) -> list[tuple[str, dict[str, Any], float]]:
    """Rank external memories by how often they appear in per-chunk top-k retrieval.

    For every chunk with non-blank content, ``_find_link_candidates`` is
    queried; each returned memory outside ``exclude_ids`` gets one vote and
    keeps its best score. Candidates are ordered by vote count, then by best
    score, both descending.

    Args:
        engine: Engine handed to ``get_memory`` and ``_find_link_candidates``.
        chunk_ids: Ids of the ingested chunks to search neighbours for.
        exclude_ids: Memory ids that must never be returned.
        tenant_id: Tenant passed through to the candidate search.
        top_k_per_chunk: Neighbours requested per chunk; ``None`` uses
            ``settings.document_chunk_neighbor_top_k``.
        budget: Maximum number of candidates returned; ``None`` uses
            ``settings.document_external_candidate_budget``.

    Returns:
        ``(memory_id, memory, best_score)`` triples, at most ``budget`` long.
        A non-positive ``budget`` or ``top_k_per_chunk`` yields an empty list.
    """
    # Only ``None`` means "use the configured default"; an explicit 0 is honoured.
    top_k = (
        top_k_per_chunk
        if top_k_per_chunk is not None
        else settings.document_chunk_neighbor_top_k
    )
    max_external = (
        budget
        if budget is not None
        else settings.document_external_candidate_budget
    )
    if top_k <= 0 or max_external <= 0:
        # Nothing can be returned, so skip the per-chunk searches entirely.
        return []

    best_score: dict[str, float] = {}
    hit_counts: Counter[str] = Counter()

    for chunk_id in chunk_ids:
        try:
            chunk = get_memory(engine, chunk_id)
        except KeyError:
            continue
        content = str(chunk.get("content", "")).strip()
        if not content:
            continue
        neighbors = _find_link_candidates(
            engine,
            content,
            tenant_id=tenant_id,
            exclude_memory_id=chunk_id,
            top_k=top_k,
        )
        for memory_id, score, _memory in neighbors:
            if memory_id in exclude_ids:
                continue
            hit_counts[memory_id] += 1
            best_score[memory_id] = max(best_score.get(memory_id, 0.0), score)

    ranked = sorted(
        hit_counts,
        key=lambda memory_id: (-hit_counts[memory_id], -best_score.get(memory_id, 0.0)),
    )
    output: list[tuple[str, dict[str, Any], float]] = []
    for memory_id in ranked:
        if len(output) >= max_external:
            break
        try:
            memory = get_memory(engine, memory_id)
        except KeyError:
            # Same tolerance as for chunks above: a candidate that can no
            # longer be fetched is dropped and the next-ranked one is used.
            logger.warning(
                "Document link candidate %s could not be loaded; skipping",
                memory_id,
            )
            continue
        output.append((memory_id, memory, best_score[memory_id]))
    return output
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _resolve_chunk_id(raw_id: str, chunk_ids: list[str]) -> str | None:
|
|
90
|
+
"""Map Guardian chunk_id to an ingest chunk (exact or unique prefix match)."""
|
|
91
|
+
token = raw_id.strip()
|
|
92
|
+
if not token:
|
|
93
|
+
return None
|
|
94
|
+
if token in chunk_ids:
|
|
95
|
+
return token
|
|
96
|
+
prefix_matches = [chunk_id for chunk_id in chunk_ids if chunk_id.startswith(token)]
|
|
97
|
+
if len(prefix_matches) == 1:
|
|
98
|
+
return prefix_matches[0]
|
|
99
|
+
suffix_matches = [chunk_id for chunk_id in chunk_ids if token in chunk_id]
|
|
100
|
+
if len(suffix_matches) == 1:
|
|
101
|
+
return suffix_matches[0]
|
|
102
|
+
return None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _normalize_document_connections(
|
|
106
|
+
connections: list[ConnectionSpec],
|
|
107
|
+
) -> list[ConnectionSpec]:
|
|
108
|
+
normalized: list[ConnectionSpec] = []
|
|
109
|
+
for conn in connections:
|
|
110
|
+
description = conn.description.strip()
|
|
111
|
+
if len(description) < 20:
|
|
112
|
+
continue
|
|
113
|
+
if "?" not in description:
|
|
114
|
+
description = f"{description.rstrip('.')}? What else connects these memories?"
|
|
115
|
+
normalized.append(
|
|
116
|
+
ConnectionSpec(target_id=conn.target_id.strip(), description=description)
|
|
117
|
+
)
|
|
118
|
+
return normalized
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _validate_document_connections(
    memory_id: str,
    scope: list[str],
    raw_connections: list[ConnectionSpec],
    allowed_target_ids: set[str],
    memories_by_id: dict[str, dict[str, Any]],
) -> list[ConnectionSpec]:
    """Keep only connections that point at a permitted, loaded target.

    After normalization, a connection is dropped when its target id is empty,
    equals ``memory_id`` (self-link), is not in ``allowed_target_ids``, or has
    no truthy entry in ``memories_by_id``. At most ``MAX_GUARDIAN_LINKS``
    connections are returned. ``scope`` is accepted but not read here.
    """
    accepted: list[ConnectionSpec] = []
    for spec in _normalize_document_connections(raw_connections):
        candidate = spec.target_id
        blank_or_self = (not candidate) or candidate == memory_id
        if blank_or_self or candidate not in allowed_target_ids:
            continue
        if not memories_by_id.get(candidate):
            continue
        # The cap is checked only once a valid connection is about to be added.
        if len(accepted) >= MAX_GUARDIAN_LINKS:
            break
        accepted.append(spec)
    return accepted
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _apply_document_link_plan(
    engine: ArthaEngine,
    *,
    plan: DocumentLinkGuardianPlan,
    chunk_ids: list[str],
    scope: list[str],
    allowed_target_ids: set[str],
    memories_by_id: dict[str, dict[str, Any]],
    event_metadata: dict[str, object] | None,
) -> int:
    """Write the Guardian's per-chunk link assignments and count the results.

    Each assignment's chunk id is resolved against ``chunk_ids``; assignments
    that cannot be resolved, whose chunk is not in ``memories_by_id``, or
    whose connections all fail validation are skipped. The rest are wrapped
    in a ``GuardianWritePlan`` and passed to ``_apply_guardian_links`` with
    the event metadata tagged ``document_link_guardian=True``.

    Returns:
        The sum of the values returned by ``_apply_guardian_links``.
    """
    total_linked = 0
    for assignment in plan.chunk_links:
        source_id = _resolve_chunk_id(assignment.chunk_id, chunk_ids)
        if not (source_id and source_id in memories_by_id):
            continue

        accepted = _validate_document_connections(
            source_id,
            scope,
            assignment.connections,
            allowed_target_ids,
            memories_by_id,
        )
        if not accepted:
            continue

        source_content = str(memories_by_id[source_id].get("content", ""))
        write_plan = GuardianWritePlan(
            action="create",
            target_memory_id=None,
            reasoning=plan.reasoning,
            final_content=source_content,
            scope=scope,
            connections=accepted,
            links_to_remove=[],
        )
        # A fresh metadata dict is built per call so the callee never sees a
        # dict shared with the caller or with a previous iteration.
        tagged_metadata = {
            **(event_metadata or {}),
            "document_link_guardian": True,
        }
        total_linked += _apply_guardian_links(
            engine,
            source_id,
            scope,
            write_plan,
            list(allowed_target_ids),
            memories_by_id,
            event_metadata=tagged_metadata,
        )
    return total_linked
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _load_present_memories(
    engine: ArthaEngine,
    memory_ids: list[str],
    *,
    kind: str,
    memories_by_id: dict[str, dict[str, Any]],
) -> list[tuple[str, dict[str, Any], float | None]]:
    """Fetch each id with ``get_memory``, skipping and logging ids that raise ``KeyError``.

    Every memory that loads is also recorded in ``memories_by_id`` (mutated in
    place). Returned triples carry ``None`` as the score.
    """
    loaded: list[tuple[str, dict[str, Any], float | None]] = []
    for memory_id in memory_ids:
        try:
            memory = get_memory(engine, memory_id)
        except KeyError:
            logger.warning(
                "Document link Guardian skipping missing %s memory %s",
                kind,
                memory_id,
            )
            continue
        loaded.append((memory_id, memory, None))
        memories_by_id[memory_id] = memory
    return loaded


async def link_document_after_ingest(
    engine: ArthaEngine,
    *,
    ingest_payload: dict[str, Any],
    scope: list[str] | None = None,
    event_metadata: dict[str, object] | None = None,
    space_context: dict[str, str] | None = None,  # noqa: ARG001 — reserved
    image_attachments: list[dict[str, Any]] | None = None,
    document_link_guardian: DocumentLinkGuardian | None = None,
) -> int:
    """Run batch Guardian linking for document chunks; no-op for standalone images.

    Steps: collect external candidate memories for the ingested chunks, load
    the chunk and image memories, ask the Guardian for a link plan, apply it,
    and refresh memory projections when at least one link was written.

    Args:
        engine: Engine used for all memory reads and link writes.
        ingest_payload: Ingest result; reads ``source_type``, ``chunk_ids``,
            ``image_ids``, ``collection.id``, ``document.id`` and
            ``document_key``.
        scope: Request scope forwarded to ``_ingest_scope_context``.
        event_metadata: Metadata forwarded to scope resolution and link writes.
        space_context: Reserved; currently unused.
        image_attachments: Dicts with ``bytes`` and optional ``media_type``
            (defaults to ``image/png``); entries without non-empty bytes are
            ignored.
        document_link_guardian: Guardian to use; one is created when omitted.

    Returns:
        The number of links applied. ``0`` is returned when the payload is a
        standalone image, has no chunks, the OpenRouter key is unset, there is
        nothing to link against, or the Guardian call fails.
    """
    if ingest_payload.get("source_type") == "image":
        return 0

    chunk_ids = [str(item) for item in ingest_payload.get("chunk_ids") or []]
    if not chunk_ids:
        return 0

    if not settings.openrouter_api_key:
        logger.info("Skipping document link Guardian: OPENROUTER_API_KEY unset")
        return 0

    _require_memory_projections(engine)
    request_scope, tenant_id, _search_scope = _ingest_scope_context(
        scope=scope,
        event_metadata=event_metadata,
        space_context=None,
        candidate_scope=None,
    )

    exclude_ids = _document_ingest_memory_ids(ingest_payload)
    external = collect_document_external_candidates(
        engine,
        chunk_ids=chunk_ids,
        exclude_ids=exclude_ids,
        tenant_id=tenant_id,
    )
    # A single chunk with no external candidates has nothing to link to.
    if not external and len(chunk_ids) < 2:
        return 0

    memories_by_id: dict[str, dict[str, Any]] = {}

    # Missing chunks/images are skipped instead of aborting the whole pass,
    # matching the KeyError tolerance of the candidate collection step.
    in_doc_chunks = _load_present_memories(
        engine,
        chunk_ids,
        kind="chunk",
        memories_by_id=memories_by_id,
    )
    if not in_doc_chunks or (not external and len(in_doc_chunks) < 2):
        return 0

    image_ids = [str(item) for item in ingest_payload.get("image_ids") or []]
    in_doc_images = _load_present_memories(
        engine,
        image_ids,
        kind="image",
        memories_by_id=memories_by_id,
    )

    for memory_id, memory, _score in external:
        memories_by_id[memory_id] = memory

    # The collection and document nodes themselves are never link targets.
    allowed_target_ids = set(memories_by_id) - {
        str(ingest_payload["collection"]["id"]),
        str(ingest_payload["document"]["id"]),
    }

    guardian = document_link_guardian or create_document_link_guardian()
    vision: list[tuple[str, bytes]] = []
    for item in image_attachments or []:
        raw = item.get("bytes")
        media_type = str(item.get("media_type") or "image/png")
        if isinstance(raw, (bytes, bytearray)) and raw:
            vision.append((media_type, bytes(raw)))

    try:
        plan = await guardian.plan_document_links(
            external=external,
            in_doc_chunks=in_doc_chunks,
            in_doc_images=in_doc_images,
            vision_attachments=vision,
        )
    except DocumentLinkGuardianError:
        # Best effort: linking failure must not undo the ingest itself.
        logger.warning(
            "Document link Guardian failed for document_key=%s; keeping placement-only graph",
            ingest_payload.get("document_key"),
            exc_info=True,
        )
        return 0

    linked = _apply_document_link_plan(
        engine,
        plan=plan,
        chunk_ids=chunk_ids,
        scope=request_scope,
        allowed_target_ids=allowed_target_ids,
        memories_by_id=memories_by_id,
        event_metadata=event_metadata,
    )
    logger.info(
        "Document link Guardian document_key=%s chunks=%s external=%s proposed=%s linked=%s",
        ingest_payload.get("document_key"),
        len(chunk_ids),
        len(external),
        sum(len(item.connections) for item in plan.chunk_links),
        linked,
    )
    if linked:
        refresh_memory_projections(engine)
    return linked
|