memuron 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. memuron/__init__.py +3 -0
  2. memuron/actions/__init__.py +12 -0
  3. memuron/actions/context.py +63 -0
  4. memuron/actions/helpers.py +88 -0
  5. memuron/actions/memory.py +340 -0
  6. memuron/actions/memory_write.py +290 -0
  7. memuron/actions/nodes.py +340 -0
  8. memuron/actions/registry.py +5 -0
  9. memuron/actions/runtime.py +37 -0
  10. memuron/actions/spaces_documents.py +720 -0
  11. memuron/actions/sync.py +155 -0
  12. memuron/application/__init__.py +1 -0
  13. memuron/application/api.py +206 -0
  14. memuron/application/app.py +103 -0
  15. memuron/application/capabilities.py +82 -0
  16. memuron/application/cli.py +35 -0
  17. memuron/application/config.py +176 -0
  18. memuron/application/mcp.py +44 -0
  19. memuron/application/mcp_oauth.py +290 -0
  20. memuron/application/registry.py +52 -0
  21. memuron/context.py +532 -0
  22. memuron/documents/__init__.py +1 -0
  23. memuron/documents/link_guardian.py +192 -0
  24. memuron/documents/linking.py +292 -0
  25. memuron/documents/parser.py +1152 -0
  26. memuron/documents/storage.py +151 -0
  27. memuron/documents/url_ingest.py +375 -0
  28. memuron/domain/__init__.py +1 -0
  29. memuron/domain/decoders.py +1 -0
  30. memuron/domain/encoders.py +185 -0
  31. memuron/domain/lifecycles.py +8 -0
  32. memuron/domain/limits.py +6 -0
  33. memuron/domain/representations.py +56 -0
  34. memuron/domain/schemas.py +581 -0
  35. memuron/domain/scope_filter.py +104 -0
  36. memuron/graphfs/__init__.py +1 -0
  37. memuron/graphfs/manual.py +635 -0
  38. memuron/graphfs/projection.py +578 -0
  39. memuron/graphfs/query.py +1782 -0
  40. memuron/graphfs/read_model.py +574 -0
  41. memuron/ingest/__init__.py +1 -0
  42. memuron/ingest/guardian.py +213 -0
  43. memuron/ingest/jobs.py +424 -0
  44. memuron/ingest/prompts.py +147 -0
  45. memuron/memory/__init__.py +1 -0
  46. memuron/memory/engine.py +35 -0
  47. memuron/memory/projections.py +452 -0
  48. memuron/memory/recipes.py +3247 -0
  49. memuron/persistence/__init__.py +1 -0
  50. memuron/persistence/db_pool.py +57 -0
  51. memuron/persistence/identity_store.py +918 -0
  52. memuron/persistence/store_helpers.py +16 -0
  53. memuron/search/__init__.py +1 -0
  54. memuron/search/fulltext.py +110 -0
  55. memuron/search/hybrid.py +284 -0
  56. memuron/search/pgvector.py +252 -0
  57. memuron/security/__init__.py +1 -0
  58. memuron/security/auth.py +143 -0
  59. memuron/security/auth_provider.py +119 -0
  60. memuron/security/authorization.py +53 -0
  61. memuron/security/clerk_scopes.py +94 -0
  62. memuron/security/clerk_webhooks.py +61 -0
  63. memuron/security/jwt_tokens.py +53 -0
  64. memuron/security/passwords.py +38 -0
  65. memuron/security/tenant.py +58 -0
  66. memuron/spaces/__init__.py +1 -0
  67. memuron/spaces/model.py +35 -0
  68. memuron/spaces/service.py +155 -0
  69. memuron/sync/__init__.py +25 -0
  70. memuron/sync/folder.py +828 -0
  71. memuron-0.1.1.dist-info/METADATA +242 -0
  72. memuron-0.1.1.dist-info/RECORD +74 -0
  73. memuron-0.1.1.dist-info/WHEEL +4 -0
  74. memuron-0.1.1.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,192 @@
1
+ """Guardian for batch semantic linking after document ingest."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import logging
8
+ import re
9
+ from typing import Any
10
+
11
+ import requests
12
+
13
+ from memuron.application.config import settings
14
+ from memuron.documents.parser import _image_data_url
15
+ from memuron.ingest.prompts import (
16
+ DOCUMENT_LINK_GUARDIAN_PROMPT,
17
+ DocumentLinkGuardianPlan,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"
23
+ _MAX_VISION_ATTACHMENTS = 12
24
+
25
+
26
class DocumentLinkGuardianError(RuntimeError):
    """Error type for every failure of the document-link Guardian.

    Raised when the API key is missing, when the HTTP request fails or
    returns an error status, when the response body cannot be used, and
    when all retry attempts have been exhausted.
    """
28
+
29
+
30
class DocumentLinkGuardian:
    """Plan semantic links for all chunks of one document ingest.

    A single multimodal chat-completion request is sent to OpenRouter. It
    carries a text prompt (in-document chunks, in-document images and
    external candidates) plus at most ``_MAX_VISION_ATTACHMENTS`` inline
    images, and the reply is parsed into a ``DocumentLinkGuardianPlan``.
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        model_id: str | None = None,
        max_retries: int | None = None,
        timeout_seconds: int = 120,
    ) -> None:
        """Store call configuration, falling back to application settings.

        Args:
            api_key: OpenRouter API key; ``settings.openrouter_api_key`` is
                used when falsy.
            model_id: Model identifier; ``settings.document_link_model`` is
                used when falsy.
            max_retries: Total number of attempts made by
                ``plan_document_links``; ``settings.document_link_retries``
                is used when ``None``.
            timeout_seconds: Timeout passed to ``requests.post``.
        """
        self.api_key = api_key or settings.openrouter_api_key
        self.model_id = model_id or settings.document_link_model
        self.max_retries = (
            max_retries
            if max_retries is not None
            else settings.document_link_retries
        )
        self.timeout_seconds = timeout_seconds

    @staticmethod
    def _format_memories(
        items: list[tuple[str, dict[str, Any], float | None]],
        *,
        title: str,
    ) -> str:
        """Render ``(memory_id, memory, score)`` tuples as a prompt section.

        Args:
            items: Memories to render. Entries whose memory is not a dict
                are skipped, but still consume a position number.
            title: Label used in each entry header and in the placeholder.

        Returns:
            One text block per rendered memory, or ``"(no <title>)"`` when
            nothing could be rendered.
        """
        lines: list[str] = []
        for index, (memory_id, memory, score) in enumerate(items, start=1):
            if not isinstance(memory, dict):
                continue
            # Cap each memory's content so one large chunk cannot dominate
            # the prompt.
            content = str(memory.get("content", ""))[:500]
            node_type = memory.get("node_type", "text")
            score_text = (
                f"Similarity: {score:.3f}\n"
                if isinstance(score, (int, float))
                else ""
            )
            lines.append(
                f"\n--- {title} {index} ---\n"
                f"ID: {memory_id}\n"
                f"Node type: {node_type}\n"
                f"{score_text}"
                f"Content: {content}\n"
                f"Scope: {memory.get('scope') or []}\n"
            )
        # Use the explicit placeholder both for an empty input and for an
        # input where every entry was skipped, so the prompt never contains
        # a silently empty section.
        if not lines:
            return f"(no {title})"
        return "".join(lines)

    def _build_prompt(
        self,
        *,
        external: list[tuple[str, dict[str, Any], float | None]],
        in_doc_chunks: list[tuple[str, dict[str, Any], float | None]],
        in_doc_images: list[tuple[str, dict[str, Any], float | None]],
    ) -> str:
        """Fill ``DOCUMENT_LINK_GUARDIAN_PROMPT`` with the three sections."""
        return DOCUMENT_LINK_GUARDIAN_PROMPT.format(
            chunks_text=self._format_memories(in_doc_chunks, title="Chunk"),
            images_text=self._format_memories(in_doc_images, title="Image"),
            external_text=self._format_memories(external, title="External"),
        )

    @staticmethod
    def _parse_json_content(raw: str) -> DocumentLinkGuardianPlan:
        """Parse the model's reply text into a ``DocumentLinkGuardianPlan``.

        A surrounding Markdown code fence (optionally tagged ``json``) is
        removed first. Errors from ``json.loads`` and from
        ``DocumentLinkGuardianPlan.model_validate`` propagate to the caller.
        """
        text = raw.strip()
        if text.startswith("```"):
            text = re.sub(r"^```(?:json)?\s*", "", text)
            text = re.sub(r"\s*```$", "", text)
        payload = json.loads(text)
        return DocumentLinkGuardianPlan.model_validate(payload)

    def _call_guardian(
        self,
        prompt: str,
        *,
        vision_attachments: list[tuple[str, bytes]],
    ) -> DocumentLinkGuardianPlan:
        """Perform one blocking Guardian request and parse the reply.

        Args:
            prompt: Fully rendered text prompt.
            vision_attachments: ``(media_type, image_bytes)`` pairs; only
                the first ``_MAX_VISION_ATTACHMENTS`` are sent.

        Returns:
            The plan parsed from the first choice's message content.

        Raises:
            DocumentLinkGuardianError: If the API key is missing, the
                request fails, the status code is >= 400, or the response
                body does not have the expected shape.
        """
        if not self.api_key:
            raise DocumentLinkGuardianError(
                "OPENROUTER_API_KEY is required for document link Guardian"
            )
        content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
        for media_type, image_bytes in vision_attachments[:_MAX_VISION_ATTACHMENTS]:
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": _image_data_url(image_bytes, media_type)},
                }
            )
        payload = {
            "model": self.model_id,
            "messages": [{"role": "user", "content": content}],
            "response_format": {"type": "json_object"},
        }
        try:
            response = requests.post(
                OPENROUTER_CHAT_URL,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json=payload,
                timeout=self.timeout_seconds,
            )
        except requests.RequestException as exc:
            raise DocumentLinkGuardianError(
                f"Document link Guardian request failed: {exc}"
            ) from exc
        if response.status_code >= 400:
            raise DocumentLinkGuardianError(
                f"Document link Guardian HTTP {response.status_code}: "
                f"{response.text[:500]}"
            )
        try:
            body = response.json()
            message = body["choices"][0]["message"]["content"]
        # ValueError covers json.JSONDecodeError and the decode error that
        # requests raises when simplejson is installed.
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            raise DocumentLinkGuardianError(
                f"Document link Guardian response parse error: {exc}"
            ) from exc
        if not isinstance(message, str):
            raise DocumentLinkGuardianError("Document link Guardian returned non-text content")
        return self._parse_json_content(message)

    async def plan_document_links(
        self,
        *,
        external: list[tuple[str, dict[str, Any], float | None]],
        in_doc_chunks: list[tuple[str, dict[str, Any], float | None]],
        in_doc_images: list[tuple[str, dict[str, Any], float | None]],
        vision_attachments: list[tuple[str, bytes]] | None = None,
    ) -> DocumentLinkGuardianPlan:
        """Ask the Guardian for a link plan, retrying with back-off.

        The blocking HTTP call runs in a worker thread. Between failed
        attempts the coroutine sleeps 1, 2, 4, ... seconds.

        Args:
            external: Candidate memories from outside the document.
            in_doc_chunks: Chunk memories produced by the ingest.
            in_doc_images: Image memories produced by the ingest.
            vision_attachments: Optional ``(media_type, image_bytes)`` pairs
                to attach to the request.

        Returns:
            The parsed plan from the first successful attempt.

        Raises:
            DocumentLinkGuardianError: Immediately when no API key is
                configured, otherwise after every attempt has failed.
        """
        if not self.api_key:
            # A missing key is a permanent configuration error; retrying
            # would only add back-off sleeps before the same failure.
            raise DocumentLinkGuardianError(
                "OPENROUTER_API_KEY is required for document link Guardian"
            )
        prompt = self._build_prompt(
            external=external,
            in_doc_chunks=in_doc_chunks,
            in_doc_images=in_doc_images,
        )
        attachments = list(vision_attachments or [])
        # Always make at least one attempt, even if max_retries was
        # configured as zero or a negative number.
        attempts = max(1, self.max_retries)
        last_error: Exception | None = None
        for attempt in range(1, attempts + 1):
            try:
                return await asyncio.to_thread(
                    self._call_guardian,
                    prompt,
                    vision_attachments=attachments,
                )
            # Deliberately broad: transport errors, malformed JSON and plan
            # validation failures are all treated as retryable.
            except Exception as exc:
                last_error = exc
                logger.warning(
                    "Document link Guardian attempt %s/%s failed: %s",
                    attempt,
                    attempts,
                    exc,
                )
                if attempt < attempts:
                    await asyncio.sleep(2 ** (attempt - 1))
        raise DocumentLinkGuardianError(
            f"Document link Guardian failed after {attempts} attempts: {last_error}"
        ) from last_error
189
+
190
+
191
def create_document_link_guardian() -> DocumentLinkGuardian:
    """Build a ``DocumentLinkGuardian`` that takes all of its defaults."""
    guardian = DocumentLinkGuardian()
    return guardian
@@ -0,0 +1,292 @@
1
+ """Post-ingest semantic linking for document chunks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from collections import Counter
7
+ from typing import Any
8
+
9
+ from artha_engine import ArthaEngine
10
+
11
+ from memuron.application.config import settings
12
+ from memuron.documents.link_guardian import (
13
+ DocumentLinkGuardian,
14
+ DocumentLinkGuardianError,
15
+ create_document_link_guardian,
16
+ )
17
+ from memuron.ingest.prompts import ConnectionSpec, DocumentLinkGuardianPlan, GuardianWritePlan
18
+ from memuron.memory.recipes import (
19
+ MAX_GUARDIAN_LINKS,
20
+ _apply_guardian_links,
21
+ _find_link_candidates,
22
+ _ingest_scope_context,
23
+ _require_memory_projections,
24
+ get_memory,
25
+ refresh_memory_projections,
26
+ )
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
def _document_ingest_memory_ids(payload: dict[str, Any]) -> set[str]:
    """Return the string ids of every memory named in an ingest payload.

    The set holds the collection id, the document id, and every entry of
    ``chunk_ids`` and ``image_ids`` (both optional). A payload without
    ``collection`` or ``document`` raises ``KeyError``.
    """
    ids = {str(payload[section]["id"]) for section in ("collection", "document")}
    for field in ("chunk_ids", "image_ids"):
        ids |= {str(entry) for entry in payload.get(field) or []}
    return ids
39
+
40
+
41
def collect_document_external_candidates(
    engine: ArthaEngine,
    *,
    chunk_ids: list[str],
    exclude_ids: set[str],
    tenant_id: str | None,
    top_k_per_chunk: int | None = None,
    budget: int | None = None,
) -> list[tuple[str, dict[str, Any], float]]:
    """Rank external memories by how often they appear in per-chunk top-k retrieval.

    Each chunk with non-empty content is used as a query for
    ``_find_link_candidates``. Neighbours listed in ``exclude_ids`` are
    ignored. The rest are ordered by the number of chunks that retrieved
    them, then by their best score, and cut to the budget.

    Args:
        engine: Engine handed to ``get_memory`` and ``_find_link_candidates``.
        chunk_ids: Ids of the chunks produced by the ingest.
        exclude_ids: Ids that must never be returned as external candidates.
        tenant_id: Tenant filter forwarded to ``_find_link_candidates``.
        top_k_per_chunk: Neighbours requested per chunk; falls back to
            ``settings.document_chunk_neighbor_top_k`` when falsy.
        budget: Maximum number of candidates returned; falls back to
            ``settings.document_external_candidate_budget`` when falsy.

    Returns:
        ``(memory_id, memory, best_score)`` tuples, best candidates first.
        Chunks and candidates that ``get_memory`` cannot find are skipped.
    """

    top_k = top_k_per_chunk or settings.document_chunk_neighbor_top_k
    max_external = budget or settings.document_external_candidate_budget
    scores_by_id: dict[str, float] = {}
    counts: Counter[str] = Counter()

    for chunk_id in chunk_ids:
        try:
            chunk = get_memory(engine, chunk_id)
        except KeyError:
            continue
        content = str(chunk.get("content", "")).strip()
        if not content:
            continue
        neighbors = _find_link_candidates(
            engine,
            content,
            tenant_id=tenant_id,
            exclude_memory_id=chunk_id,
            top_k=top_k,
        )
        for memory_id, score, _neighbor in neighbors:
            if memory_id in exclude_ids:
                continue
            counts[memory_id] += 1
            scores_by_id[memory_id] = max(scores_by_id.get(memory_id, 0.0), score)

    ranked = sorted(
        counts,
        key=lambda memory_id: (-counts[memory_id], -scores_by_id[memory_id]),
    )[:max_external]
    output: list[tuple[str, dict[str, Any], float]] = []
    for memory_id in ranked:
        try:
            memory = get_memory(engine, memory_id)
        except KeyError:
            # The candidate vanished between retrieval and fetch. Skip it,
            # as missing chunks are skipped above, rather than abort the
            # whole linking pass.
            logger.debug(
                "External link candidate %s no longer exists; skipping",
                memory_id,
            )
            continue
        output.append((memory_id, memory, scores_by_id[memory_id]))
    return output
87
+
88
+
89
+ def _resolve_chunk_id(raw_id: str, chunk_ids: list[str]) -> str | None:
90
+ """Map Guardian chunk_id to an ingest chunk (exact or unique prefix match)."""
91
+ token = raw_id.strip()
92
+ if not token:
93
+ return None
94
+ if token in chunk_ids:
95
+ return token
96
+ prefix_matches = [chunk_id for chunk_id in chunk_ids if chunk_id.startswith(token)]
97
+ if len(prefix_matches) == 1:
98
+ return prefix_matches[0]
99
+ suffix_matches = [chunk_id for chunk_id in chunk_ids if token in chunk_id]
100
+ if len(suffix_matches) == 1:
101
+ return suffix_matches[0]
102
+ return None
103
+
104
+
105
def _normalize_document_connections(
    connections: list[ConnectionSpec],
) -> list[ConnectionSpec]:
    """Clean Guardian connections and drop those with too little description.

    Descriptions are stripped, and connections whose description is shorter
    than 20 characters are discarded. A description without a question mark
    gets a generic follow-up question appended. Target ids are stripped of
    surrounding whitespace. Every kept connection is returned as a new
    ``ConnectionSpec``.
    """
    cleaned: list[ConnectionSpec] = []
    for spec in connections:
        text = spec.description.strip()
        if len(text) >= 20:
            if "?" not in text:
                text = f"{text.rstrip('.')}? What else connects these memories?"
            cleaned.append(
                ConnectionSpec(target_id=spec.target_id.strip(), description=text)
            )
    return cleaned
119
+
120
+
121
def _validate_document_connections(
    memory_id: str,
    scope: list[str],
    raw_connections: list[ConnectionSpec],
    allowed_target_ids: set[str],
    memories_by_id: dict[str, dict[str, Any]],
) -> list[ConnectionSpec]:
    """Keep only the connections that may be written for ``memory_id``.

    Connections are first normalised. One is kept only when its target id
    is non-empty, differs from ``memory_id``, is in ``allowed_target_ids``
    and maps to a non-empty entry in ``memories_by_id``. At most
    ``MAX_GUARDIAN_LINKS`` connections are returned. ``scope`` is accepted
    but not used here.
    """
    accepted: list[ConnectionSpec] = []
    for spec in _normalize_document_connections(raw_connections):
        target = spec.target_id
        linkable = (
            bool(target)
            and target != memory_id
            and target in allowed_target_ids
        )
        if not linkable or not memories_by_id.get(target):
            continue
        if len(accepted) >= MAX_GUARDIAN_LINKS:
            break
        accepted.append(spec)
    return accepted
139
+
140
+
141
def _apply_document_link_plan(
    engine: ArthaEngine,
    *,
    plan: DocumentLinkGuardianPlan,
    chunk_ids: list[str],
    scope: list[str],
    allowed_target_ids: set[str],
    memories_by_id: dict[str, dict[str, Any]],
    event_metadata: dict[str, object] | None,
) -> int:
    """Write the links proposed by a Guardian plan and count them.

    Each assignment's chunk id is resolved against ``chunk_ids``.
    Assignments that do not resolve to a loaded chunk, or that keep no
    connections after validation, are ignored. The remaining connections
    are wrapped in a ``GuardianWritePlan`` and handed to
    ``_apply_guardian_links``, with ``document_link_guardian: True`` added
    to the event metadata.

    Returns:
        The sum of the values returned by ``_apply_guardian_links``.
    """
    total_linked = 0
    for assignment in plan.chunk_links:
        resolved_id = _resolve_chunk_id(assignment.chunk_id, chunk_ids)
        if not resolved_id or resolved_id not in memories_by_id:
            continue
        kept = _validate_document_connections(
            resolved_id,
            scope,
            assignment.connections,
            allowed_target_ids,
            memories_by_id,
        )
        if kept:
            chunk_content = str(memories_by_id[resolved_id].get("content", ""))
            write_plan = GuardianWritePlan(
                action="create",
                target_memory_id=None,
                reasoning=plan.reasoning,
                final_content=chunk_content,
                scope=scope,
                connections=kept,
                links_to_remove=[],
            )
            # Build a fresh metadata dict per call so the caller's mapping
            # is never shared with the link writer.
            metadata = dict(event_metadata or {})
            metadata["document_link_guardian"] = True
            total_linked += _apply_guardian_links(
                engine,
                resolved_id,
                scope,
                write_plan,
                list(allowed_target_ids),
                memories_by_id,
                event_metadata=metadata,
            )
    return total_linked
187
+
188
+
189
async def link_document_after_ingest(
    engine: ArthaEngine,
    *,
    ingest_payload: dict[str, Any],
    scope: list[str] | None = None,
    event_metadata: dict[str, object] | None = None,
    space_context: dict[str, str] | None = None,  # noqa: ARG001 — reserved
    image_attachments: list[dict[str, Any]] | None = None,
    document_link_guardian: DocumentLinkGuardian | None = None,
) -> int:
    """Run batch Guardian linking for document chunks; no-op for standalone images.

    The function gathers external candidates for the ingested chunks, loads
    the in-document chunk and image memories, and asks the Guardian for a
    link plan. It then applies the plan and refreshes memory projections
    when at least one link was written.

    Args:
        engine: Engine used for all memory reads and link writes.
        ingest_payload: Result of the ingest. ``source_type``, ``chunk_ids``,
            ``image_ids`` and ``document_key`` are read optionally;
            ``collection`` and ``document`` (each with an ``id``) are
            required once linking actually proceeds.
        scope: Request scope forwarded to ``_ingest_scope_context``.
        event_metadata: Metadata forwarded to scope resolution and to the
            link writer.
        space_context: Reserved; currently ignored.
        image_attachments: Dicts with ``bytes`` and optional ``media_type``
            (default ``image/png``). Entries without non-empty bytes are
            dropped.
        document_link_guardian: Guardian to use; a default one is created
            when omitted.

    Returns:
        Number of links written. ``0`` when linking is skipped, when there
        is nothing to link, or when the Guardian call fails.
    """

    if ingest_payload.get("source_type") == "image":
        return 0

    chunk_ids = [str(item) for item in ingest_payload.get("chunk_ids") or []]
    if not chunk_ids:
        return 0

    # An injected guardian may carry its own API key, so only skip when
    # neither the guardian nor the settings provide one.
    injected_key = getattr(document_link_guardian, "api_key", None)
    if not (injected_key or settings.openrouter_api_key):
        logger.info("Skipping document link Guardian: OPENROUTER_API_KEY unset")
        return 0

    _require_memory_projections(engine)
    request_scope, tenant_id, _search_scope = _ingest_scope_context(
        scope=scope,
        event_metadata=event_metadata,
        space_context=None,
        candidate_scope=None,
    )

    exclude_ids = _document_ingest_memory_ids(ingest_payload)
    external = collect_document_external_candidates(
        engine,
        chunk_ids=chunk_ids,
        exclude_ids=exclude_ids,
        tenant_id=tenant_id,
    )
    # With no external candidates, a single chunk has nothing to link to.
    if not external and len(chunk_ids) < 2:
        return 0

    in_doc_chunks: list[tuple[str, dict[str, Any], float | None]] = []
    in_doc_images: list[tuple[str, dict[str, Any], float | None]] = []
    memories_by_id: dict[str, dict[str, Any]] = {}

    for chunk_id in chunk_ids:
        try:
            memory = get_memory(engine, chunk_id)
        except KeyError:
            # Missing chunks are skipped here just as they are during
            # candidate collection; the plan is applied only to chunks
            # present in memories_by_id.
            logger.warning(
                "Document link Guardian: chunk %s not found; skipping",
                chunk_id,
            )
            continue
        in_doc_chunks.append((chunk_id, memory, None))
        memories_by_id[chunk_id] = memory

    if not in_doc_chunks:
        return 0

    for raw_image_id in ingest_payload.get("image_ids") or []:
        image_id = str(raw_image_id)
        try:
            memory = get_memory(engine, image_id)
        except KeyError:
            logger.warning(
                "Document link Guardian: image %s not found; skipping",
                image_id,
            )
            continue
        in_doc_images.append((image_id, memory, None))
        memories_by_id[image_id] = memory

    for memory_id, memory, _score in external:
        memories_by_id[memory_id] = memory

    # The collection and document nodes must never be semantic link targets.
    allowed_target_ids = set(memories_by_id.keys()) - {
        str(ingest_payload["collection"]["id"]),
        str(ingest_payload["document"]["id"]),
    }

    guardian = document_link_guardian or create_document_link_guardian()
    vision: list[tuple[str, bytes]] = []
    for item in image_attachments or []:
        raw = item.get("bytes")
        media_type = str(item.get("media_type") or "image/png")
        if isinstance(raw, (bytes, bytearray)) and raw:
            vision.append((media_type, bytes(raw)))

    try:
        plan = await guardian.plan_document_links(
            external=external,
            in_doc_chunks=in_doc_chunks,
            in_doc_images=in_doc_images,
            vision_attachments=vision,
        )
    except DocumentLinkGuardianError:
        # Linking is best-effort: the ingest itself already succeeded.
        logger.warning(
            "Document link Guardian failed for document_key=%s; keeping placement-only graph",
            ingest_payload.get("document_key"),
            exc_info=True,
        )
        return 0

    linked = _apply_document_link_plan(
        engine,
        plan=plan,
        chunk_ids=chunk_ids,
        scope=request_scope,
        allowed_target_ids=allowed_target_ids,
        memories_by_id=memories_by_id,
        event_metadata=event_metadata,
    )
    logger.info(
        "Document link Guardian document_key=%s chunks=%s external=%s proposed=%s linked=%s",
        ingest_payload.get("document_key"),
        len(chunk_ids),
        len(external),
        sum(len(item.connections) for item in plan.chunk_links),
        linked,
    )
    if linked:
        refresh_memory_projections(engine)
    return linked