memuron 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74):
  1. memuron/__init__.py +3 -0
  2. memuron/actions/__init__.py +12 -0
  3. memuron/actions/context.py +63 -0
  4. memuron/actions/helpers.py +88 -0
  5. memuron/actions/memory.py +340 -0
  6. memuron/actions/memory_write.py +290 -0
  7. memuron/actions/nodes.py +340 -0
  8. memuron/actions/registry.py +5 -0
  9. memuron/actions/runtime.py +37 -0
  10. memuron/actions/spaces_documents.py +720 -0
  11. memuron/actions/sync.py +155 -0
  12. memuron/application/__init__.py +1 -0
  13. memuron/application/api.py +206 -0
  14. memuron/application/app.py +103 -0
  15. memuron/application/capabilities.py +82 -0
  16. memuron/application/cli.py +35 -0
  17. memuron/application/config.py +176 -0
  18. memuron/application/mcp.py +44 -0
  19. memuron/application/mcp_oauth.py +290 -0
  20. memuron/application/registry.py +52 -0
  21. memuron/context.py +532 -0
  22. memuron/documents/__init__.py +1 -0
  23. memuron/documents/link_guardian.py +192 -0
  24. memuron/documents/linking.py +292 -0
  25. memuron/documents/parser.py +1152 -0
  26. memuron/documents/storage.py +151 -0
  27. memuron/documents/url_ingest.py +375 -0
  28. memuron/domain/__init__.py +1 -0
  29. memuron/domain/decoders.py +1 -0
  30. memuron/domain/encoders.py +185 -0
  31. memuron/domain/lifecycles.py +8 -0
  32. memuron/domain/limits.py +6 -0
  33. memuron/domain/representations.py +56 -0
  34. memuron/domain/schemas.py +581 -0
  35. memuron/domain/scope_filter.py +104 -0
  36. memuron/graphfs/__init__.py +1 -0
  37. memuron/graphfs/manual.py +635 -0
  38. memuron/graphfs/projection.py +578 -0
  39. memuron/graphfs/query.py +1782 -0
  40. memuron/graphfs/read_model.py +574 -0
  41. memuron/ingest/__init__.py +1 -0
  42. memuron/ingest/guardian.py +213 -0
  43. memuron/ingest/jobs.py +424 -0
  44. memuron/ingest/prompts.py +147 -0
  45. memuron/memory/__init__.py +1 -0
  46. memuron/memory/engine.py +35 -0
  47. memuron/memory/projections.py +452 -0
  48. memuron/memory/recipes.py +3247 -0
  49. memuron/persistence/__init__.py +1 -0
  50. memuron/persistence/db_pool.py +57 -0
  51. memuron/persistence/identity_store.py +918 -0
  52. memuron/persistence/store_helpers.py +16 -0
  53. memuron/search/__init__.py +1 -0
  54. memuron/search/fulltext.py +110 -0
  55. memuron/search/hybrid.py +284 -0
  56. memuron/search/pgvector.py +252 -0
  57. memuron/security/__init__.py +1 -0
  58. memuron/security/auth.py +143 -0
  59. memuron/security/auth_provider.py +119 -0
  60. memuron/security/authorization.py +53 -0
  61. memuron/security/clerk_scopes.py +94 -0
  62. memuron/security/clerk_webhooks.py +61 -0
  63. memuron/security/jwt_tokens.py +53 -0
  64. memuron/security/passwords.py +38 -0
  65. memuron/security/tenant.py +58 -0
  66. memuron/spaces/__init__.py +1 -0
  67. memuron/spaces/model.py +35 -0
  68. memuron/spaces/service.py +155 -0
  69. memuron/sync/__init__.py +25 -0
  70. memuron/sync/folder.py +828 -0
  71. memuron-0.1.1.dist-info/METADATA +242 -0
  72. memuron-0.1.1.dist-info/RECORD +74 -0
  73. memuron-0.1.1.dist-info/WHEEL +4 -0
  74. memuron-0.1.1.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,151 @@
1
+ """Durable object storage for original document uploads."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import re
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ from memuron.application.config import settings
11
+
12
+
13
+ def _safe_segment(value: str, *, fallback: str) -> str:
14
+ cleaned = re.sub(r"[^A-Za-z0-9._-]+", "-", value).strip(".-")
15
+ return cleaned or fallback
16
+
17
+
18
def _is_configured() -> bool:
    """Report whether the bucket and both access credentials are set."""
    if not settings.object_storage_bucket:
        return False
    if not settings.object_storage_access_key_id:
        return False
    return bool(settings.object_storage_secret_access_key)
24
+
25
+
26
+ @dataclass(frozen=True)
27
+ class SourceObject:
28
+ provider: str
29
+ bucket: str
30
+ key: str
31
+ content_type: str
32
+ file_name: str
33
+ size_bytes: int
34
+ sha256: str
35
+ public_url: str | None = None
36
+
37
+ def as_payload(self) -> dict[str, Any]:
38
+ payload: dict[str, Any] = {
39
+ "provider": self.provider,
40
+ "bucket": self.bucket,
41
+ "key": self.key,
42
+ "content_type": self.content_type,
43
+ "file_name": self.file_name,
44
+ "size_bytes": self.size_bytes,
45
+ "sha256": self.sha256,
46
+ }
47
+ if self.public_url:
48
+ payload["public_url"] = self.public_url
49
+ return payload
50
+
51
+
52
+ class DocumentStorage:
53
+ """S3-compatible storage facade for original source files."""
54
+
55
+ def __init__(self) -> None:
56
+ self.bucket = settings.object_storage_bucket
57
+
58
+ @property
59
+ def configured(self) -> bool:
60
+ return _is_configured()
61
+
62
+ def _client(self) -> Any:
63
+ import boto3
64
+ from botocore.config import Config
65
+
66
+ return boto3.client(
67
+ "s3",
68
+ endpoint_url=settings.object_storage_endpoint_url or None,
69
+ region_name=settings.object_storage_region or None,
70
+ aws_access_key_id=settings.object_storage_access_key_id,
71
+ aws_secret_access_key=settings.object_storage_secret_access_key,
72
+ config=Config(signature_version="s3v4", s3={"addressing_style": "path"}),
73
+ )
74
+
75
+ def put_source(
76
+ self,
77
+ *,
78
+ org_id: str | None,
79
+ document_key: str,
80
+ file_name: str,
81
+ content_type: str | None,
82
+ file_bytes: bytes,
83
+ ) -> dict[str, Any] | None:
84
+ if not self.configured:
85
+ return None
86
+ safe_org = _safe_segment(org_id or "no-org", fallback="no-org")
87
+ safe_name = _safe_segment(file_name, fallback="document")
88
+ key = f"org/{safe_org}/documents/{document_key}/source/{safe_name}"
89
+ media_type = content_type or "application/octet-stream"
90
+ digest = hashlib.sha256(file_bytes).hexdigest()
91
+ self._client().put_object(
92
+ Bucket=self.bucket,
93
+ Key=key,
94
+ Body=file_bytes,
95
+ ContentType=media_type,
96
+ Metadata={
97
+ "document-key": document_key,
98
+ "sha256": digest,
99
+ "original-file-name": safe_name,
100
+ },
101
+ )
102
+ public_url = None
103
+ if settings.object_storage_public_base_url:
104
+ public_url = f"{settings.object_storage_public_base_url}/{key}"
105
+ return SourceObject(
106
+ provider="s3",
107
+ bucket=self.bucket,
108
+ key=key,
109
+ content_type=media_type,
110
+ file_name=file_name,
111
+ size_bytes=len(file_bytes),
112
+ sha256=digest,
113
+ public_url=public_url,
114
+ ).as_payload()
115
+
116
+ def presign_download(self, source_object: dict[str, Any]) -> str | None:
117
+ public_url = source_object.get("public_url")
118
+ if isinstance(public_url, str) and public_url:
119
+ return public_url
120
+ if not self.configured:
121
+ return None
122
+ bucket = str(source_object.get("bucket") or self.bucket)
123
+ key = str(source_object.get("key") or "")
124
+ if not bucket or not key:
125
+ return None
126
+ return self._client().generate_presigned_url(
127
+ "get_object",
128
+ Params={"Bucket": bucket, "Key": key},
129
+ ExpiresIn=settings.object_storage_presign_seconds,
130
+ )
131
+
132
+
133
def maybe_store_source_file(
    *,
    org_id: str | None,
    document_key: str,
    file_name: str,
    content_type: str | None,
    file_bytes: bytes,
) -> dict[str, Any] | None:
    """Store a source file through a fresh ``DocumentStorage``.

    All arguments are forwarded unchanged to ``DocumentStorage.put_source``,
    and its result is returned as-is.
    """
    storage = DocumentStorage()
    stored = storage.put_source(
        org_id=org_id,
        document_key=document_key,
        file_name=file_name,
        content_type=content_type,
        file_bytes=file_bytes,
    )
    return stored
148
+
149
+
150
def presign_source_object(source_object: dict[str, Any]) -> str | None:
    """Return a download URL for ``source_object`` via a fresh ``DocumentStorage``."""
    storage = DocumentStorage()
    return storage.presign_download(source_object)
@@ -0,0 +1,375 @@
1
+ """Safe URL fetching and normalization for document ingest."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ipaddress
6
+ import re
7
+ import socket
8
+ from dataclasses import dataclass
9
+ from datetime import UTC, datetime
10
+ from html.parser import HTMLParser
11
+ from pathlib import Path
12
+ from typing import Any
13
+ from urllib.parse import unquote, urljoin, urlparse
14
+
15
+ import requests
16
+
17
+ from memuron.documents.parser import MAX_DOCUMENT_UPLOAD_BYTES
18
+
19
+
20
# Default per-request timeout for URL fetches, in seconds.
DEFAULT_URL_FETCH_TIMEOUT_SECONDS = 15
# Number of redirect hops followed before a fetch is abandoned.
MAX_URL_REDIRECTS = 5
# User-Agent header sent with every fetch request.
URL_FETCH_USER_AGENT = "MemuronURLIngest/1.0"
23
+
24
+
25
class UrlIngestError(ValueError):
    """Signals that a URL could not be fetched safely for document ingest.

    Subclasses ``ValueError``, so handlers for either type will catch it.
    """
27
+
28
+
29
+ @dataclass(frozen=True)
30
+ class FetchedUrlSource:
31
+ file_name: str
32
+ content_type: str | None
33
+ file_bytes: bytes
34
+ metadata: dict[str, Any]
35
+
36
+
37
def _close_response_quietly(response: Any) -> None:
    """Release a response's connection; tolerate ``None`` and objects without ``close``."""
    close = getattr(response, "close", None)
    if callable(close):
        close()


def fetch_url_source(
    url: str,
    *,
    timeout_seconds: int = DEFAULT_URL_FETCH_TIMEOUT_SECONDS,
    max_bytes: int = MAX_DOCUMENT_UPLOAD_BYTES,
    session: requests.Session | None = None,
) -> FetchedUrlSource:
    """Fetch and normalize a remote URL into a source document payload.

    Redirects are followed manually (at most ``MAX_URL_REDIRECTS`` hops) so
    that every hop is re-validated before it is requested.  HTML responses
    are converted to markdown; anything else is returned as raw bytes.

    Args:
        url: The http(s) URL to fetch.
        timeout_seconds: Timeout passed to each request.
        max_bytes: Upper bound on the response body size.
        session: Optional session to reuse.  A session created here is closed
            before returning; a caller-supplied one is left open.

    Raises:
        UrlIngestError: If the URL or a redirect target is rejected, the
            request fails or times out, a redirect has no ``Location``,
            there are too many redirects, the status is not 200, the body is
            too large, or an HTML page has no readable text.
    """

    current_url = _validate_fetch_url(url)
    client = session or requests.Session()
    owns_client = client is not session
    fetched_at = datetime.now(UTC).isoformat()
    response = None
    try:
        # NOTE(review): each host is validated through a DNS lookup separate
        # from the one requests performs; confirm whether DNS rebinding is in
        # the threat model.
        for _hop in range(MAX_URL_REDIRECTS + 1):
            try:
                response = client.get(
                    current_url,
                    headers={"User-Agent": URL_FETCH_USER_AGENT},
                    timeout=timeout_seconds,
                    stream=True,
                    allow_redirects=False,
                )
            except requests.Timeout as exc:
                raise UrlIngestError("URL fetch timed out") from exc
            except requests.RequestException as exc:
                raise UrlIngestError(f"URL fetch failed: {exc}") from exc

            if not 300 <= response.status_code < 400:
                break
            location = response.headers.get("Location")
            if not location:
                raise UrlIngestError("URL redirect is missing a Location header")
            next_url = _validate_fetch_url(urljoin(current_url, location))
            # The redirect body is never read; close it so the streamed
            # connection is released before the next hop.
            _close_response_quietly(response)
            current_url = next_url
        else:
            # Reached when every allowed hop answered with another redirect.
            raise UrlIngestError("URL has too many redirects")

        assert response is not None
        final_url = _validate_fetch_url(getattr(response, "url", current_url) or current_url)
        if response.status_code != 200:
            raise UrlIngestError(f"URL returned HTTP {response.status_code}")

        raw_content_type = response.headers.get("Content-Type")
        content_type = _content_type(raw_content_type)
        raw_bytes = _read_limited_response(response, max_bytes=max_bytes)
        if _is_html(content_type, final_url):
            # The raw header (with any charset parameter) is needed for decoding.
            markdown, title = html_to_markdown(raw_bytes, content_type=raw_content_type)
            file_name = _url_file_name(final_url, content_type="text/markdown", title=title)
            metadata_content_type = content_type or "text/html"
            return FetchedUrlSource(
                file_name=file_name,
                content_type="text/markdown",
                file_bytes=markdown.encode("utf-8"),
                metadata=_fetch_metadata(
                    source_url=url,
                    final_url=final_url,
                    content_type=metadata_content_type,
                    fetched_at=fetched_at,
                    title=title,
                    normalized_content_type="text/markdown",
                    # Size of the original HTML, not of the generated markdown.
                    size_bytes=len(raw_bytes),
                ),
            )

        file_name = _url_file_name(final_url, content_type=content_type)
        return FetchedUrlSource(
            file_name=file_name,
            content_type=content_type,
            file_bytes=raw_bytes,
            metadata=_fetch_metadata(
                source_url=url,
                final_url=final_url,
                content_type=content_type,
                fetched_at=fetched_at,
                title=None,
                normalized_content_type=None,
                size_bytes=len(raw_bytes),
            ),
        )
    finally:
        # Always release the streamed response, and the session if it was
        # created here, on success and on every error path.
        _close_response_quietly(response)
        if owns_client:
            client.close()
116
+
117
+
118
def html_to_markdown(file_bytes: bytes, *, content_type: str | None = None) -> tuple[str, str | None]:
    """Convert HTML bytes to markdown-like text using only the standard library.

    Returns ``(body, title)``.  ``body`` always ends with a single newline.
    When the page has a title and the extracted text does not already start
    with a heading, the title is prepended as a level-one heading.

    Raises:
        UrlIngestError: If no readable text could be extracted.
    """

    extractor = _ReadableHTMLParser()
    extractor.feed(_decode_html(file_bytes, content_type=content_type))
    extractor.close()

    title = _normalize_whitespace(extractor.title).strip() or None
    body = _clean_markdown_lines("".join(extractor.parts))

    starts_with_heading = body.lstrip().startswith("#")
    if title and not starts_with_heading:
        body = f"# {title}\n\n{body}".strip()

    stripped_body = body.strip()
    if not stripped_body:
        raise UrlIngestError("HTML page did not contain readable text")
    return stripped_body + "\n", title
131
+
132
+
133
+ class _ReadableHTMLParser(HTMLParser):
134
+ def __init__(self) -> None:
135
+ super().__init__(convert_charrefs=True)
136
+ self.parts: list[str] = []
137
+ self._title_parts: list[str] = []
138
+ self._skip_depth = 0
139
+ self._in_title = False
140
+
141
+ @property
142
+ def title(self) -> str:
143
+ return "".join(self._title_parts)
144
+
145
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
146
+ tag = tag.lower()
147
+ if tag in {"script", "style", "noscript", "svg", "template"}:
148
+ self._skip_depth += 1
149
+ return
150
+ if tag == "title":
151
+ self._in_title = True
152
+ return
153
+ if self._skip_depth:
154
+ return
155
+ if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
156
+ level = int(tag[1])
157
+ self.parts.append("\n\n" + "#" * level + " ")
158
+ elif tag in {"p", "div", "section", "article", "main", "header", "footer", "tr"}:
159
+ self.parts.append("\n\n")
160
+ elif tag == "li":
161
+ self.parts.append("\n- ")
162
+ elif tag == "br":
163
+ self.parts.append("\n")
164
+ elif tag in {"td", "th"}:
165
+ self.parts.append(" | ")
166
+
167
+ def handle_endtag(self, tag: str) -> None:
168
+ tag = tag.lower()
169
+ if tag in {"script", "style", "noscript", "svg", "template"} and self._skip_depth:
170
+ self._skip_depth -= 1
171
+ return
172
+ if tag == "title":
173
+ self._in_title = False
174
+ return
175
+ if self._skip_depth:
176
+ return
177
+ if tag in {
178
+ "h1",
179
+ "h2",
180
+ "h3",
181
+ "h4",
182
+ "h5",
183
+ "h6",
184
+ "p",
185
+ "div",
186
+ "section",
187
+ "article",
188
+ "main",
189
+ "li",
190
+ "tr",
191
+ }:
192
+ self.parts.append("\n")
193
+
194
+ def handle_data(self, data: str) -> None:
195
+ if self._in_title:
196
+ self._title_parts.append(data)
197
+ return
198
+ if self._skip_depth:
199
+ return
200
+ text = _normalize_whitespace(data)
201
+ if text:
202
+ self.parts.append(text)
203
+
204
+
205
def _validate_fetch_url(url: str) -> str:
    """Check that ``url`` is an http(s) URL whose host may be fetched.

    Returns the URL with surrounding whitespace stripped.

    Raises:
        UrlIngestError: If the URL cannot be parsed, uses a scheme other than
            http/https, has no host, names localhost, or is rejected by
            ``_reject_private_host``.
    """
    raw = str(url or "").strip()
    try:
        parsed = urlparse(raw)
        hostname = parsed.hostname
    except ValueError as exc:
        # urlparse raises a bare ValueError for malformed bracketed hosts
        # (e.g. "http://[::1"); surface it as the module's own error type.
        raise UrlIngestError("URL is not valid") from exc
    if parsed.scheme.lower() not in {"http", "https"}:
        raise UrlIngestError("URL ingest only supports http and https URLs")
    if not hostname:
        raise UrlIngestError("URL must include a host")
    host = hostname.strip().lower()
    if host in {"localhost", "localhost.localdomain"} or host.endswith(".localhost"):
        raise UrlIngestError("URL host is not allowed")
    _reject_private_host(host)
    return raw
217
+
218
+
219
+ def _reject_private_host(host: str) -> None:
220
+ try:
221
+ addresses = [ipaddress.ip_address(host)]
222
+ except ValueError:
223
+ try:
224
+ infos = socket.getaddrinfo(host, None, type=socket.SOCK_STREAM)
225
+ except socket.gaierror as exc:
226
+ raise UrlIngestError(f"Could not resolve URL host: {host}") from exc
227
+ addresses = []
228
+ for info in infos:
229
+ raw_address = info[4][0]
230
+ try:
231
+ addresses.append(ipaddress.ip_address(raw_address))
232
+ except ValueError:
233
+ continue
234
+ if not addresses:
235
+ raise UrlIngestError(f"Could not resolve URL host: {host}")
236
+ for address in addresses:
237
+ if (
238
+ address.is_private
239
+ or address.is_loopback
240
+ or address.is_link_local
241
+ or address.is_multicast
242
+ or address.is_reserved
243
+ or address.is_unspecified
244
+ ):
245
+ raise UrlIngestError("URL host resolves to a private or local address")
246
+
247
+
248
+ def _read_limited_response(response: requests.Response, *, max_bytes: int) -> bytes:
249
+ content_length = response.headers.get("Content-Length")
250
+ if content_length:
251
+ try:
252
+ if int(content_length) > max_bytes:
253
+ raise UrlIngestError(
254
+ f"URL response is too large; max is {max_bytes // (1024 * 1024)} MB"
255
+ )
256
+ except ValueError:
257
+ if content_length.strip().isdigit():
258
+ raise
259
+ chunks: list[bytes] = []
260
+ total = 0
261
+ for chunk in response.iter_content(chunk_size=64 * 1024):
262
+ if not chunk:
263
+ continue
264
+ total += len(chunk)
265
+ if total > max_bytes:
266
+ raise UrlIngestError(
267
+ f"URL response is too large; max is {max_bytes // (1024 * 1024)} MB"
268
+ )
269
+ chunks.append(chunk)
270
+ return b"".join(chunks)
271
+
272
+
273
+ def _content_type(value: str | None) -> str | None:
274
+ if not value:
275
+ return None
276
+ return value.split(";", 1)[0].strip().lower() or None
277
+
278
+
279
+ def _is_html(content_type: str | None, url: str) -> bool:
280
+ path = urlparse(url).path.lower()
281
+ return content_type in {"text/html", "application/xhtml+xml"} or path.endswith((".html", ".htm"))
282
+
283
+
284
+ def _decode_html(file_bytes: bytes, *, content_type: str | None) -> str:
285
+ charset = None
286
+ if content_type:
287
+ match = re.search(r"charset=([A-Za-z0-9._-]+)", content_type, flags=re.IGNORECASE)
288
+ if match:
289
+ charset = match.group(1)
290
+ for encoding in [charset, "utf-8", "latin-1"]:
291
+ if not encoding:
292
+ continue
293
+ try:
294
+ return file_bytes.decode(encoding)
295
+ except (LookupError, UnicodeDecodeError):
296
+ continue
297
+ return file_bytes.decode("utf-8", errors="replace")
298
+
299
+
300
def _clean_markdown_lines(text: str) -> str:
    """Tidy extracted text line by line.

    Each line has its whitespace normalized and is stripped; consecutive
    blank lines collapse to one; leading and trailing blank lines are
    removed.
    """
    kept: list[str] = []
    previous_blank = False
    for raw_line in text.splitlines():
        line = _normalize_whitespace(raw_line).strip()
        if line:
            kept.append(line)
            previous_blank = False
        elif kept and not previous_blank:
            # First blank line after content: keep one as a separator.
            kept.append("")
            previous_blank = True
    return "\n".join(kept).strip()
313
+
314
+
315
+ def _normalize_whitespace(text: str) -> str:
316
+ return re.sub(r"[ \t\r\f\v]+", " ", text.replace("\xa0", " "))
317
+
318
+
319
+ def _url_file_name(url: str, *, content_type: str | None, title: str | None = None) -> str:
320
+ if title:
321
+ stem = re.sub(r"[^A-Za-z0-9._-]+", "-", title).strip(".-").lower()
322
+ if stem:
323
+ return f"{stem[:80]}.md"
324
+ path_name = Path(unquote(urlparse(url).path or "")).name
325
+ if path_name and "." in path_name:
326
+ return re.sub(r"[^A-Za-z0-9._-]+", "-", path_name).strip(".-") or "url-document"
327
+ stem = re.sub(r"[^A-Za-z0-9._-]+", "-", path_name or urlparse(url).hostname or "url-document")
328
+ stem = stem.strip(".-") or "url-document"
329
+ return f"{stem[:80]}{_extension_for_content_type(content_type)}"
330
+
331
+
332
+ def _extension_for_content_type(content_type: str | None) -> str:
333
+ return {
334
+ "application/pdf": ".pdf",
335
+ "application/json": ".json",
336
+ "application/ld+json": ".json",
337
+ "application/xml": ".xml",
338
+ "text/xml": ".xml",
339
+ "text/csv": ".csv",
340
+ "text/tab-separated-values": ".tsv",
341
+ "text/markdown": ".md",
342
+ "text/x-markdown": ".md",
343
+ "text/html": ".html",
344
+ "application/xhtml+xml": ".html",
345
+ "text/plain": ".txt",
346
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
347
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
348
+ "application/vnd.ms-excel": ".xls",
349
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
350
+ "application/vnd.ms-powerpoint": ".ppt",
351
+ }.get(content_type or "", ".txt")
352
+
353
+
354
+ def _fetch_metadata(
355
+ *,
356
+ source_url: str,
357
+ final_url: str,
358
+ content_type: str | None,
359
+ fetched_at: str,
360
+ title: str | None,
361
+ normalized_content_type: str | None,
362
+ size_bytes: int,
363
+ ) -> dict[str, Any]:
364
+ metadata: dict[str, Any] = {
365
+ "source_url": source_url,
366
+ "fetched_url": final_url,
367
+ "content_type": content_type,
368
+ "fetched_at": fetched_at,
369
+ "size_bytes": size_bytes,
370
+ }
371
+ if title:
372
+ metadata["title"] = title
373
+ if normalized_content_type:
374
+ metadata["normalized_content_type"] = normalized_content_type
375
+ return metadata
@@ -0,0 +1 @@
1
+ """Shared domain models, schemas, encoders, and validation helpers."""
@@ -0,0 +1 @@
1
+ """Product-specific decoders live here."""