memuron 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memuron/__init__.py +3 -0
- memuron/actions/__init__.py +12 -0
- memuron/actions/context.py +63 -0
- memuron/actions/helpers.py +88 -0
- memuron/actions/memory.py +340 -0
- memuron/actions/memory_write.py +290 -0
- memuron/actions/nodes.py +340 -0
- memuron/actions/registry.py +5 -0
- memuron/actions/runtime.py +37 -0
- memuron/actions/spaces_documents.py +720 -0
- memuron/actions/sync.py +155 -0
- memuron/application/__init__.py +1 -0
- memuron/application/api.py +206 -0
- memuron/application/app.py +103 -0
- memuron/application/capabilities.py +82 -0
- memuron/application/cli.py +35 -0
- memuron/application/config.py +176 -0
- memuron/application/mcp.py +44 -0
- memuron/application/mcp_oauth.py +290 -0
- memuron/application/registry.py +52 -0
- memuron/context.py +532 -0
- memuron/documents/__init__.py +1 -0
- memuron/documents/link_guardian.py +192 -0
- memuron/documents/linking.py +292 -0
- memuron/documents/parser.py +1152 -0
- memuron/documents/storage.py +151 -0
- memuron/documents/url_ingest.py +375 -0
- memuron/domain/__init__.py +1 -0
- memuron/domain/decoders.py +1 -0
- memuron/domain/encoders.py +185 -0
- memuron/domain/lifecycles.py +8 -0
- memuron/domain/limits.py +6 -0
- memuron/domain/representations.py +56 -0
- memuron/domain/schemas.py +581 -0
- memuron/domain/scope_filter.py +104 -0
- memuron/graphfs/__init__.py +1 -0
- memuron/graphfs/manual.py +635 -0
- memuron/graphfs/projection.py +578 -0
- memuron/graphfs/query.py +1782 -0
- memuron/graphfs/read_model.py +574 -0
- memuron/ingest/__init__.py +1 -0
- memuron/ingest/guardian.py +213 -0
- memuron/ingest/jobs.py +424 -0
- memuron/ingest/prompts.py +147 -0
- memuron/memory/__init__.py +1 -0
- memuron/memory/engine.py +35 -0
- memuron/memory/projections.py +452 -0
- memuron/memory/recipes.py +3247 -0
- memuron/persistence/__init__.py +1 -0
- memuron/persistence/db_pool.py +57 -0
- memuron/persistence/identity_store.py +918 -0
- memuron/persistence/store_helpers.py +16 -0
- memuron/search/__init__.py +1 -0
- memuron/search/fulltext.py +110 -0
- memuron/search/hybrid.py +284 -0
- memuron/search/pgvector.py +252 -0
- memuron/security/__init__.py +1 -0
- memuron/security/auth.py +143 -0
- memuron/security/auth_provider.py +119 -0
- memuron/security/authorization.py +53 -0
- memuron/security/clerk_scopes.py +94 -0
- memuron/security/clerk_webhooks.py +61 -0
- memuron/security/jwt_tokens.py +53 -0
- memuron/security/passwords.py +38 -0
- memuron/security/tenant.py +58 -0
- memuron/spaces/__init__.py +1 -0
- memuron/spaces/model.py +35 -0
- memuron/spaces/service.py +155 -0
- memuron/sync/__init__.py +25 -0
- memuron/sync/folder.py +828 -0
- memuron-0.1.1.dist-info/METADATA +242 -0
- memuron-0.1.1.dist-info/RECORD +74 -0
- memuron-0.1.1.dist-info/WHEEL +4 -0
- memuron-0.1.1.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Durable object storage for original document uploads."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from memuron.application.config import settings
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _safe_segment(value: str, *, fallback: str) -> str:
|
|
14
|
+
cleaned = re.sub(r"[^A-Za-z0-9._-]+", "-", value).strip(".-")
|
|
15
|
+
return cleaned or fallback
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _is_configured() -> bool:
    """Report whether the bucket and both access credentials are set in settings."""
    required_settings = (
        "object_storage_bucket",
        "object_storage_access_key_id",
        "object_storage_secret_access_key",
    )
    # The generator keeps the original left-to-right, short-circuit evaluation.
    return all(getattr(settings, name) for name in required_settings)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
class SourceObject:
    """Immutable record describing one stored source file."""

    provider: str
    bucket: str
    key: str
    content_type: str
    file_name: str
    size_bytes: int
    sha256: str
    public_url: str | None = None

    def as_payload(self) -> dict[str, Any]:
        """Return the record as a plain dict; ``public_url`` appears only when set."""
        optional = {"public_url": self.public_url} if self.public_url else {}
        return {
            "provider": self.provider,
            "bucket": self.bucket,
            "key": self.key,
            "content_type": self.content_type,
            "file_name": self.file_name,
            "size_bytes": self.size_bytes,
            "sha256": self.sha256,
            **optional,
        }
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class DocumentStorage:
    """S3-compatible storage facade for original source files.

    ``self.bucket`` is read from ``settings`` once, at construction.
    Credentials, endpoint and region are read from ``settings`` every time a
    client is built.
    """

    def __init__(self) -> None:
        self.bucket = settings.object_storage_bucket

    @property
    def configured(self) -> bool:
        """True when the bucket and both access credentials are set."""
        return _is_configured()

    def _client(self) -> Any:
        """Create a boto3 S3 client using SigV4 signing and path-style addressing."""
        # Imported at call time rather than at module import time
        # (presumably so boto3 is only needed when storage is used).
        import boto3
        from botocore.config import Config

        return boto3.client(
            "s3",
            endpoint_url=settings.object_storage_endpoint_url or None,
            region_name=settings.object_storage_region or None,
            aws_access_key_id=settings.object_storage_access_key_id,
            aws_secret_access_key=settings.object_storage_secret_access_key,
            config=Config(signature_version="s3v4", s3={"addressing_style": "path"}),
        )

    def put_source(
        self,
        *,
        org_id: str | None,
        document_key: str,
        file_name: str,
        content_type: str | None,
        file_bytes: bytes,
    ) -> dict[str, Any] | None:
        """Upload an original source file and describe where it was stored.

        Args:
            org_id: Owning organisation; ``"no-org"`` is used when missing.
            document_key: Document identifier, placed in the object key as-is.
            file_name: Original file name; a sanitized form is used in the key.
            content_type: Media type; ``application/octet-stream`` when missing.
            file_bytes: File content to upload.

        Returns:
            The ``SourceObject.as_payload()`` dict for the stored object, or
            ``None`` when object storage is not configured.
        """
        if not self.configured:
            return None
        safe_org = _safe_segment(org_id or "no-org", fallback="no-org")
        safe_name = _safe_segment(file_name, fallback="document")
        key = f"org/{safe_org}/documents/{document_key}/source/{safe_name}"
        media_type = content_type or "application/octet-stream"
        digest = hashlib.sha256(file_bytes).hexdigest()
        self._client().put_object(
            Bucket=self.bucket,
            Key=key,
            Body=file_bytes,
            ContentType=media_type,
            Metadata={
                "document-key": document_key,
                "sha256": digest,
                "original-file-name": safe_name,
            },
        )
        public_url = None
        base_url = settings.object_storage_public_base_url
        if base_url:
            # Trim trailing slashes so a base configured as "https://cdn/" does
            # not produce "https://cdn//org/...".
            public_url = f"{str(base_url).rstrip('/')}/{key}"
        return SourceObject(
            provider="s3",
            bucket=self.bucket,
            key=key,
            content_type=media_type,
            file_name=file_name,
            size_bytes=len(file_bytes),
            sha256=digest,
            public_url=public_url,
        ).as_payload()

    def presign_download(self, source_object: dict[str, Any]) -> str | None:
        """Return a download URL for a payload produced by ``put_source``.

        A non-empty string ``public_url`` in the payload is returned unchanged.
        Otherwise a presigned ``get_object`` URL is generated, valid for
        ``settings.object_storage_presign_seconds`` seconds.

        Returns:
            The URL, or ``None`` when storage is not configured or the payload
            has no usable bucket/key.
        """
        public_url = source_object.get("public_url")
        if isinstance(public_url, str) and public_url:
            return public_url
        if not self.configured:
            return None
        # Check for emptiness before converting to str: str(None) is the
        # truthy string "None", which would slip past the guard below.
        bucket = source_object.get("bucket") or self.bucket
        key = source_object.get("key")
        if not bucket or not key:
            return None
        return self._client().generate_presigned_url(
            "get_object",
            Params={"Bucket": str(bucket), "Key": str(key)},
            ExpiresIn=settings.object_storage_presign_seconds,
        )
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def maybe_store_source_file(
    *,
    org_id: str | None,
    document_key: str,
    file_name: str,
    content_type: str | None,
    file_bytes: bytes,
) -> dict[str, Any] | None:
    """Upload a source file through a freshly built ``DocumentStorage``.

    Returns whatever ``DocumentStorage.put_source`` returns for the same
    keyword arguments.
    """
    storage = DocumentStorage()
    upload_arguments = {
        "org_id": org_id,
        "document_key": document_key,
        "file_name": file_name,
        "content_type": content_type,
        "file_bytes": file_bytes,
    }
    return storage.put_source(**upload_arguments)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def presign_source_object(source_object: dict[str, Any]) -> str | None:
    """Resolve a download URL for ``source_object`` via a new ``DocumentStorage``."""
    storage = DocumentStorage()
    download_url = storage.presign_download(source_object)
    return download_url
|
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""Safe URL fetching and normalization for document ingest."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ipaddress
|
|
6
|
+
import re
|
|
7
|
+
import socket
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from datetime import UTC, datetime
|
|
10
|
+
from html.parser import HTMLParser
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
from urllib.parse import unquote, urljoin, urlparse
|
|
14
|
+
|
|
15
|
+
import requests
|
|
16
|
+
|
|
17
|
+
from memuron.documents.parser import MAX_DOCUMENT_UPLOAD_BYTES
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Default for the ``timeout_seconds`` argument of ``fetch_url_source``.
DEFAULT_URL_FETCH_TIMEOUT_SECONDS = 15
# Number of redirects ``fetch_url_source`` follows before giving up.
MAX_URL_REDIRECTS = 5
# Sent as the ``User-Agent`` header on every fetch request.
URL_FETCH_USER_AGENT = "MemuronURLIngest/1.0"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class UrlIngestError(ValueError):
    """Raised when a URL cannot be safely fetched for document ingest.

    This is a ``ValueError`` subclass, so ``except ValueError`` handlers
    catch it as well.
    """
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class FetchedUrlSource:
    """Result of ``fetch_url_source``: the fetched document plus fetch metadata."""

    # Name derived by ``_url_file_name`` from the page title or the final URL.
    file_name: str
    # "text/markdown" for converted HTML; otherwise the response's media type
    # (None when the response carried no Content-Type).
    content_type: str | None
    # UTF-8 encoded markdown for HTML sources; the raw response body otherwise.
    file_bytes: bytes
    # Dict built by ``_fetch_metadata``.
    metadata: dict[str, Any]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _close_response(response: Any) -> None:
    """Release a response's connection; tolerate duck-typed objects lacking ``close``."""
    close = getattr(response, "close", None)
    if callable(close):
        close()


def _markdown_file_name(name: str) -> str:
    """Give a converted-HTML document a ``.md`` name, replacing an HTML suffix if present."""
    lowered = name.lower()
    if lowered.endswith(".md"):
        return name
    for suffix in (".xhtml", ".html", ".htm"):
        if lowered.endswith(suffix):
            name = name[: -len(suffix)]
            break
    return f"{name or 'url-document'}.md"


def fetch_url_source(
    url: str,
    *,
    timeout_seconds: int = DEFAULT_URL_FETCH_TIMEOUT_SECONDS,
    max_bytes: int = MAX_DOCUMENT_UPLOAD_BYTES,
    session: requests.Session | None = None,
) -> FetchedUrlSource:
    """Fetch and normalize a remote URL into a source document payload.

    Redirects are followed manually (at most ``MAX_URL_REDIRECTS``) so every
    hop is passed through ``_validate_fetch_url`` before it is requested.
    HTML responses are converted to markdown; anything else is returned as-is.

    Args:
        url: The http(s) URL to fetch.
        timeout_seconds: Timeout passed to each request.
        max_bytes: Largest response body accepted.
        session: Optional session to use; one is created (and closed again)
            when omitted.

    Returns:
        A ``FetchedUrlSource`` with file name, content type, bytes and metadata.

    Raises:
        UrlIngestError: If the URL is rejected, the request fails or times out,
            a redirect is malformed or the chain is too long, the final status
            is not 200, the body is too large, or an HTML page has no text.
    """

    current_url = _validate_fetch_url(url)
    client = session or requests.Session()
    # Only close a session this call created; a caller-supplied one stays open.
    owns_session = client is not session
    fetched_at = datetime.now(UTC).isoformat()
    response = None
    try:
        for _attempt in range(MAX_URL_REDIRECTS + 1):
            try:
                response = client.get(
                    current_url,
                    headers={"User-Agent": URL_FETCH_USER_AGENT},
                    timeout=timeout_seconds,
                    stream=True,
                    allow_redirects=False,
                )
            except requests.Timeout as exc:
                raise UrlIngestError("URL fetch timed out") from exc
            except requests.RequestException as exc:
                raise UrlIngestError(f"URL fetch failed: {exc}") from exc

            if not 300 <= response.status_code < 400:
                break
            location = response.headers.get("Location")
            if not location:
                raise UrlIngestError("URL redirect is missing a Location header")
            current_url = _validate_fetch_url(urljoin(current_url, location))
            # With stream=True the connection stays checked out until the
            # response is closed, so release each redirect hop explicitly.
            _close_response(response)
        else:
            # Reached when every attempt, including the last, was a redirect.
            raise UrlIngestError("URL has too many redirects")

        if response is None:  # only possible if MAX_URL_REDIRECTS is negative
            raise UrlIngestError("URL has too many redirects")
        final_url = _validate_fetch_url(getattr(response, "url", current_url) or current_url)
        if response.status_code != 200:
            raise UrlIngestError(f"URL returned HTTP {response.status_code}")

        raw_content_type = response.headers.get("Content-Type")
        content_type = _content_type(raw_content_type)
        raw_bytes = _read_limited_response(response, max_bytes=max_bytes)
    finally:
        if response is not None:
            _close_response(response)
        if owns_session:
            client.close()

    if _is_html(content_type, final_url):
        markdown, title = html_to_markdown(raw_bytes, content_type=raw_content_type)
        # Without a title the name comes from the URL path and may still end
        # in ".html" although the payload is markdown; align the extension.
        file_name = _markdown_file_name(
            _url_file_name(final_url, content_type="text/markdown", title=title)
        )
        metadata_content_type = content_type or "text/html"
        return FetchedUrlSource(
            file_name=file_name,
            content_type="text/markdown",
            file_bytes=markdown.encode("utf-8"),
            metadata=_fetch_metadata(
                source_url=url,
                final_url=final_url,
                content_type=metadata_content_type,
                fetched_at=fetched_at,
                title=title,
                normalized_content_type="text/markdown",
                # Size of the original HTML body, not of the markdown.
                size_bytes=len(raw_bytes),
            ),
        )

    file_name = _url_file_name(final_url, content_type=content_type)
    return FetchedUrlSource(
        file_name=file_name,
        content_type=content_type,
        file_bytes=raw_bytes,
        metadata=_fetch_metadata(
            source_url=url,
            final_url=final_url,
            content_type=content_type,
            fetched_at=fetched_at,
            title=None,
            normalized_content_type=None,
            size_bytes=len(raw_bytes),
        ),
    )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def html_to_markdown(file_bytes: bytes, *, content_type: str | None = None) -> tuple[str, str | None]:
    """Extract readable markdown-ish text from HTML without heavy dependencies.

    Args:
        file_bytes: Raw HTML document.
        content_type: Raw ``Content-Type`` header value, passed to
            ``_decode_html`` for charset detection.

    Returns:
        ``(markdown, title)``.  The markdown ends with one newline; when the
        page has a title and the body does not already start with ``#``, the
        title is prepended as a level-1 heading.  ``title`` is ``None`` when
        the page has none.

    Raises:
        UrlIngestError: If no text remains after extraction.
    """

    parser = _ReadableHTMLParser()
    parser.feed(_decode_html(file_bytes, content_type=content_type))
    parser.close()
    # str.split() with no argument splits on every whitespace run, newlines
    # and non-breaking spaces included, so a <title> spread over several
    # lines still becomes a single heading line.
    title = " ".join(parser.title.split()) or None
    body = _clean_markdown_lines("".join(parser.parts))
    if title and not body.lstrip().startswith("#"):
        body = f"# {title}\n\n{body}".strip()
    if not body.strip():
        raise UrlIngestError("HTML page did not contain readable text")
    return body.strip() + "\n", title
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class _ReadableHTMLParser(HTMLParser):
|
|
134
|
+
def __init__(self) -> None:
|
|
135
|
+
super().__init__(convert_charrefs=True)
|
|
136
|
+
self.parts: list[str] = []
|
|
137
|
+
self._title_parts: list[str] = []
|
|
138
|
+
self._skip_depth = 0
|
|
139
|
+
self._in_title = False
|
|
140
|
+
|
|
141
|
+
@property
|
|
142
|
+
def title(self) -> str:
|
|
143
|
+
return "".join(self._title_parts)
|
|
144
|
+
|
|
145
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
146
|
+
tag = tag.lower()
|
|
147
|
+
if tag in {"script", "style", "noscript", "svg", "template"}:
|
|
148
|
+
self._skip_depth += 1
|
|
149
|
+
return
|
|
150
|
+
if tag == "title":
|
|
151
|
+
self._in_title = True
|
|
152
|
+
return
|
|
153
|
+
if self._skip_depth:
|
|
154
|
+
return
|
|
155
|
+
if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
|
156
|
+
level = int(tag[1])
|
|
157
|
+
self.parts.append("\n\n" + "#" * level + " ")
|
|
158
|
+
elif tag in {"p", "div", "section", "article", "main", "header", "footer", "tr"}:
|
|
159
|
+
self.parts.append("\n\n")
|
|
160
|
+
elif tag == "li":
|
|
161
|
+
self.parts.append("\n- ")
|
|
162
|
+
elif tag == "br":
|
|
163
|
+
self.parts.append("\n")
|
|
164
|
+
elif tag in {"td", "th"}:
|
|
165
|
+
self.parts.append(" | ")
|
|
166
|
+
|
|
167
|
+
def handle_endtag(self, tag: str) -> None:
|
|
168
|
+
tag = tag.lower()
|
|
169
|
+
if tag in {"script", "style", "noscript", "svg", "template"} and self._skip_depth:
|
|
170
|
+
self._skip_depth -= 1
|
|
171
|
+
return
|
|
172
|
+
if tag == "title":
|
|
173
|
+
self._in_title = False
|
|
174
|
+
return
|
|
175
|
+
if self._skip_depth:
|
|
176
|
+
return
|
|
177
|
+
if tag in {
|
|
178
|
+
"h1",
|
|
179
|
+
"h2",
|
|
180
|
+
"h3",
|
|
181
|
+
"h4",
|
|
182
|
+
"h5",
|
|
183
|
+
"h6",
|
|
184
|
+
"p",
|
|
185
|
+
"div",
|
|
186
|
+
"section",
|
|
187
|
+
"article",
|
|
188
|
+
"main",
|
|
189
|
+
"li",
|
|
190
|
+
"tr",
|
|
191
|
+
}:
|
|
192
|
+
self.parts.append("\n")
|
|
193
|
+
|
|
194
|
+
def handle_data(self, data: str) -> None:
|
|
195
|
+
if self._in_title:
|
|
196
|
+
self._title_parts.append(data)
|
|
197
|
+
return
|
|
198
|
+
if self._skip_depth:
|
|
199
|
+
return
|
|
200
|
+
text = _normalize_whitespace(data)
|
|
201
|
+
if text:
|
|
202
|
+
self.parts.append(text)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _validate_fetch_url(url: str) -> str:
    """Check that ``url`` is an http(s) URL pointing at a non-local host.

    Args:
        url: Candidate URL; surrounding whitespace is ignored.

    Returns:
        The stripped URL, otherwise unchanged.

    Raises:
        UrlIngestError: If the URL cannot be parsed, is not http/https, has no
            host, names localhost, or is rejected by ``_reject_private_host``.
    """
    raw = str(url or "").strip()
    try:
        parsed = urlparse(raw)
        hostname = parsed.hostname
    except ValueError as exc:
        # urlparse raises a plain ValueError for e.g. an unclosed IPv6
        # bracket; report it as this module's own error type.
        raise UrlIngestError(f"URL could not be parsed: {exc}") from exc
    if parsed.scheme.lower() not in {"http", "https"}:
        raise UrlIngestError("URL ingest only supports http and https URLs")
    if not hostname:
        raise UrlIngestError("URL must include a host")
    # A trailing dot ("localhost.") names the same host; drop it so the
    # name-based check below cannot be sidestepped.
    host = hostname.strip().lower().rstrip(".")
    if not host:
        raise UrlIngestError("URL must include a host")
    if host in {"localhost", "localhost.localdomain"} or host.endswith(".localhost"):
        raise UrlIngestError("URL host is not allowed")
    _reject_private_host(host)
    return raw
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _reject_private_host(host: str) -> None:
|
|
220
|
+
try:
|
|
221
|
+
addresses = [ipaddress.ip_address(host)]
|
|
222
|
+
except ValueError:
|
|
223
|
+
try:
|
|
224
|
+
infos = socket.getaddrinfo(host, None, type=socket.SOCK_STREAM)
|
|
225
|
+
except socket.gaierror as exc:
|
|
226
|
+
raise UrlIngestError(f"Could not resolve URL host: {host}") from exc
|
|
227
|
+
addresses = []
|
|
228
|
+
for info in infos:
|
|
229
|
+
raw_address = info[4][0]
|
|
230
|
+
try:
|
|
231
|
+
addresses.append(ipaddress.ip_address(raw_address))
|
|
232
|
+
except ValueError:
|
|
233
|
+
continue
|
|
234
|
+
if not addresses:
|
|
235
|
+
raise UrlIngestError(f"Could not resolve URL host: {host}")
|
|
236
|
+
for address in addresses:
|
|
237
|
+
if (
|
|
238
|
+
address.is_private
|
|
239
|
+
or address.is_loopback
|
|
240
|
+
or address.is_link_local
|
|
241
|
+
or address.is_multicast
|
|
242
|
+
or address.is_reserved
|
|
243
|
+
or address.is_unspecified
|
|
244
|
+
):
|
|
245
|
+
raise UrlIngestError("URL host resolves to a private or local address")
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _read_limited_response(response: requests.Response, *, max_bytes: int) -> bytes:
|
|
249
|
+
content_length = response.headers.get("Content-Length")
|
|
250
|
+
if content_length:
|
|
251
|
+
try:
|
|
252
|
+
if int(content_length) > max_bytes:
|
|
253
|
+
raise UrlIngestError(
|
|
254
|
+
f"URL response is too large; max is {max_bytes // (1024 * 1024)} MB"
|
|
255
|
+
)
|
|
256
|
+
except ValueError:
|
|
257
|
+
if content_length.strip().isdigit():
|
|
258
|
+
raise
|
|
259
|
+
chunks: list[bytes] = []
|
|
260
|
+
total = 0
|
|
261
|
+
for chunk in response.iter_content(chunk_size=64 * 1024):
|
|
262
|
+
if not chunk:
|
|
263
|
+
continue
|
|
264
|
+
total += len(chunk)
|
|
265
|
+
if total > max_bytes:
|
|
266
|
+
raise UrlIngestError(
|
|
267
|
+
f"URL response is too large; max is {max_bytes // (1024 * 1024)} MB"
|
|
268
|
+
)
|
|
269
|
+
chunks.append(chunk)
|
|
270
|
+
return b"".join(chunks)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _content_type(value: str | None) -> str | None:
|
|
274
|
+
if not value:
|
|
275
|
+
return None
|
|
276
|
+
return value.split(";", 1)[0].strip().lower() or None
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def _is_html(content_type: str | None, url: str) -> bool:
|
|
280
|
+
path = urlparse(url).path.lower()
|
|
281
|
+
return content_type in {"text/html", "application/xhtml+xml"} or path.endswith((".html", ".htm"))
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _decode_html(file_bytes: bytes, *, content_type: str | None) -> str:
|
|
285
|
+
charset = None
|
|
286
|
+
if content_type:
|
|
287
|
+
match = re.search(r"charset=([A-Za-z0-9._-]+)", content_type, flags=re.IGNORECASE)
|
|
288
|
+
if match:
|
|
289
|
+
charset = match.group(1)
|
|
290
|
+
for encoding in [charset, "utf-8", "latin-1"]:
|
|
291
|
+
if not encoding:
|
|
292
|
+
continue
|
|
293
|
+
try:
|
|
294
|
+
return file_bytes.decode(encoding)
|
|
295
|
+
except (LookupError, UnicodeDecodeError):
|
|
296
|
+
continue
|
|
297
|
+
return file_bytes.decode("utf-8", errors="replace")
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def _clean_markdown_lines(text: str) -> str:
    """Trim every line and squeeze runs of blank lines down to one.

    Leading and trailing blank lines are removed from the result.
    """
    kept: list[str] = []
    for raw_line in text.splitlines():
        line = _normalize_whitespace(raw_line).strip()
        if line:
            kept.append(line)
        elif kept and kept[-1] != "":
            # First blank line after content: keep exactly one separator.
            kept.append("")
    return "\n".join(kept).strip()
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _normalize_whitespace(text: str) -> str:
|
|
316
|
+
return re.sub(r"[ \t\r\f\v]+", " ", text.replace("\xa0", " "))
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def _url_file_name(url: str, *, content_type: str | None, title: str | None = None) -> str:
|
|
320
|
+
if title:
|
|
321
|
+
stem = re.sub(r"[^A-Za-z0-9._-]+", "-", title).strip(".-").lower()
|
|
322
|
+
if stem:
|
|
323
|
+
return f"{stem[:80]}.md"
|
|
324
|
+
path_name = Path(unquote(urlparse(url).path or "")).name
|
|
325
|
+
if path_name and "." in path_name:
|
|
326
|
+
return re.sub(r"[^A-Za-z0-9._-]+", "-", path_name).strip(".-") or "url-document"
|
|
327
|
+
stem = re.sub(r"[^A-Za-z0-9._-]+", "-", path_name or urlparse(url).hostname or "url-document")
|
|
328
|
+
stem = stem.strip(".-") or "url-document"
|
|
329
|
+
return f"{stem[:80]}{_extension_for_content_type(content_type)}"
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def _extension_for_content_type(content_type: str | None) -> str:
|
|
333
|
+
return {
|
|
334
|
+
"application/pdf": ".pdf",
|
|
335
|
+
"application/json": ".json",
|
|
336
|
+
"application/ld+json": ".json",
|
|
337
|
+
"application/xml": ".xml",
|
|
338
|
+
"text/xml": ".xml",
|
|
339
|
+
"text/csv": ".csv",
|
|
340
|
+
"text/tab-separated-values": ".tsv",
|
|
341
|
+
"text/markdown": ".md",
|
|
342
|
+
"text/x-markdown": ".md",
|
|
343
|
+
"text/html": ".html",
|
|
344
|
+
"application/xhtml+xml": ".html",
|
|
345
|
+
"text/plain": ".txt",
|
|
346
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
|
347
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
|
348
|
+
"application/vnd.ms-excel": ".xls",
|
|
349
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
|
350
|
+
"application/vnd.ms-powerpoint": ".ppt",
|
|
351
|
+
}.get(content_type or "", ".txt")
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _fetch_metadata(
|
|
355
|
+
*,
|
|
356
|
+
source_url: str,
|
|
357
|
+
final_url: str,
|
|
358
|
+
content_type: str | None,
|
|
359
|
+
fetched_at: str,
|
|
360
|
+
title: str | None,
|
|
361
|
+
normalized_content_type: str | None,
|
|
362
|
+
size_bytes: int,
|
|
363
|
+
) -> dict[str, Any]:
|
|
364
|
+
metadata: dict[str, Any] = {
|
|
365
|
+
"source_url": source_url,
|
|
366
|
+
"fetched_url": final_url,
|
|
367
|
+
"content_type": content_type,
|
|
368
|
+
"fetched_at": fetched_at,
|
|
369
|
+
"size_bytes": size_bytes,
|
|
370
|
+
}
|
|
371
|
+
if title:
|
|
372
|
+
metadata["title"] = title
|
|
373
|
+
if normalized_content_type:
|
|
374
|
+
metadata["normalized_content_type"] = normalized_content_type
|
|
375
|
+
return metadata
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Shared domain models, schemas, encoders, and validation helpers."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Product-specific decoders live here."""
|