memuron 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memuron/__init__.py +3 -0
- memuron/actions/__init__.py +12 -0
- memuron/actions/context.py +63 -0
- memuron/actions/helpers.py +88 -0
- memuron/actions/memory.py +340 -0
- memuron/actions/memory_write.py +290 -0
- memuron/actions/nodes.py +340 -0
- memuron/actions/registry.py +5 -0
- memuron/actions/runtime.py +37 -0
- memuron/actions/spaces_documents.py +720 -0
- memuron/actions/sync.py +155 -0
- memuron/application/__init__.py +1 -0
- memuron/application/api.py +206 -0
- memuron/application/app.py +103 -0
- memuron/application/capabilities.py +82 -0
- memuron/application/cli.py +35 -0
- memuron/application/config.py +176 -0
- memuron/application/mcp.py +44 -0
- memuron/application/mcp_oauth.py +290 -0
- memuron/application/registry.py +52 -0
- memuron/context.py +532 -0
- memuron/documents/__init__.py +1 -0
- memuron/documents/link_guardian.py +192 -0
- memuron/documents/linking.py +292 -0
- memuron/documents/parser.py +1152 -0
- memuron/documents/storage.py +151 -0
- memuron/documents/url_ingest.py +375 -0
- memuron/domain/__init__.py +1 -0
- memuron/domain/decoders.py +1 -0
- memuron/domain/encoders.py +185 -0
- memuron/domain/lifecycles.py +8 -0
- memuron/domain/limits.py +6 -0
- memuron/domain/representations.py +56 -0
- memuron/domain/schemas.py +581 -0
- memuron/domain/scope_filter.py +104 -0
- memuron/graphfs/__init__.py +1 -0
- memuron/graphfs/manual.py +635 -0
- memuron/graphfs/projection.py +578 -0
- memuron/graphfs/query.py +1782 -0
- memuron/graphfs/read_model.py +574 -0
- memuron/ingest/__init__.py +1 -0
- memuron/ingest/guardian.py +213 -0
- memuron/ingest/jobs.py +424 -0
- memuron/ingest/prompts.py +147 -0
- memuron/memory/__init__.py +1 -0
- memuron/memory/engine.py +35 -0
- memuron/memory/projections.py +452 -0
- memuron/memory/recipes.py +3247 -0
- memuron/persistence/__init__.py +1 -0
- memuron/persistence/db_pool.py +57 -0
- memuron/persistence/identity_store.py +918 -0
- memuron/persistence/store_helpers.py +16 -0
- memuron/search/__init__.py +1 -0
- memuron/search/fulltext.py +110 -0
- memuron/search/hybrid.py +284 -0
- memuron/search/pgvector.py +252 -0
- memuron/security/__init__.py +1 -0
- memuron/security/auth.py +143 -0
- memuron/security/auth_provider.py +119 -0
- memuron/security/authorization.py +53 -0
- memuron/security/clerk_scopes.py +94 -0
- memuron/security/clerk_webhooks.py +61 -0
- memuron/security/jwt_tokens.py +53 -0
- memuron/security/passwords.py +38 -0
- memuron/security/tenant.py +58 -0
- memuron/spaces/__init__.py +1 -0
- memuron/spaces/model.py +35 -0
- memuron/spaces/service.py +155 -0
- memuron/sync/__init__.py +25 -0
- memuron/sync/folder.py +828 -0
- memuron-0.1.1.dist-info/METADATA +242 -0
- memuron-0.1.1.dist-info/RECORD +74 -0
- memuron-0.1.1.dist-info/WHEEL +4 -0
- memuron-0.1.1.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,1152 @@
|
|
|
1
|
+
"""Document parsing helpers for Memuron rich nodes.
|
|
2
|
+
|
|
3
|
+
This module deliberately stops at normalized markdown and deterministic chunks.
|
|
4
|
+
The Memuron recipe layer decides how those chunks become semantic events.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import base64
|
|
10
|
+
import datetime as dt
|
|
11
|
+
import io
|
|
12
|
+
import json
|
|
13
|
+
import re
|
|
14
|
+
import struct
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import requests
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Largest upload accepted by parse_source, in bytes (8 MiB).
MAX_DOCUMENT_UPLOAD_BYTES = 8 * 1024**2
# Largest number of non-empty cells accepted from a single workbook.
MAX_SPREADSHEET_CELLS = 100_000
# Chunk sizing defaults; the chunker that reads them is outside this view.
DEFAULT_TARGET_CHARS = 1_600
DEFAULT_MAX_CHARS = 2_800
# Chat-completions endpoint used by ImageDescriber.
OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"
# Instruction sent with every image; the model is asked for strict JSON with
# the keys description, include_in_graph, image_kind and reason.
IMAGE_DESCRIPTION_PROMPT = (
    "Analyze this image for a semantic memory graph. Return strict JSON only with keys: "
    "description, include_in_graph, image_kind, reason. The description should be concrete and "
    "capture visible text, diagrams, entities, relationships, layout, and likely intent. "
    "Set include_in_graph=true for images with durable semantic value, such as diagrams, "
    "mindmaps, charts, screenshots, document pages, UI states, photos of meaningful objects, "
    "or figures that materially explain the document. Set include_in_graph=false for logos, "
    "letterheads, watermarks, decorative graphics, repeated branding, tiny icons, separators, "
    "background textures, and images that add no retrievable meaning. image_kind should be one "
    "of diagram, screenshot, chart, document_page, photo, figure, logo, letterhead, watermark, "
    "decorative, icon, unknown."
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class DocumentParseError(ValueError):
    """Signal that an uploaded source could not be turned into useful markdown.

    Subclasses ``ValueError`` so callers that already treat bad input as a
    value error keep working.
    """
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(frozen=True)
class DocumentBlock:
    """Immutable record for one block of document text.

    The code that produces and consumes these blocks is outside this view, so
    the field notes below only restate what the declarations show.
    """

    # Block text and the page it was found on.
    text: str
    page_number: int
    # Block category; defaults to plain text.
    kind: str = "text"
    # Set only for headings (presumably the markdown level 1-6 -- confirm).
    heading_level: int | None = None
    # Character offsets of the block (presumably into the markdown -- confirm).
    char_start: int = 0
    char_end: int = 0
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass(frozen=True)
class ParsedChunk:
    """Immutable retrievable chunk of a parsed document plus its location."""

    index: int
    text: str
    # Each range is a (start, end) pair; inclusivity is not visible here.
    page_range: tuple[int, int]
    element_range: tuple[int, int]
    char_range: tuple[int, int]
    bboxes: list[dict[str, Any]] = field(default_factory=list)

    def to_location(self) -> dict[str, Any]:
        """Return the chunk location as a dict of fresh lists.

        Tuples become lists and ``bboxes`` is shallow-copied, so the returned
        containers can be mutated without touching this chunk.
        """
        location: dict[str, Any] = {}
        for key in ("page_range", "element_range", "char_range", "bboxes"):
            location[key] = list(getattr(self, key))
        return location
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass(frozen=True)
class ImagePerception:
    """Immutable result of describing one image.

    The fields mirror the JSON keys requested by IMAGE_DESCRIPTION_PROMPT.
    """

    description: str
    # Whether the image should contribute to the graph; defaults to keeping it.
    include_in_graph: bool = True
    image_kind: str = "unknown"
    reason: str = ""
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass(frozen=True)
class ParsedImage:
    """Immutable record for one image found in (or uploaded as) a source."""

    # Zero-based position of the image within its source.
    index: int
    file_name: str
    media_type: str
    description: str
    # Perception verdict copied onto the image record.
    include_in_graph: bool = True
    image_kind: str = "unknown"
    reason: str = ""
    # 0 is the default when no page is recorded.
    page_number: int = 0
    size_bytes: int = 0
    # Origin label; "upload" is the default.
    source: str = "upload"
    metadata: dict[str, Any] = field(default_factory=dict)
    # Original image payload, when the parser chose to keep it.
    raw_bytes: bytes | None = None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass(frozen=True)
class ParsedDocument:
    """Immutable result of parsing one source: markdown, chunks and metadata."""

    file_name: str
    media_type: str
    # Parser family label, e.g. "pdf", "docx", "excel", "powerpoint", "text".
    source_type: str
    markdown: str
    chunks: list[ParsedChunk]
    # Pages, sheets or slides, depending on the source type.
    page_count: int = 0
    # Page numbers that yielded no text (filled by the PDF parser).
    unreadable_pages: list[int] = field(default_factory=list)
    source_metadata: dict[str, Any] = field(default_factory=dict)
    images: list[ParsedImage] = field(default_factory=list)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*\S)\s*$")
|
|
111
|
+
_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".gif"}
|
|
112
|
+
_MARKDOWN_EXTS = {".md", ".markdown"}
|
|
113
|
+
_TEXT_EXTS = {
|
|
114
|
+
".txt",
|
|
115
|
+
".text",
|
|
116
|
+
".csv",
|
|
117
|
+
".tsv",
|
|
118
|
+
".json",
|
|
119
|
+
".jsonl",
|
|
120
|
+
".yaml",
|
|
121
|
+
".yml",
|
|
122
|
+
".html",
|
|
123
|
+
".htm",
|
|
124
|
+
".xml",
|
|
125
|
+
".rtf",
|
|
126
|
+
}
|
|
127
|
+
_DOCX_EXTS = {".docx"}
|
|
128
|
+
_EXCEL_EXTS = {".xlsx", ".xlsm"}
|
|
129
|
+
_LEGACY_EXCEL_EXTS = {".xls"}
|
|
130
|
+
_POWERPOINT_EXTS = {".pptx"}
|
|
131
|
+
_LEGACY_POWERPOINT_EXTS = {".ppt"}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def parse_source(
    *,
    file_name: str,
    content_type: str | None,
    file_bytes: bytes,
    describe_images: bool = False,
    vlm_api_key: str | None = None,
    vlm_model: str = "perceptron/perceptron-mk1",
    vlm_timeout_seconds: int = 60,
) -> ParsedDocument:
    """Turn an uploaded source into markdown plus retrievable chunks.

    The parser is picked from the file extension or, failing that, the media
    type. Checks run in a fixed order: PDF, DOCX, Excel, legacy Excel, PPTX,
    legacy PPT, markdown, generic text, image.

    Args:
        file_name: Client-supplied name; only its final path component is used.
        content_type: Declared media type, if any; parameters after ``;`` are
            dropped. When missing, the type is guessed from the extension.
        file_bytes: Raw upload content.
        describe_images: Whether image parsers should ask the VLM for a description.
        vlm_api_key: OpenRouter API key handed to the image describer.
        vlm_model: Model name handed to the image describer.
        vlm_timeout_seconds: Request timeout handed to the image describer.

    Returns:
        The parsed document.

    Raises:
        DocumentParseError: If the upload is empty, exceeds
            MAX_DOCUMENT_UPLOAD_BYTES, or has an unsupported type.
    """
    if not file_bytes:
        raise DocumentParseError("Uploaded file is empty")
    if len(file_bytes) > MAX_DOCUMENT_UPLOAD_BYTES:
        limit_mb = MAX_DOCUMENT_UPLOAD_BYTES // (1024 * 1024)
        raise DocumentParseError(f"Uploaded file is too large; max is {limit_mb} MB")

    # Keep only the final path component so directory parts never leak through.
    safe_name = Path(file_name or "document").name
    ext = Path(safe_name).suffix.lower()
    declared_type = content_type or _guess_media_type(ext)
    media_type = declared_type.split(";", 1)[0].strip()
    mime = media_type.lower()
    describer = ImageDescriber(
        api_key=vlm_api_key,
        model=vlm_model,
        timeout_seconds=vlm_timeout_seconds,
    )
    vision_options = {"describe_images": describe_images, "image_describer": describer}

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    pptx_mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    excel_mimes = {
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-excel.sheet.macroenabled.12",
    }

    if ext == ".pdf" or mime == "application/pdf":
        return _parse_pdf(safe_name, media_type, file_bytes, **vision_options)
    if ext in _DOCX_EXTS or mime == docx_mime:
        return _parse_docx(safe_name, media_type, file_bytes)
    if ext in _EXCEL_EXTS or mime in excel_mimes:
        return _parse_excel(safe_name, media_type, file_bytes)
    if ext in _LEGACY_EXCEL_EXTS or mime == "application/vnd.ms-excel":
        return _parse_legacy_excel(safe_name, media_type, file_bytes)
    if ext in _POWERPOINT_EXTS or mime == pptx_mime:
        return _parse_pptx(safe_name, media_type, file_bytes)
    if ext in _LEGACY_POWERPOINT_EXTS or mime == "application/vnd.ms-powerpoint":
        return _parse_legacy_ppt(safe_name, media_type, file_bytes)

    if ext in _MARKDOWN_EXTS or mime in {"text/markdown", "text/x-markdown"}:
        # Markdown passes through as-is, normalized to one trailing newline.
        markdown = _decode_text(file_bytes, safe_name).strip() + "\n"
        return _parsed_from_markdown(
            file_name=safe_name,
            media_type="text/markdown",
            source_type="markdown",
            markdown=markdown,
            metadata={"char_count": len(markdown)},
        )

    if ext in _TEXT_EXTS or mime.startswith("text/"):
        # Other text formats get a title heading built from the file stem.
        text = _decode_text(file_bytes, safe_name).strip()
        return _parsed_from_markdown(
            file_name=safe_name,
            media_type=media_type or "text/plain",
            source_type="text",
            markdown=f"# {Path(safe_name).stem}\n\n{text}\n",
            metadata={"char_count": len(text)},
        )

    if ext in _IMAGE_EXTS or mime.startswith("image/"):
        return _parse_image(safe_name, media_type, file_bytes, **vision_options)

    raise DocumentParseError(
        "Unsupported file type. Supported: PDF, DOCX, XLS/XLSX/XLSM, PPT/PPTX, Markdown, "
        "common text formats, and PNG/JPEG/WebP/GIF image metadata."
    )
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
class ImageDescriber:
    """Client that asks an OpenRouter vision model to describe an image."""

    def __init__(
        self,
        *,
        api_key: str | None,
        model: str,
        timeout_seconds: int,
    ) -> None:
        """Store the connection settings; a missing key becomes an empty string."""
        self.api_key = api_key.strip() if api_key else ""
        self.model = model
        self.timeout_seconds = timeout_seconds

    def describe(
        self,
        image_bytes: bytes,
        *,
        media_type: str,
        file_name: str,
        default_include: bool = True,
    ) -> ImagePerception | None:
        """Describe one image via the chat-completions endpoint.

        Args:
            image_bytes: Raw image content, sent inline as a data URL.
            media_type: Media type used when building the data URL.
            file_name: Used only in error messages.
            default_include: Passed through to the response parser.

        Returns:
            The parsed perception, or ``None`` when no API key is configured.

        Raises:
            DocumentParseError: If the request fails, the service answers with
                HTTP 400 or above, or the response body lacks the expected shape.
        """
        if not self.api_key:
            return None

        user_content = [
            {"type": "text", "text": IMAGE_DESCRIPTION_PROMPT},
            {"type": "image_url", "image_url": {"url": _image_data_url(image_bytes, media_type)}},
        ]
        request_body: dict[str, Any] = {
            "model": self.model,
            "messages": [{"role": "user", "content": user_content}],
        }
        request_headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        try:
            response = requests.post(
                OPENROUTER_CHAT_URL,
                headers=request_headers,
                json=request_body,
                timeout=self.timeout_seconds,
            )
        except requests.RequestException as exc:
            raise DocumentParseError(f"Image VLM request failed for {file_name}: {exc}") from exc

        if response.status_code >= 400:
            raise DocumentParseError(
                f"Image VLM returned HTTP {response.status_code} for {file_name}: {response.text[:500]}"
            )

        try:
            content = response.json()["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            raise DocumentParseError(f"Image VLM returned invalid response for {file_name}") from exc

        if isinstance(content, list):
            # Multi-part answer: keep the non-empty text parts, one per line.
            text_parts = (
                str(item.get("text", "")).strip()
                for item in content
                if isinstance(item, dict) and item.get("type") == "text"
            )
            content = "\n".join(filter(None, text_parts))

        answer = str(content or "").strip()
        return _parse_image_perception(answer, default_include=default_include)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def _parse_pdf(
    file_name: str,
    media_type: str,
    file_bytes: bytes,
    *,
    describe_images: bool,
    image_describer: ImageDescriber,
) -> ParsedDocument:
    """Parse a PDF into per-page markdown sections plus image descriptions.

    Pages that yield no text are recorded as unreadable and handed to the
    image extractor. Only images flagged ``include_in_graph`` are added to the
    markdown; every extracted image is still returned on the document.

    Args:
        file_name: Sanitized file name.
        media_type: Declared media type; falls back to ``application/pdf``.
        file_bytes: Raw PDF content.
        describe_images: Forwarded to the image extractor.
        image_describer: Forwarded to the image extractor.

    Returns:
        The parsed document with ``source_type="pdf"``.

    Raises:
        DocumentParseError: If pypdf is missing, the PDF cannot be opened or
            its pages cannot be listed, it has no pages, or nothing
            extractable was found.
    """
    try:
        from pypdf import PdfReader
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("PDF parsing requires pypdf to be installed") from exc

    try:
        reader = PdfReader(io.BytesIO(file_bytes))
        # pypdf resolves the page tree lazily, so encrypted or corrupt files
        # can fail here rather than in the constructor; list the pages inside
        # the guard so those failures surface as DocumentParseError.
        pages = list(reader.pages)
    except Exception as exc:
        raise DocumentParseError(f"Could not open PDF: {exc}") from exc

    if not pages:
        raise DocumentParseError("PDF has no pages")
    page_count = len(pages)

    parts: list[str] = []
    unreadable: list[int] = []
    for page_index, page in enumerate(pages, start=1):
        try:
            text = (page.extract_text() or "").strip()
        except Exception:
            # Best effort: a page that cannot be extracted counts as unreadable.
            text = ""
        if not text:
            unreadable.append(page_index)
            continue
        parts.append(f"# Page {page_index}\n\n{text}")

    images = _extract_pdf_images(
        file_name,
        file_bytes,
        unreadable_pages=unreadable,
        describe_images=describe_images,
        image_describer=image_describer,
    )
    graph_images = [image for image in images if image.include_in_graph]
    parts.extend(
        f"# Image {image.index + 1} on page {image.page_number}\n\n{image.description}"
        for image in graph_images
    )

    markdown = "\n\n".join(parts).strip() + "\n"
    if not markdown.strip():
        raise DocumentParseError("No extractable text found in PDF")

    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type or "application/pdf",
        source_type="pdf",
        markdown=markdown,
        page_count=page_count,
        unreadable_pages=unreadable,
        metadata={
            "page_count": page_count,
            "unreadable_pages": unreadable,
            "parser": "pypdf",
            "image_count": len(images),
            "graph_image_count": len(graph_images),
            "skipped_image_count": len(images) - len(graph_images),
        },
        images=images,
    )
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _parse_docx(file_name: str, media_type: str, file_bytes: bytes) -> ParsedDocument:
    """Parse a DOCX body into markdown, keeping paragraphs and tables in order.

    Paragraphs styled "Heading 1" to "Heading 6" become markdown headings, and
    tables become markdown tables. Each page break found in a paragraph
    advances the page counter and emits a ``# Page N`` marker.

    Raises:
        DocumentParseError: If python-docx is missing, the file cannot be
            opened, or it contains neither text nor tables.
    """
    try:
        from docx import Document
        from docx.table import Table
        from docx.text.paragraph import Paragraph
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("DOCX parsing requires python-docx to be installed") from exc

    try:
        document = Document(io.BytesIO(file_bytes))
    except Exception as exc:
        raise DocumentParseError(f"Could not open DOCX: {exc}") from exc

    heading_style = re.compile(r"Heading\s+([1-6])$", flags=re.IGNORECASE)
    sections: list[str] = [f"# {Path(file_name).stem}"]
    current_page = 1
    paragraphs_seen = 0
    tables_seen = 0

    # Walk the raw body children so paragraphs and tables keep document order.
    for element in document.element.body.iterchildren():
        tag = element.tag
        if tag.endswith("}p"):
            para = Paragraph(element, document)
            body_text = para.text.strip()
            if body_text:
                paragraphs_seen += 1
                style_label = str(getattr(para.style, "name", "") or "")
                level = heading_style.match(style_label)
                if level is None:
                    sections.append(body_text)
                else:
                    sections.append(f"{'#' * int(level.group(1))} {body_text}")
            for _ in range(_docx_page_break_count(para)):
                current_page += 1
                sections.append(f"# Page {current_page}")
        elif tag.endswith("}tbl"):
            grid = [[cell.text.strip() for cell in row.cells] for row in Table(element, document).rows]
            table_markdown = _rows_to_markdown_table(grid)
            if table_markdown:
                tables_seen += 1
                sections.append(table_markdown)

    if paragraphs_seen == 0 and tables_seen == 0:
        raise DocumentParseError("No extractable text or tables found in DOCX")

    markdown = "\n\n".join(section for section in sections if section.strip()).strip() + "\n"
    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type
        or "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        source_type="docx",
        markdown=markdown,
        page_count=current_page,
        metadata={
            "parser": "python-docx",
            "paragraph_count": paragraphs_seen,
            "table_count": tables_seen,
            "page_count": current_page,
        },
    )
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def _docx_page_break_count(paragraph: Any) -> int:
|
|
428
|
+
namespace = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
|
|
429
|
+
count = 1 if bool(paragraph.paragraph_format.page_break_before) else 0
|
|
430
|
+
for run in paragraph.runs:
|
|
431
|
+
count += sum(
|
|
432
|
+
1
|
|
433
|
+
for break_element in run._r.findall(f".//{namespace}br")
|
|
434
|
+
if break_element.get(f"{namespace}type") == "page"
|
|
435
|
+
)
|
|
436
|
+
return count
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def _parse_excel(file_name: str, media_type: str, file_bytes: bytes) -> ParsedDocument:
    """Read an XLSX/XLSM workbook with openpyxl and render it as markdown.

    The workbook is opened read-only with cached formula values
    (``data_only=True``). Trailing empty cells are trimmed and fully empty
    rows are dropped before the sheets are handed to the shared renderer.

    Raises:
        DocumentParseError: If openpyxl is missing or the workbook cannot be
            opened; the shared renderer may raise it as well.
    """
    try:
        from openpyxl import load_workbook
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("Excel parsing requires openpyxl to be installed") from exc

    try:
        workbook = load_workbook(
            io.BytesIO(file_bytes),
            read_only=True,
            data_only=True,
            keep_links=False,
        )
    except Exception as exc:
        raise DocumentParseError(f"Could not open Excel workbook: {exc}") from exc

    sheets: list[tuple[str, list[list[str]]]] = []
    try:
        for worksheet in workbook.worksheets:
            kept_rows: list[list[str]] = []
            for raw_row in worksheet.iter_rows(values_only=True):
                cells = [_excel_cell_text(cell) for cell in raw_row]
                while cells and not cells[-1]:
                    cells.pop()
                if any(cells):
                    kept_rows.append(cells)
            sheets.append((worksheet.title, kept_rows))
    finally:
        # Always release the workbook, even if a sheet fails mid-read.
        workbook.close()

    fallback_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    return _parsed_from_spreadsheet(
        file_name=file_name,
        media_type=media_type or fallback_type,
        sheets=sheets,
        parser="openpyxl",
    )
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def _parse_legacy_excel(file_name: str, media_type: str, file_bytes: bytes) -> ParsedDocument:
    """Read a legacy XLS workbook with xlrd and render it as markdown.

    Every cell of every sheet is converted to text; trimming and the cell
    limit are applied by the shared spreadsheet renderer.

    Raises:
        DocumentParseError: If xlrd is missing or the workbook cannot be
            opened; the shared renderer may raise it as well.
    """
    try:
        import xlrd
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("Legacy Excel parsing requires xlrd to be installed") from exc

    try:
        workbook = xlrd.open_workbook(file_contents=file_bytes, on_demand=True)
    except Exception as exc:
        raise DocumentParseError(f"Could not open legacy Excel workbook: {exc}") from exc

    sheets: list[tuple[str, list[list[str]]]] = []
    try:
        for worksheet in workbook.sheets():
            grid: list[list[str]] = []
            for row_index in range(worksheet.nrows):
                grid.append(
                    [
                        _excel_cell_text(worksheet.cell_value(row_index, col_index))
                        for col_index in range(worksheet.ncols)
                    ]
                )
            sheets.append((worksheet.name, grid))
    finally:
        # Always release the workbook resources, even if a sheet fails mid-read.
        workbook.release_resources()

    return _parsed_from_spreadsheet(
        file_name=file_name,
        media_type=media_type or "application/vnd.ms-excel",
        sheets=sheets,
        parser="xlrd",
    )
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def _parsed_from_spreadsheet(
    *,
    file_name: str,
    media_type: str,
    sheets: list[tuple[str, list[list[str]]]],
    parser: str,
) -> ParsedDocument:
    """Render extracted sheets as markdown tables and build the parsed document.

    Each sheet with at least one non-empty row becomes a ``## Sheet: <name>``
    section with one table. Rows are right-trimmed, empty rows are dropped,
    and the remaining rows are padded to the sheet's widest row.

    Args:
        file_name: Sanitized file name; its stem becomes the title heading.
        media_type: Media type recorded on the result.
        sheets: ``(sheet name, rows of cell text)`` pairs.
        parser: Name of the library that read the workbook, kept in metadata.

    Raises:
        DocumentParseError: If the workbook holds more than
            MAX_SPREADSHEET_CELLS non-empty cells, or none at all.
    """
    sections: list[str] = [f"# {Path(file_name).stem}"]
    sheet_summaries: list[dict[str, Any]] = []
    total_cells = 0

    for sheet_name, raw_rows in sheets:
        kept: list[list[str]] = []
        sheet_cells = 0
        for raw_row in raw_rows:
            cells = list(raw_row)
            while cells and not cells[-1]:
                cells.pop()
            filled = len([cell for cell in cells if cell])
            if filled == 0:
                continue
            sheet_cells += filled
            total_cells += filled
            # The limit spans the whole workbook, not a single sheet.
            if total_cells > MAX_SPREADSHEET_CELLS:
                raise DocumentParseError(
                    f"Excel workbook exceeds the {MAX_SPREADSHEET_CELLS:,} non-empty cell limit"
                )
            kept.append(cells)
        if not kept:
            continue

        width = max(map(len, kept))
        padded = [cells + [""] * (width - len(cells)) for cells in kept]
        sections.append(f"## Sheet: {sheet_name}")
        sections.append(_rows_to_markdown_table(padded))
        sheet_summaries.append(
            {
                "name": sheet_name,
                "row_count": len(padded),
                "column_count": width,
                "non_empty_cell_count": sheet_cells,
            }
        )

    if not sheet_summaries:
        raise DocumentParseError("No non-empty cells found in Excel workbook")

    sheet_total = len(sheet_summaries)
    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type,
        source_type="excel",
        markdown="\n\n".join(sections).strip() + "\n",
        page_count=sheet_total,
        metadata={
            "parser": parser,
            "sheet_count": sheet_total,
            "non_empty_cell_count": total_cells,
            "sheets": sheet_summaries,
        },
    )
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def _parse_pptx(file_name: str, media_type: str, file_bytes: bytes) -> ParsedDocument:
    """Parse a PPTX deck into one markdown section per slide.

    For each slide, the text frames and tables of its shapes are collected in
    shape order, followed by any speaker notes. Slides with no content get no
    markdown section but are still listed in the metadata.

    Args:
        file_name: Sanitized file name; its stem becomes the title heading.
        media_type: Declared media type; falls back to the PPTX media type.
        file_bytes: Raw PPTX content.

    Returns:
        The parsed document with ``source_type="powerpoint"``.

    Raises:
        DocumentParseError: If python-pptx is missing, the file cannot be
            opened, or no slide yields text or tables.
    """
    try:
        from pptx import Presentation
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("PPTX parsing requires python-pptx to be installed") from exc

    try:
        presentation = Presentation(io.BytesIO(file_bytes))
    except Exception as exc:
        raise DocumentParseError(f"Could not open PPTX: {exc}") from exc

    parts: list[str] = [f"# {Path(file_name).stem}"]
    slide_metadata: list[dict[str, Any]] = []
    for slide_number, slide in enumerate(presentation.slides, start=1):
        slide_parts: list[str] = []
        table_count = 0
        for shape in slide.shapes:
            if getattr(shape, "has_text_frame", False):
                lines = (paragraph.text.strip() for paragraph in shape.text_frame.paragraphs)
                text = "\n".join(line for line in lines if line)
                if text:
                    slide_parts.append(text)
            if getattr(shape, "has_table", False):
                rows = [[cell.text.strip() for cell in row.cells] for row in shape.table.rows]
                rendered = _rows_to_markdown_table(rows)
                if rendered:
                    table_count += 1
                    slide_parts.append(rendered)

        notes_text = ""
        if slide.has_notes_slide:
            # notes_text_frame is None when the notes page has no notes
            # placeholder; treat that as "no notes" instead of crashing.
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame is not None:
                note_lines = (paragraph.text.strip() for paragraph in notes_frame.paragraphs)
                notes_text = "\n".join(line for line in note_lines if line)
        if notes_text:
            slide_parts.append(f"### Speaker notes\n\n{notes_text}")

        if slide_parts:
            parts.append(f"# Slide {slide_number}\n\n" + "\n\n".join(slide_parts))
        slide_metadata.append(
            {
                "slide_number": slide_number,
                "table_count": table_count,
                "has_notes": bool(notes_text),
            }
        )

    # Only the title heading is present: no slide produced any content.
    if len(parts) == 1:
        raise DocumentParseError("No extractable text or tables found in PPTX")

    slide_count = len(presentation.slides)
    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type
        or "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        source_type="powerpoint",
        markdown="\n\n".join(parts).strip() + "\n",
        page_count=slide_count,
        metadata={
            "parser": "python-pptx",
            "slide_count": slide_count,
            "slides": slide_metadata,
        },
    )
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
def _parse_legacy_ppt(file_name: str, media_type: str, file_bytes: bytes) -> ParsedDocument:
    """Extract text from a legacy PPT file via its OLE document stream.

    The "PowerPoint Document" stream is read with olefile and scanned for text
    records; the fragments become paragraphs under a title heading.

    Raises:
        DocumentParseError: If olefile is missing, the container cannot be
            opened, the stream is absent, or no text is found.
    """
    try:
        import olefile
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("Legacy PPT parsing requires olefile to be installed") from exc

    try:
        container = olefile.OleFileIO(io.BytesIO(file_bytes))
    except Exception as exc:
        raise DocumentParseError(f"Could not open legacy PPT: {exc}") from exc

    stream_name = "PowerPoint Document"
    try:
        if not container.exists(stream_name):
            raise DocumentParseError("Legacy PPT does not contain a PowerPoint Document stream")
        stream = container.openstream(stream_name).read()
    finally:
        # Close the container whether or not the stream was found.
        container.close()

    fragments = _legacy_ppt_text_fragments(stream)
    if not fragments:
        raise DocumentParseError("No extractable text found in legacy PPT")

    body = "\n\n".join(fragments)
    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type or "application/vnd.ms-powerpoint",
        source_type="powerpoint",
        markdown=f"# {Path(file_name).stem}\n\n" + body + "\n",
        metadata={"parser": "olefile", "text_fragment_count": len(fragments)},
    )
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
def _legacy_ppt_text_fragments(stream: bytes) -> list[str]:
|
|
667
|
+
fragments: list[str] = []
|
|
668
|
+
|
|
669
|
+
def walk(start: int, end: int) -> None:
|
|
670
|
+
offset = start
|
|
671
|
+
while offset + 8 <= end:
|
|
672
|
+
options, record_type, length = struct.unpack_from("<HHI", stream, offset)
|
|
673
|
+
payload_start = offset + 8
|
|
674
|
+
payload_end = payload_start + length
|
|
675
|
+
if payload_end > end:
|
|
676
|
+
break
|
|
677
|
+
record_version = options & 0xF
|
|
678
|
+
if record_version == 0xF:
|
|
679
|
+
walk(payload_start, payload_end)
|
|
680
|
+
elif record_type == 4000:
|
|
681
|
+
_append_text_fragment(fragments, stream[payload_start:payload_end].decode("utf-16le", errors="ignore"))
|
|
682
|
+
elif record_type == 4008:
|
|
683
|
+
_append_text_fragment(fragments, stream[payload_start:payload_end].decode("cp1252", errors="ignore"))
|
|
684
|
+
offset = payload_end
|
|
685
|
+
|
|
686
|
+
walk(0, len(stream))
|
|
687
|
+
return fragments
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
def _append_text_fragment(fragments: list[str], text: str) -> None:
|
|
691
|
+
normalized = re.sub(r"\s+", " ", text).strip("\x00 \t\r\n")
|
|
692
|
+
if normalized and (not fragments or fragments[-1] != normalized):
|
|
693
|
+
fragments.append(normalized)
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def _excel_cell_text(value: Any) -> str:
|
|
697
|
+
if value is None:
|
|
698
|
+
return ""
|
|
699
|
+
if isinstance(value, (dt.datetime, dt.date, dt.time)):
|
|
700
|
+
return value.isoformat()
|
|
701
|
+
if isinstance(value, bool):
|
|
702
|
+
return "TRUE" if value else "FALSE"
|
|
703
|
+
return str(value).strip()
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
def _rows_to_markdown_table(rows: list[list[str]]) -> str:
|
|
707
|
+
if not rows:
|
|
708
|
+
return ""
|
|
709
|
+
width = max((len(row) for row in rows), default=0)
|
|
710
|
+
if width == 0:
|
|
711
|
+
return ""
|
|
712
|
+
normalized = [row + [""] * (width - len(row)) for row in rows]
|
|
713
|
+
header = normalized[0]
|
|
714
|
+
body = normalized[1:]
|
|
715
|
+
|
|
716
|
+
def render(row: list[str]) -> str:
|
|
717
|
+
cells = [
|
|
718
|
+
str(value).replace("\\", "\\\\").replace("|", "\\|").replace("\r", " ").replace("\n", "<br>")
|
|
719
|
+
for value in row
|
|
720
|
+
]
|
|
721
|
+
return f"| {' | '.join(cells)} |"
|
|
722
|
+
|
|
723
|
+
return "\n".join(
|
|
724
|
+
[
|
|
725
|
+
render(header),
|
|
726
|
+
f"| {' | '.join(['---'] * width)} |",
|
|
727
|
+
*(render(row) for row in body),
|
|
728
|
+
]
|
|
729
|
+
)
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
def _parse_image(
    file_name: str,
    media_type: str,
    file_bytes: bytes,
    *,
    describe_images: bool,
    image_describer: ImageDescriber,
) -> ParsedDocument:
    """Build a one-image ``ParsedDocument`` for a directly uploaded image.

    When ``describe_images`` is true the describer is asked for a perception
    of the image; a truthy result supplies the description and switches the
    parser label to ``openrouter_vlm``. Otherwise, or when no description
    came back, a canned fallback sentence is used. The uploaded image is
    always marked ``include_in_graph=True``.
    """
    perception = (
        image_describer.describe(
            file_bytes,
            media_type=media_type,
            file_name=file_name,
            default_include=True,
        )
        if describe_images
        else None
    )

    if perception:
        parser_name = "openrouter_vlm"
        caption = perception.description
        vlm_include = perception.include_in_graph
        vlm_kind = perception.image_kind
        vlm_reason = perception.reason
    else:
        parser_name = "image_metadata"
        caption = None
        vlm_include = True
        vlm_kind = "unknown"
        vlm_reason = ""

    if not caption:
        caption = _image_description_fallback(
            describe_images=describe_images,
            has_api_key=bool(image_describer.api_key),
        )

    # Fall back to the extension when the caller supplied no media type.
    resolved_media_type = media_type or _guess_media_type(Path(file_name).suffix.lower())
    payload_size = len(file_bytes)

    markdown = (
        f"# {Path(file_name).stem}\n\n"
        f"Image upload: `{file_name}`.\n\n"
        f"{caption}\n"
    )

    upload = ParsedImage(
        index=0,
        file_name=file_name,
        media_type=resolved_media_type,
        description=caption,
        include_in_graph=True,
        image_kind=vlm_kind,
        reason=vlm_reason,
        size_bytes=payload_size,
        source="upload",
        metadata={
            "parser": parser_name,
            "vlm_include_in_graph": vlm_include,
            "vlm_image_kind": vlm_kind,
            "vlm_reason": vlm_reason,
        },
        raw_bytes=file_bytes,
    )

    document_metadata = {
        "size_bytes": payload_size,
        "parser": parser_name,
        "visual_description": caption,
        "image_count": 1,
        "graph_image_count": 1,
        "skipped_image_count": 0,
        "vlm_include_in_graph": vlm_include,
        "vlm_image_kind": vlm_kind,
        "vlm_reason": vlm_reason,
    }
    return _parsed_from_markdown(
        file_name=file_name,
        media_type=resolved_media_type,
        source_type="image",
        markdown=markdown,
        metadata=document_metadata,
        images=[upload],
    )
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
def _parsed_from_markdown(
    *,
    file_name: str,
    media_type: str,
    source_type: str,
    markdown: str,
    page_count: int = 0,
    unreadable_pages: list[int] | None = None,
    metadata: dict[str, Any] | None = None,
    images: list[ParsedImage] | None = None,
) -> ParsedDocument:
    """Split ``markdown`` into chunks and wrap everything in a ``ParsedDocument``.

    Optional collections that were omitted (or are empty) are replaced with
    fresh empty containers.

    Raises:
        DocumentParseError: if chunking the markdown yields no chunks.
    """
    chunks = _chunk_blocks(_markdown_blocks(markdown))
    if not chunks:
        raise DocumentParseError("No semantic chunks were produced from uploaded file")

    return ParsedDocument(
        file_name=file_name,
        media_type=media_type,
        source_type=source_type,
        markdown=markdown,
        chunks=chunks,
        page_count=page_count,
        unreadable_pages=unreadable_pages if unreadable_pages else [],
        source_metadata=metadata if metadata else {},
        images=images if images else [],
    )
|
|
826
|
+
|
|
827
|
+
|
|
828
|
+
def _markdown_blocks(markdown: str) -> list[DocumentBlock]:
    """Split markdown text into heading and paragraph blocks with char offsets.

    Consecutive non-blank, non-heading lines are merged into one paragraph
    block, joined with single spaces. Blank lines and headings end the
    current paragraph. A heading whose text is ``Page <n>`` (case-insensitive)
    updates the page number attached to that heading and to later blocks.

    Offsets are measured in characters of ``markdown``. Lines are split with
    their terminators kept so a two-character ``\\r\\n`` advances the offset
    by two, and a final line without a terminator does not push the end
    offset past ``len(markdown)``.
    """
    blocks: list[DocumentBlock] = []
    paragraph: list[str] = []
    offset = 0
    paragraph_start = 0
    current_page = 0

    def flush_paragraph(end_offset: int) -> None:
        nonlocal paragraph, paragraph_start
        if not paragraph:
            return
        text = " ".join(part.strip() for part in paragraph if part.strip()).strip()
        if text:
            blocks.append(
                DocumentBlock(
                    text=text,
                    page_number=current_page,
                    char_start=paragraph_start,
                    char_end=end_offset,
                )
            )
        paragraph = []
        paragraph_start = end_offset

    # keepends=True makes len(raw_line) include the real line terminator, so
    # offsets stay aligned with the source text for "\n", "\r\n" and the other
    # boundaries str.splitlines recognises.
    for raw_line in markdown.splitlines(keepends=True):
        line_start = offset
        offset += len(raw_line)
        stripped = raw_line.strip()

        if not stripped:
            flush_paragraph(line_start)
            paragraph_start = offset
            continue

        heading = _HEADING_RE.match(stripped)
        if heading:
            flush_paragraph(line_start)
            level = len(heading.group(1))
            text = heading.group(2).strip()
            page_match = re.match(r"Page\s+(\d+)$", text, flags=re.IGNORECASE)
            if page_match:
                current_page = int(page_match.group(1))
            blocks.append(
                DocumentBlock(
                    text=f"{'#' * level} {text}",
                    page_number=current_page,
                    kind="heading",
                    heading_level=level,
                    char_start=line_start,
                    char_end=offset,
                )
            )
            paragraph_start = offset
            continue

        if not paragraph:
            paragraph_start = line_start
        # Terminators are removed again when the paragraph text is assembled.
        paragraph.append(raw_line)

    flush_paragraph(offset)
    return blocks
|
|
890
|
+
|
|
891
|
+
|
|
892
|
+
def _chunk_blocks(
    blocks: list[DocumentBlock],
    *,
    target_chars: int = DEFAULT_TARGET_CHARS,
    max_chars: int = DEFAULT_MAX_CHARS,
) -> list[ParsedChunk]:
    """Group consecutive blocks into chunks of roughly ``target_chars`` size.

    A chunk is closed when:
      * a heading arrives and the pending chunk already holds at least a
        third of ``target_chars``;
      * adding the next block would push the pending chunk past ``max_chars``;
      * the pending chunk reaches ``target_chars`` and its last block is not
        a heading (so a heading is never left dangling at the end of a chunk
        by this rule).

    Chunk length counts two characters for the blank-line separator between
    adjacent blocks.
    """
    chunks: list[ParsedChunk] = []
    pending: list[tuple[int, DocumentBlock]] = []

    def pending_size() -> int:
        separators = 2 * max(0, len(pending) - 1)
        return sum(len(item.text) for _, item in pending) + separators

    def emit() -> None:
        if not pending:
            return
        first_index, first_block = pending[0]
        last_index, last_block = pending[-1]
        # Page 0 means "no page information" and is ignored for the range.
        numbered_pages = [item.page_number for _, item in pending if item.page_number > 0]
        if numbered_pages:
            page_span = (min(numbered_pages), max(numbered_pages))
        else:
            page_span = (0, 0)
        body = "\n\n".join([item.text for _, item in pending]).strip()
        chunks.append(
            ParsedChunk(
                index=len(chunks),
                text=body,
                page_range=page_span,
                element_range=(first_index, last_index),
                char_range=(first_block.char_start, last_block.char_end),
            )
        )
        pending.clear()

    heading_threshold = target_chars // 3
    for position, block in enumerate(blocks):
        is_heading = block.kind == "heading"
        if is_heading and pending and pending_size() >= heading_threshold:
            emit()
        if pending and pending_size() + len(block.text) + 2 > max_chars:
            emit()
        pending.append((position, block))
        if not is_heading and pending_size() >= target_chars:
            emit()

    emit()
    return chunks
|
|
934
|
+
|
|
935
|
+
|
|
936
|
+
def _extract_pdf_images(
    file_name: str,
    file_bytes: bytes,
    *,
    unreadable_pages: list[int],
    describe_images: bool,
    image_describer: ImageDescriber,
) -> list[ParsedImage]:
    """Collect embedded images and rendered image-only pages from a PDF.

    Two passes run over the opened document:
      1. every embedded image (deduplicated by xref) is extracted;
      2. each page listed in ``unreadable_pages`` that is in range and
         contributed no embedded image is rendered to PNG at 2x scale.

    When ``describe_images`` is true each image is passed to the describer;
    otherwise, or when no description comes back, a fallback sentence is
    used. Returns an empty list if ``fitz`` cannot be imported or the PDF
    cannot be opened.
    """
    try:
        import fitz
    except Exception:
        return []

    try:
        document = fitz.open(stream=file_bytes, filetype="pdf")
    except Exception:
        return []

    collected: list[ParsedImage] = []
    visited_xrefs: set[int] = set()
    image_pages: set[int] = set()

    def perceive(payload, *, media_type, name, default_include, base_parser, fallback_intro):
        """Return ``(perception, description, parser)`` for one image payload."""
        perception = None
        description = None
        parser = base_parser
        if describe_images:
            perception = image_describer.describe(
                payload,
                media_type=media_type,
                file_name=name,
                default_include=default_include,
            )
            if perception:
                description = perception.description
                parser = "openrouter_vlm"
        if not description:
            fallback = _image_description_fallback(
                describe_images=describe_images,
                has_api_key=bool(image_describer.api_key),
            )
            description = f"{fallback_intro}{fallback}"
        return perception, description, parser

    try:
        # Pass 1: embedded images.
        for page_index, page in enumerate(document, start=1):
            for image_info in page.get_images(full=True):
                xref = int(image_info[0])
                if xref in visited_xrefs:
                    continue
                visited_xrefs.add(xref)

                extracted = document.extract_image(xref)
                payload = extracted.get("image")
                if not (isinstance(payload, bytes) and payload):
                    continue

                ext = str(extracted.get("ext") or "png").lower()
                payload_media_type = _guess_media_type(f".{ext}")
                payload_name = f"{Path(file_name).stem}-page-{page_index}-image-{len(collected) + 1}.{ext}"
                perception, description, parser = perceive(
                    payload,
                    media_type=payload_media_type,
                    name=payload_name,
                    default_include=False,
                    base_parser="pdf_image_metadata",
                    fallback_intro=f"Extracted image from `{file_name}` on page {page_index}. ",
                )
                image_pages.add(page_index)

                if perception:
                    include = perception.include_in_graph
                    vlm_include = perception.include_in_graph
                    kind = perception.image_kind
                    reason = perception.reason
                else:
                    # Without a perception, embedded images are included only
                    # when description was not requested at all.
                    include = not describe_images
                    vlm_include = None
                    kind = "unknown"
                    reason = ""

                collected.append(
                    ParsedImage(
                        index=len(collected),
                        file_name=payload_name,
                        media_type=payload_media_type,
                        description=description,
                        include_in_graph=include,
                        image_kind=kind,
                        reason=reason,
                        page_number=page_index,
                        size_bytes=len(payload),
                        source="pdf",
                        metadata={
                            "parser": parser,
                            "pdf_file_name": file_name,
                            "xref": xref,
                            "vlm_include_in_graph": vlm_include,
                            "vlm_image_kind": kind,
                            "vlm_reason": reason,
                        },
                        raw_bytes=payload,
                    )
                )

        # Pass 2: render unreadable pages that yielded no embedded image.
        for page_number in unreadable_pages:
            out_of_range = page_number < 1 or page_number > document.page_count
            if page_number in image_pages or out_of_range:
                continue

            page = document[page_number - 1]
            pixmap = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            payload = pixmap.tobytes("png")
            payload_name = f"{Path(file_name).stem}-page-{page_number}.png"
            perception, description, parser = perceive(
                payload,
                media_type="image/png",
                name=payload_name,
                default_include=True,
                base_parser="pdf_page_render",
                fallback_intro=f"Rendered image-only page from `{file_name}` page {page_number}. ",
            )

            if perception:
                include = perception.include_in_graph
                vlm_include = perception.include_in_graph
                kind = perception.image_kind
                reason = perception.reason
            else:
                include = True
                vlm_include = None
                kind = "document_page"
                reason = ""

            collected.append(
                ParsedImage(
                    index=len(collected),
                    file_name=payload_name,
                    media_type="image/png",
                    description=description,
                    include_in_graph=include,
                    image_kind=kind,
                    reason=reason,
                    page_number=page_number,
                    size_bytes=len(payload),
                    source="pdf_page_render",
                    metadata={
                        "parser": parser,
                        "pdf_file_name": file_name,
                        "vlm_include_in_graph": vlm_include,
                        "vlm_image_kind": kind,
                        "vlm_reason": reason,
                    },
                    raw_bytes=payload,
                )
            )
    finally:
        document.close()
    return collected
|
|
1061
|
+
|
|
1062
|
+
|
|
1063
|
+
def _image_data_url(image_bytes: bytes, media_type: str) -> str:
|
|
1064
|
+
encoded = base64.b64encode(image_bytes).decode("ascii")
|
|
1065
|
+
return f"data:{media_type or 'image/png'};base64,{encoded}"
|
|
1066
|
+
|
|
1067
|
+
|
|
1068
|
+
def _image_description_fallback(*, describe_images: bool, has_api_key: bool) -> str:
|
|
1069
|
+
if not describe_images:
|
|
1070
|
+
return "Visual description was skipped because image VLM description is disabled."
|
|
1071
|
+
if not has_api_key:
|
|
1072
|
+
return "Visual description was skipped because OPENROUTER_API_KEY is not configured."
|
|
1073
|
+
return "Visual description was skipped because the image VLM did not return text."
|
|
1074
|
+
|
|
1075
|
+
|
|
1076
|
+
def _parse_image_perception(content: str, *, default_include: bool) -> ImagePerception | None:
    """Turn a describer response into an ``ImagePerception``.

    Returns ``None`` for empty content. Content that does not parse as a JSON
    object is used verbatim as the description with ``default_include``.
    For JSON objects, a missing or blank ``description`` falls back to the
    raw content, a non-boolean ``include_in_graph`` falls back to
    ``default_include``, and a blank ``image_kind`` becomes ``"unknown"``.
    """
    if not content:
        return None

    payload = _loads_json_object(content)
    if not isinstance(payload, dict):
        return ImagePerception(description=content, include_in_graph=default_include)

    described = str(payload.get("description") or "").strip() or content

    flag = payload.get("include_in_graph", default_include)
    if not isinstance(flag, bool):
        # Only real booleans are trusted; anything else uses the default.
        flag = default_include

    kind = str(payload.get("image_kind") or "unknown").strip() or "unknown"
    why = str(payload.get("reason") or "").strip()
    return ImagePerception(
        description=described,
        include_in_graph=flag,
        image_kind=kind,
        reason=why,
    )
|
|
1095
|
+
|
|
1096
|
+
|
|
1097
|
+
def _loads_json_object(content: str) -> dict[str, Any] | None:
|
|
1098
|
+
text = content.strip()
|
|
1099
|
+
fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.DOTALL)
|
|
1100
|
+
if fenced:
|
|
1101
|
+
text = fenced.group(1).strip()
|
|
1102
|
+
if not text.startswith("{"):
|
|
1103
|
+
start = text.find("{")
|
|
1104
|
+
end = text.rfind("}")
|
|
1105
|
+
if start >= 0 and end > start:
|
|
1106
|
+
text = text[start : end + 1]
|
|
1107
|
+
try:
|
|
1108
|
+
data = json.loads(text)
|
|
1109
|
+
except ValueError:
|
|
1110
|
+
return None
|
|
1111
|
+
return data if isinstance(data, dict) else None
|
|
1112
|
+
|
|
1113
|
+
|
|
1114
|
+
def _decode_text(file_bytes: bytes, file_name: str) -> str:
|
|
1115
|
+
try:
|
|
1116
|
+
return file_bytes.decode("utf-8")
|
|
1117
|
+
except UnicodeDecodeError:
|
|
1118
|
+
decoded = file_bytes.decode("utf-8", errors="replace")
|
|
1119
|
+
if not decoded.strip():
|
|
1120
|
+
raise DocumentParseError(f"Could not decode text file {file_name}")
|
|
1121
|
+
return decoded
|
|
1122
|
+
|
|
1123
|
+
|
|
1124
|
+
def _guess_media_type(ext: str) -> str:
|
|
1125
|
+
return {
|
|
1126
|
+
".pdf": "application/pdf",
|
|
1127
|
+
".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
1128
|
+
".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
1129
|
+
".xlsm": "application/vnd.ms-excel.sheet.macroEnabled.12",
|
|
1130
|
+
".xls": "application/vnd.ms-excel",
|
|
1131
|
+
".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
1132
|
+
".ppt": "application/vnd.ms-powerpoint",
|
|
1133
|
+
".md": "text/markdown",
|
|
1134
|
+
".markdown": "text/markdown",
|
|
1135
|
+
".txt": "text/plain",
|
|
1136
|
+
".text": "text/plain",
|
|
1137
|
+
".csv": "text/csv",
|
|
1138
|
+
".tsv": "text/tab-separated-values",
|
|
1139
|
+
".json": "application/json",
|
|
1140
|
+
".jsonl": "application/x-ndjson",
|
|
1141
|
+
".yaml": "application/yaml",
|
|
1142
|
+
".yml": "application/yaml",
|
|
1143
|
+
".html": "text/html",
|
|
1144
|
+
".htm": "text/html",
|
|
1145
|
+
".xml": "application/xml",
|
|
1146
|
+
".rtf": "application/rtf",
|
|
1147
|
+
".png": "image/png",
|
|
1148
|
+
".jpg": "image/jpeg",
|
|
1149
|
+
".jpeg": "image/jpeg",
|
|
1150
|
+
".webp": "image/webp",
|
|
1151
|
+
".gif": "image/gif",
|
|
1152
|
+
}.get(ext.lower(), "application/octet-stream")
|