memuron 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. memuron/__init__.py +3 -0
  2. memuron/actions/__init__.py +12 -0
  3. memuron/actions/context.py +63 -0
  4. memuron/actions/helpers.py +88 -0
  5. memuron/actions/memory.py +340 -0
  6. memuron/actions/memory_write.py +290 -0
  7. memuron/actions/nodes.py +340 -0
  8. memuron/actions/registry.py +5 -0
  9. memuron/actions/runtime.py +37 -0
  10. memuron/actions/spaces_documents.py +720 -0
  11. memuron/actions/sync.py +155 -0
  12. memuron/application/__init__.py +1 -0
  13. memuron/application/api.py +206 -0
  14. memuron/application/app.py +103 -0
  15. memuron/application/capabilities.py +82 -0
  16. memuron/application/cli.py +35 -0
  17. memuron/application/config.py +176 -0
  18. memuron/application/mcp.py +44 -0
  19. memuron/application/mcp_oauth.py +290 -0
  20. memuron/application/registry.py +52 -0
  21. memuron/context.py +532 -0
  22. memuron/documents/__init__.py +1 -0
  23. memuron/documents/link_guardian.py +192 -0
  24. memuron/documents/linking.py +292 -0
  25. memuron/documents/parser.py +1152 -0
  26. memuron/documents/storage.py +151 -0
  27. memuron/documents/url_ingest.py +375 -0
  28. memuron/domain/__init__.py +1 -0
  29. memuron/domain/decoders.py +1 -0
  30. memuron/domain/encoders.py +185 -0
  31. memuron/domain/lifecycles.py +8 -0
  32. memuron/domain/limits.py +6 -0
  33. memuron/domain/representations.py +56 -0
  34. memuron/domain/schemas.py +581 -0
  35. memuron/domain/scope_filter.py +104 -0
  36. memuron/graphfs/__init__.py +1 -0
  37. memuron/graphfs/manual.py +635 -0
  38. memuron/graphfs/projection.py +578 -0
  39. memuron/graphfs/query.py +1782 -0
  40. memuron/graphfs/read_model.py +574 -0
  41. memuron/ingest/__init__.py +1 -0
  42. memuron/ingest/guardian.py +213 -0
  43. memuron/ingest/jobs.py +424 -0
  44. memuron/ingest/prompts.py +147 -0
  45. memuron/memory/__init__.py +1 -0
  46. memuron/memory/engine.py +35 -0
  47. memuron/memory/projections.py +452 -0
  48. memuron/memory/recipes.py +3247 -0
  49. memuron/persistence/__init__.py +1 -0
  50. memuron/persistence/db_pool.py +57 -0
  51. memuron/persistence/identity_store.py +918 -0
  52. memuron/persistence/store_helpers.py +16 -0
  53. memuron/search/__init__.py +1 -0
  54. memuron/search/fulltext.py +110 -0
  55. memuron/search/hybrid.py +284 -0
  56. memuron/search/pgvector.py +252 -0
  57. memuron/security/__init__.py +1 -0
  58. memuron/security/auth.py +143 -0
  59. memuron/security/auth_provider.py +119 -0
  60. memuron/security/authorization.py +53 -0
  61. memuron/security/clerk_scopes.py +94 -0
  62. memuron/security/clerk_webhooks.py +61 -0
  63. memuron/security/jwt_tokens.py +53 -0
  64. memuron/security/passwords.py +38 -0
  65. memuron/security/tenant.py +58 -0
  66. memuron/spaces/__init__.py +1 -0
  67. memuron/spaces/model.py +35 -0
  68. memuron/spaces/service.py +155 -0
  69. memuron/sync/__init__.py +25 -0
  70. memuron/sync/folder.py +828 -0
  71. memuron-0.1.1.dist-info/METADATA +242 -0
  72. memuron-0.1.1.dist-info/RECORD +74 -0
  73. memuron-0.1.1.dist-info/WHEEL +4 -0
  74. memuron-0.1.1.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,1152 @@
1
+ """Document parsing helpers for Memuron rich nodes.
2
+
3
+ This module deliberately stops at normalized markdown and deterministic chunks.
4
+ The Memuron recipe layer decides how those chunks become semantic events.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import base64
10
+ import datetime as dt
11
+ import io
12
+ import json
13
+ import re
14
+ import struct
15
+ from dataclasses import dataclass, field
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import requests
20
+
21
+
22
# Hard ceiling on the raw size of a single uploaded source (8 MiB).
MAX_DOCUMENT_UPLOAD_BYTES = 8 * 1024**2
# Workbooks holding more non-empty cells than this are rejected.
MAX_SPREADSHEET_CELLS = 100_000
# Default size bounds (in characters) used by the chunker.
DEFAULT_TARGET_CHARS = 1_600
DEFAULT_MAX_CHARS = 2_800
# Chat-completions endpoint that image descriptions are requested from.
OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"
# Instruction sent alongside every image; it asks for a strict JSON object
# with the keys description, include_in_graph, image_kind and reason.
IMAGE_DESCRIPTION_PROMPT = (
    "Analyze this image for a semantic memory graph. Return strict JSON only with keys: "
    "description, include_in_graph, image_kind, reason. The description should be concrete and "
    "capture visible text, diagrams, entities, relationships, layout, and likely intent. "
    "Set include_in_graph=true for images with durable semantic value, such as diagrams, "
    "mindmaps, charts, screenshots, document pages, UI states, photos of meaningful objects, "
    "or figures that materially explain the document. Set include_in_graph=false for logos, "
    "letterheads, watermarks, decorative graphics, repeated branding, tiny icons, separators, "
    "background textures, and images that add no retrievable meaning. image_kind should be one "
    "of diagram, screenshot, chart, document_page, photo, figure, logo, letterhead, watermark, "
    "decorative, icon, unknown."
)
39
+
40
+
41
class DocumentParseError(ValueError):
    """Signal that an uploaded source could not be turned into useful markdown.

    It derives from ``ValueError``, so handlers for ``ValueError`` also catch it.
    """
43
+
44
+
45
@dataclass(frozen=True)
class DocumentBlock:
    """One structural unit of normalized markdown: a heading or a paragraph."""

    # Block text; heading blocks keep their leading "#" markers.
    text: str
    # Page the block belongs to; 0 until a "# Page N" heading has been seen.
    page_number: int
    # "text" for paragraphs, "heading" for markdown headings.
    kind: str = "text"
    # Heading depth (1-6) for heading blocks, otherwise None.
    heading_level: int | None = None
    # Character offsets of the block inside the markdown it was cut from.
    char_start: int = 0
    char_end: int = 0
53
+
54
+
55
@dataclass(frozen=True)
class ParsedChunk:
    """A retrievable slice of a parsed document plus where it came from."""

    # Position of the chunk within the document's chunk list.
    index: int
    # Chunk text.
    text: str
    # (first, last) page covered by the chunk.
    page_range: tuple[int, int]
    # (first, last) index of the source blocks merged into the chunk.
    element_range: tuple[int, int]
    # (start, end) character offsets in the source markdown.
    char_range: tuple[int, int]
    # Optional bounding boxes; empty by default.
    bboxes: list[dict[str, Any]] = field(default_factory=list)

    def to_location(self) -> dict[str, Any]:
        """Return the location fields as a dict of freshly built lists."""
        location: dict[str, Any] = {}
        for key in ("page_range", "element_range", "char_range", "bboxes"):
            location[key] = list(getattr(self, key))
        return location
71
+
72
+
73
@dataclass(frozen=True)
class ImagePerception:
    """Structured verdict about one image.

    The fields carry the same names as the JSON keys requested by
    ``IMAGE_DESCRIPTION_PROMPT``.
    """

    # Free-text description of the image.
    description: str
    # Whether the image should contribute to the semantic graph.
    include_in_graph: bool = True
    # Category label; "unknown" when none was given.
    image_kind: str = "unknown"
    # Short justification for the include/exclude decision.
    reason: str = ""
79
+
80
+
81
@dataclass(frozen=True)
class ParsedImage:
    """An image found in, or uploaded as, a source document."""

    # Zero-based position of the image within its document.
    index: int
    file_name: str
    media_type: str
    # Text that stands in for the image in the markdown.
    description: str
    # Whether the image's description is folded into the graph content.
    include_in_graph: bool = True
    image_kind: str = "unknown"
    reason: str = ""
    # Page the image was found on; 0 when not applicable.
    page_number: int = 0
    size_bytes: int = 0
    # Origin label; direct uploads use "upload".
    source: str = "upload"
    # Parser- and VLM-specific extras.
    metadata: dict[str, Any] = field(default_factory=dict)
    # Original image bytes when they were retained.
    raw_bytes: bytes | None = None
95
+
96
+
97
@dataclass(frozen=True)
class ParsedDocument:
    """Result of parsing one source: normalized markdown plus its chunks."""

    file_name: str
    media_type: str
    # Coarse source family, e.g. "pdf", "docx", "excel", "powerpoint", "image".
    source_type: str
    # Full normalized markdown the chunks were cut from.
    markdown: str
    chunks: list[ParsedChunk]
    # Pages, sheets or slides, depending on the source; 0 when not tracked.
    page_count: int = 0
    # 1-based numbers of pages that yielded no text.
    unreadable_pages: list[int] = field(default_factory=list)
    # Parser-specific details (parser name, counts, ...).
    source_metadata: dict[str, Any] = field(default_factory=dict)
    images: list[ParsedImage] = field(default_factory=list)
108
+
109
+
110
+ _HEADING_RE = re.compile(r"^(#{1,6})\s+(.*\S)\s*$")
111
+ _IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".gif"}
112
+ _MARKDOWN_EXTS = {".md", ".markdown"}
113
+ _TEXT_EXTS = {
114
+ ".txt",
115
+ ".text",
116
+ ".csv",
117
+ ".tsv",
118
+ ".json",
119
+ ".jsonl",
120
+ ".yaml",
121
+ ".yml",
122
+ ".html",
123
+ ".htm",
124
+ ".xml",
125
+ ".rtf",
126
+ }
127
+ _DOCX_EXTS = {".docx"}
128
+ _EXCEL_EXTS = {".xlsx", ".xlsm"}
129
+ _LEGACY_EXCEL_EXTS = {".xls"}
130
+ _POWERPOINT_EXTS = {".pptx"}
131
+ _LEGACY_POWERPOINT_EXTS = {".ppt"}
132
+
133
+
134
def parse_source(
    *,
    file_name: str,
    content_type: str | None,
    file_bytes: bytes,
    describe_images: bool = False,
    vlm_api_key: str | None = None,
    vlm_model: str = "perceptron/perceptron-mk1",
    vlm_timeout_seconds: int = 60,
) -> ParsedDocument:
    """Turn an uploaded file into markdown and retrievable chunks.

    The parser is picked from the file extension or the declared media type,
    tried in this order: PDF, DOCX, XLSX/XLSM, XLS, PPTX, PPT, Markdown,
    plain-text formats, images.

    Args:
        file_name: Client-supplied name; only its final path component is used.
        content_type: Declared media type; parameters after ``;`` are dropped.
            When missing, a type is guessed from the extension.
        file_bytes: Raw file content.
        describe_images: Forwarded to the PDF and image parsers.
        vlm_api_key: API key for the image describer.
        vlm_model: Model name for the image describer.
        vlm_timeout_seconds: Request timeout for the image describer.

    Returns:
        The parsed document.

    Raises:
        DocumentParseError: For empty or oversized uploads, unsupported
            types, or failures raised by the selected parser.
    """
    if not file_bytes:
        raise DocumentParseError("Uploaded file is empty")
    if len(file_bytes) > MAX_DOCUMENT_UPLOAD_BYTES:
        limit_mb = MAX_DOCUMENT_UPLOAD_BYTES // (1024 * 1024)
        raise DocumentParseError(f"Uploaded file is too large; max is {limit_mb} MB")

    # Strip any directory part so client-supplied paths never leak through.
    safe_name = Path(file_name or "document").name
    ext = Path(safe_name).suffix.lower()
    declared_type = content_type or _guess_media_type(ext)
    media_type = declared_type.split(";", 1)[0].strip()
    mime = media_type.lower()
    image_describer = ImageDescriber(
        api_key=vlm_api_key,
        model=vlm_model,
        timeout_seconds=vlm_timeout_seconds,
    )

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    pptx_mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    excel_mimes = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-excel.sheet.macroenabled.12",
    )
    markdown_mimes = ("text/markdown", "text/x-markdown")

    if ext == ".pdf" or mime == "application/pdf":
        return _parse_pdf(
            safe_name,
            media_type,
            file_bytes,
            describe_images=describe_images,
            image_describer=image_describer,
        )

    if ext in _DOCX_EXTS or mime == docx_mime:
        return _parse_docx(safe_name, media_type, file_bytes)

    if ext in _EXCEL_EXTS or mime in excel_mimes:
        return _parse_excel(safe_name, media_type, file_bytes)

    if ext in _LEGACY_EXCEL_EXTS or mime == "application/vnd.ms-excel":
        return _parse_legacy_excel(safe_name, media_type, file_bytes)

    if ext in _POWERPOINT_EXTS or mime == pptx_mime:
        return _parse_pptx(safe_name, media_type, file_bytes)

    if ext in _LEGACY_POWERPOINT_EXTS or mime == "application/vnd.ms-powerpoint":
        return _parse_legacy_ppt(safe_name, media_type, file_bytes)

    if ext in _MARKDOWN_EXTS or mime in markdown_mimes:
        # Markdown is kept as written; only the surrounding whitespace is normalized.
        markdown = _decode_text(file_bytes, safe_name).strip() + "\n"
        return _parsed_from_markdown(
            file_name=safe_name,
            media_type="text/markdown",
            source_type="markdown",
            markdown=markdown,
            metadata={"char_count": len(markdown)},
        )

    if ext in _TEXT_EXTS or mime.startswith("text/"):
        # Plain text gets a synthetic title heading taken from the file stem.
        text = _decode_text(file_bytes, safe_name).strip()
        markdown = f"# {Path(safe_name).stem}\n\n{text}\n"
        return _parsed_from_markdown(
            file_name=safe_name,
            media_type=media_type or "text/plain",
            source_type="text",
            markdown=markdown,
            metadata={"char_count": len(text)},
        )

    if ext in _IMAGE_EXTS or mime.startswith("image/"):
        return _parse_image(
            safe_name,
            media_type,
            file_bytes,
            describe_images=describe_images,
            image_describer=image_describer,
        )

    raise DocumentParseError(
        "Unsupported file type. Supported: PDF, DOCX, XLS/XLSX/XLSM, PPT/PPTX, Markdown, "
        "common text formats, and PNG/JPEG/WebP/GIF image metadata."
    )
220
+
221
+
222
class ImageDescriber:
    """Client that asks an OpenRouter vision model to describe an image."""

    def __init__(
        self,
        *,
        api_key: str | None,
        model: str,
        timeout_seconds: int,
    ) -> None:
        """Store the connection settings.

        A missing or whitespace-only ``api_key`` is stored as ``""``, which
        makes ``describe`` return ``None`` without sending a request.
        """
        self.api_key = (api_key or "").strip()
        self.model = model
        self.timeout_seconds = timeout_seconds

    def describe(
        self,
        image_bytes: bytes,
        *,
        media_type: str,
        file_name: str,
        default_include: bool = True,
    ) -> ImagePerception | None:
        """Describe one image through the chat-completions endpoint.

        Args:
            image_bytes: Raw image content, sent inline as a data URL.
            media_type: Media type used to build the data URL.
            file_name: Used in error messages only.
            default_include: Passed through to the response parser.

        Returns:
            The parsed perception, or ``None`` when no API key is configured.

        Raises:
            DocumentParseError: If the request fails, the service answers
                with HTTP status >= 400, or the response lacks the expected
                ``choices[0].message.content`` structure.
        """
        if not self.api_key:
            return None

        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": IMAGE_DESCRIPTION_PROMPT},
                {
                    "type": "image_url",
                    "image_url": {"url": _image_data_url(image_bytes, media_type)},
                },
            ],
        }
        payload: dict[str, Any] = {"model": self.model, "messages": [user_message]}
        request_headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        try:
            response = requests.post(
                OPENROUTER_CHAT_URL,
                headers=request_headers,
                json=payload,
                timeout=self.timeout_seconds,
            )
        except requests.RequestException as exc:
            raise DocumentParseError(f"Image VLM request failed for {file_name}: {exc}") from exc

        if response.status_code >= 400:
            raise DocumentParseError(
                f"Image VLM returned HTTP {response.status_code} for {file_name}: {response.text[:500]}"
            )

        try:
            content = response.json()["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            raise DocumentParseError(f"Image VLM returned invalid response for {file_name}") from exc

        if isinstance(content, list):
            # Multi-part answers: keep the non-empty text parts, one per line.
            pieces: list[str] = []
            for item in content:
                if not isinstance(item, dict) or item.get("type") != "text":
                    continue
                piece = str(item.get("text", "")).strip()
                if piece:
                    pieces.append(piece)
            content = "\n".join(pieces)

        raw_answer = str(content or "").strip()
        return _parse_image_perception(raw_answer, default_include=default_include)
291
+
292
+
293
def _parse_pdf(
    file_name: str,
    media_type: str,
    file_bytes: bytes,
    *,
    describe_images: bool,
    image_describer: ImageDescriber,
) -> ParsedDocument:
    """Extract per-page text, and image descriptions, from a PDF.

    Each readable page becomes a ``# Page N`` section. Pages whose extraction
    fails or yields no text are recorded as unreadable. Descriptions of images
    flagged ``include_in_graph`` are appended as their own sections.

    Raises:
        DocumentParseError: If pypdf is unavailable, the file cannot be
            opened, it has no pages, or nothing extractable was found.
    """
    try:
        from pypdf import PdfReader
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("PDF parsing requires pypdf to be installed") from exc

    try:
        reader = PdfReader(io.BytesIO(file_bytes))
    except Exception as exc:
        raise DocumentParseError(f"Could not open PDF: {exc}") from exc

    if not reader.pages:
        raise DocumentParseError("PDF has no pages")

    sections: list[str] = []
    unreadable: list[int] = []
    for page_index, page in enumerate(reader.pages, start=1):
        try:
            page_text = (page.extract_text() or "").strip()
        except Exception:
            # Best effort: a page that fails to extract is treated as empty.
            page_text = ""
        if page_text:
            sections.append(f"# Page {page_index}\n\n{page_text}")
        else:
            unreadable.append(page_index)

    images = _extract_pdf_images(
        file_name,
        file_bytes,
        unreadable_pages=unreadable,
        describe_images=describe_images,
        image_describer=image_describer,
    )
    graph_images = [candidate for candidate in images if candidate.include_in_graph]
    sections.extend(
        f"# Image {image.index + 1} on page {image.page_number}\n\n{image.description}"
        for image in graph_images
    )

    markdown = "\n\n".join(sections).strip() + "\n"
    if not markdown.strip():
        raise DocumentParseError("No extractable text found in PDF")

    total_pages = len(reader.pages)
    source_metadata = {
        "page_count": total_pages,
        "unreadable_pages": unreadable,
        "parser": "pypdf",
        "image_count": len(images),
        "graph_image_count": len(graph_images),
        "skipped_image_count": len(images) - len(graph_images),
    }
    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type or "application/pdf",
        source_type="pdf",
        markdown=markdown,
        page_count=total_pages,
        unreadable_pages=unreadable,
        metadata=source_metadata,
        images=images,
    )
358
+
359
+
360
def _parse_docx(file_name: str, media_type: str, file_bytes: bytes) -> ParsedDocument:
    """Convert a DOCX body into markdown with headings, tables and page markers.

    Body children are walked in document order. Paragraphs styled
    ``Heading 1``..``Heading 6`` become markdown headings, tables become
    markdown tables, and every page break adds a ``# Page N`` marker.

    A paragraph flagged ``page_break_before`` starts on the new page, so its
    marker is written *before* the paragraph text. Breaks carried inside the
    paragraph's runs are written after the text.

    Raises:
        DocumentParseError: If python-docx is unavailable, the file cannot be
            opened, or it has neither text paragraphs nor tables.
    """
    try:
        from docx import Document
        from docx.table import Table
        from docx.text.paragraph import Paragraph
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("DOCX parsing requires python-docx to be installed") from exc

    try:
        document = Document(io.BytesIO(file_bytes))
    except Exception as exc:
        raise DocumentParseError(f"Could not open DOCX: {exc}") from exc

    parts: list[str] = [f"# {Path(file_name).stem}"]
    page_number = 1
    paragraph_count = 0
    table_count = 0

    for child in document.element.body.iterchildren():
        if child.tag.endswith("}p"):
            paragraph = Paragraph(child, document)

            # _docx_page_break_count includes the page_break_before flag;
            # split it out so that break can be emitted ahead of the text.
            leading_break = bool(paragraph.paragraph_format.page_break_before)
            trailing_breaks = max(_docx_page_break_count(paragraph) - int(leading_break), 0)

            if leading_break:
                page_number += 1
                parts.append(f"# Page {page_number}")

            text = paragraph.text.strip()
            if text:
                paragraph_count += 1
                style_name = str(getattr(paragraph.style, "name", "") or "")
                heading_match = re.match(r"Heading\s+([1-6])$", style_name, flags=re.IGNORECASE)
                if heading_match:
                    parts.append(f"{'#' * int(heading_match.group(1))} {text}")
                else:
                    parts.append(text)

            for _ in range(trailing_breaks):
                page_number += 1
                parts.append(f"# Page {page_number}")
            continue

        if child.tag.endswith("}tbl"):
            table = Table(child, document)
            rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]
            rendered = _rows_to_markdown_table(rows)
            if rendered:
                table_count += 1
                parts.append(rendered)

    markdown = "\n\n".join(part for part in parts if part.strip()).strip() + "\n"
    if paragraph_count == 0 and table_count == 0:
        raise DocumentParseError("No extractable text or tables found in DOCX")

    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type
        or "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        source_type="docx",
        markdown=markdown,
        page_count=page_number,
        metadata={
            "parser": "python-docx",
            "paragraph_count": paragraph_count,
            "table_count": table_count,
            "page_count": page_number,
        },
    )
425
+
426
+
427
+ def _docx_page_break_count(paragraph: Any) -> int:
428
+ namespace = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
429
+ count = 1 if bool(paragraph.paragraph_format.page_break_before) else 0
430
+ for run in paragraph.runs:
431
+ count += sum(
432
+ 1
433
+ for break_element in run._r.findall(f".//{namespace}br")
434
+ if break_element.get(f"{namespace}type") == "page"
435
+ )
436
+ return count
437
+
438
+
439
def _parse_excel(file_name: str, media_type: str, file_bytes: bytes) -> ParsedDocument:
    """Read an XLSX/XLSM workbook into per-sheet rows of cell text.

    The workbook is opened read-only with cached formula results
    (``data_only=True``) and without external links. Trailing empty cells and
    fully empty rows are dropped.

    The non-empty cell limit is enforced while rows are streamed, so an
    oversized workbook is rejected before all of it is held in memory. The
    exception and message match the check in ``_parsed_from_spreadsheet``.

    Raises:
        DocumentParseError: If openpyxl is unavailable, the workbook cannot
            be opened, it exceeds ``MAX_SPREADSHEET_CELLS`` non-empty cells,
            or it contains no non-empty cells.
    """
    try:
        from openpyxl import load_workbook
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("Excel parsing requires openpyxl to be installed") from exc

    try:
        workbook = load_workbook(
            io.BytesIO(file_bytes),
            read_only=True,
            data_only=True,
            keep_links=False,
        )
    except Exception as exc:
        raise DocumentParseError(f"Could not open Excel workbook: {exc}") from exc

    sheets: list[tuple[str, list[list[str]]]] = []
    total_cells = 0
    try:
        for worksheet in workbook.worksheets:
            rows: list[list[str]] = []
            for row in worksheet.iter_rows(values_only=True):
                values = [_excel_cell_text(value) for value in row]
                while values and not values[-1]:
                    values.pop()
                # After trimming, a non-empty list always ends in a non-empty cell.
                if not values:
                    continue
                total_cells += sum(1 for value in values if value)
                if total_cells > MAX_SPREADSHEET_CELLS:
                    raise DocumentParseError(
                        f"Excel workbook exceeds the {MAX_SPREADSHEET_CELLS:,} non-empty cell limit"
                    )
                rows.append(values)
            sheets.append((worksheet.title, rows))
    finally:
        # Read-only workbooks keep the source open until closed explicitly.
        workbook.close()

    return _parsed_from_spreadsheet(
        file_name=file_name,
        media_type=media_type
        or "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        sheets=sheets,
        parser="openpyxl",
    )
477
+
478
+
479
def _parse_legacy_excel(file_name: str, media_type: str, file_bytes: bytes) -> ParsedDocument:
    """Read a legacy ``.xls`` workbook into per-sheet rows of cell text.

    Every cell of every sheet is converted with ``_excel_cell_text``.
    Trimming and the cell limit are left to ``_parsed_from_spreadsheet``.

    Raises:
        DocumentParseError: If xlrd is unavailable, the workbook cannot be
            opened, or the spreadsheet step rejects the content.
    """
    try:
        import xlrd
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("Legacy Excel parsing requires xlrd to be installed") from exc

    try:
        workbook = xlrd.open_workbook(file_contents=file_bytes, on_demand=True)
    except Exception as exc:
        raise DocumentParseError(f"Could not open legacy Excel workbook: {exc}") from exc

    sheets: list[tuple[str, list[list[str]]]] = []
    try:
        for worksheet in workbook.sheets():
            grid: list[list[str]] = []
            for row_index in range(worksheet.nrows):
                grid.append(
                    [
                        _excel_cell_text(worksheet.cell_value(row_index, column_index))
                        for column_index in range(worksheet.ncols)
                    ]
                )
            sheets.append((worksheet.name, grid))
    finally:
        workbook.release_resources()

    resolved_media_type = media_type or "application/vnd.ms-excel"
    return _parsed_from_spreadsheet(
        file_name=file_name,
        media_type=resolved_media_type,
        sheets=sheets,
        parser="xlrd",
    )
507
+
508
+
509
def _parsed_from_spreadsheet(
    *,
    file_name: str,
    media_type: str,
    sheets: list[tuple[str, list[list[str]]]],
    parser: str,
) -> ParsedDocument:
    """Render workbook sheets as markdown tables and build the parsed document.

    Each sheet that keeps at least one row becomes a ``## Sheet: <name>``
    section holding one markdown table. Trailing empty cells and empty rows
    are removed, and the remaining rows are padded to a common width.

    Args:
        file_name: Source name; its stem becomes the document title.
        media_type: Media type recorded on the result.
        sheets: ``(sheet name, rows of cell text)`` pairs.
        parser: Parser label stored in the metadata.

    Raises:
        DocumentParseError: If the workbook holds more than
            ``MAX_SPREADSHEET_CELLS`` non-empty cells, or none at all.
    """
    sections: list[str] = [f"# {Path(file_name).stem}"]
    sheet_metadata: list[dict[str, Any]] = []
    total_cells = 0

    for sheet_name, raw_rows in sheets:
        kept_rows: list[list[str]] = []
        sheet_cells = 0
        for raw_row in raw_rows:
            cells = list(raw_row)
            while cells and not cells[-1]:
                cells.pop()
            # Once trimmed, a row is either empty or ends in a non-empty cell.
            if not cells:
                continue
            filled = len([cell for cell in cells if cell])
            sheet_cells += filled
            total_cells += filled
            if total_cells > MAX_SPREADSHEET_CELLS:
                raise DocumentParseError(
                    f"Excel workbook exceeds the {MAX_SPREADSHEET_CELLS:,} non-empty cell limit"
                )
            kept_rows.append(cells)

        if not kept_rows:
            continue

        width = max(map(len, kept_rows))
        padded_rows = [cells + [""] * (width - len(cells)) for cells in kept_rows]
        sections.append(f"## Sheet: {sheet_name}")
        sections.append(_rows_to_markdown_table(padded_rows))
        sheet_metadata.append(
            {
                "name": sheet_name,
                "row_count": len(padded_rows),
                "column_count": width,
                "non_empty_cell_count": sheet_cells,
            }
        )

    if not sheet_metadata:
        raise DocumentParseError("No non-empty cells found in Excel workbook")

    used_sheets = len(sheet_metadata)
    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type,
        source_type="excel",
        markdown="\n\n".join(sections).strip() + "\n",
        page_count=used_sheets,
        metadata={
            "parser": parser,
            "sheet_count": used_sheets,
            "non_empty_cell_count": total_cells,
            "sheets": sheet_metadata,
        },
    )
566
+
567
+
568
def _parse_pptx(file_name: str, media_type: str, file_bytes: bytes) -> ParsedDocument:
    """Convert a PPTX deck into markdown with one ``# Slide N`` section per slide.

    Text frames, tables and speaker notes are collected from each slide.
    Slides without extractable content get no section but are still listed in
    the ``slides`` metadata.

    A notes slide without a notes placeholder has no text frame. That case is
    treated as "no notes" instead of failing with ``AttributeError``.

    Raises:
        DocumentParseError: If python-pptx is unavailable, the file cannot be
            opened, or no slide yields text or tables.
    """
    try:
        from pptx import Presentation
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("PPTX parsing requires python-pptx to be installed") from exc

    try:
        presentation = Presentation(io.BytesIO(file_bytes))
    except Exception as exc:
        raise DocumentParseError(f"Could not open PPTX: {exc}") from exc

    parts: list[str] = [f"# {Path(file_name).stem}"]
    slide_metadata: list[dict[str, Any]] = []
    for slide_number, slide in enumerate(presentation.slides, start=1):
        slide_parts: list[str] = []
        table_count = 0
        for shape in slide.shapes:
            if getattr(shape, "has_text_frame", False):
                text = "\n".join(
                    paragraph.text.strip()
                    for paragraph in shape.text_frame.paragraphs
                    if paragraph.text.strip()
                )
                if text:
                    slide_parts.append(text)
            if getattr(shape, "has_table", False):
                rows = [[cell.text.strip() for cell in row.cells] for row in shape.table.rows]
                rendered = _rows_to_markdown_table(rows)
                if rendered:
                    table_count += 1
                    slide_parts.append(rendered)

        notes_text = ""
        # Check has_notes_slide first: reading notes_slide would create one.
        if slide.has_notes_slide:
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame is not None:
                notes_text = "\n".join(
                    paragraph.text.strip()
                    for paragraph in notes_frame.paragraphs
                    if paragraph.text.strip()
                )
        if notes_text:
            slide_parts.append(f"### Speaker notes\n\n{notes_text}")

        if slide_parts:
            parts.append(f"# Slide {slide_number}\n\n" + "\n\n".join(slide_parts))
        slide_metadata.append(
            {
                "slide_number": slide_number,
                "table_count": table_count,
                "has_notes": bool(notes_text),
            }
        )

    # Only the title heading is present when no slide contributed content.
    if len(parts) == 1:
        raise DocumentParseError("No extractable text or tables found in PPTX")

    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type
        or "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        source_type="powerpoint",
        markdown="\n\n".join(parts).strip() + "\n",
        page_count=len(presentation.slides),
        metadata={
            "parser": "python-pptx",
            "slide_count": len(presentation.slides),
            "slides": slide_metadata,
        },
    )
634
+
635
+
636
def _parse_legacy_ppt(file_name: str, media_type: str, file_bytes: bytes) -> ParsedDocument:
    """Extract text fragments from a legacy ``.ppt`` (OLE compound) file.

    The ``PowerPoint Document`` stream is read in full and scanned for text
    records. The fragments become paragraphs under a title heading taken from
    the file stem.

    Raises:
        DocumentParseError: If olefile is unavailable, the container cannot
            be opened, the stream is missing, or no text was found.
    """
    try:
        import olefile
    except Exception as exc:  # pragma: no cover - dependency sanity guard
        raise DocumentParseError("Legacy PPT parsing requires olefile to be installed") from exc

    try:
        ole = olefile.OleFileIO(io.BytesIO(file_bytes))
    except Exception as exc:
        raise DocumentParseError(f"Could not open legacy PPT: {exc}") from exc

    stream_name = "PowerPoint Document"
    try:
        if not ole.exists(stream_name):
            raise DocumentParseError("Legacy PPT does not contain a PowerPoint Document stream")
        stream = ole.openstream(stream_name).read()
    finally:
        ole.close()

    fragments = _legacy_ppt_text_fragments(stream)
    if not fragments:
        raise DocumentParseError("No extractable text found in legacy PPT")

    title = f"# {Path(file_name).stem}"
    markdown = "\n\n".join([title, *fragments]) + "\n"
    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type or "application/vnd.ms-powerpoint",
        source_type="powerpoint",
        markdown=markdown,
        metadata={"parser": "olefile", "text_fragment_count": len(fragments)},
    )
664
+
665
+
666
+ def _legacy_ppt_text_fragments(stream: bytes) -> list[str]:
667
+ fragments: list[str] = []
668
+
669
+ def walk(start: int, end: int) -> None:
670
+ offset = start
671
+ while offset + 8 <= end:
672
+ options, record_type, length = struct.unpack_from("<HHI", stream, offset)
673
+ payload_start = offset + 8
674
+ payload_end = payload_start + length
675
+ if payload_end > end:
676
+ break
677
+ record_version = options & 0xF
678
+ if record_version == 0xF:
679
+ walk(payload_start, payload_end)
680
+ elif record_type == 4000:
681
+ _append_text_fragment(fragments, stream[payload_start:payload_end].decode("utf-16le", errors="ignore"))
682
+ elif record_type == 4008:
683
+ _append_text_fragment(fragments, stream[payload_start:payload_end].decode("cp1252", errors="ignore"))
684
+ offset = payload_end
685
+
686
+ walk(0, len(stream))
687
+ return fragments
688
+
689
+
690
+ def _append_text_fragment(fragments: list[str], text: str) -> None:
691
+ normalized = re.sub(r"\s+", " ", text).strip("\x00 \t\r\n")
692
+ if normalized and (not fragments or fragments[-1] != normalized):
693
+ fragments.append(normalized)
694
+
695
+
696
+ def _excel_cell_text(value: Any) -> str:
697
+ if value is None:
698
+ return ""
699
+ if isinstance(value, (dt.datetime, dt.date, dt.time)):
700
+ return value.isoformat()
701
+ if isinstance(value, bool):
702
+ return "TRUE" if value else "FALSE"
703
+ return str(value).strip()
704
+
705
+
706
+ def _rows_to_markdown_table(rows: list[list[str]]) -> str:
707
+ if not rows:
708
+ return ""
709
+ width = max((len(row) for row in rows), default=0)
710
+ if width == 0:
711
+ return ""
712
+ normalized = [row + [""] * (width - len(row)) for row in rows]
713
+ header = normalized[0]
714
+ body = normalized[1:]
715
+
716
+ def render(row: list[str]) -> str:
717
+ cells = [
718
+ str(value).replace("\\", "\\\\").replace("|", "\\|").replace("\r", " ").replace("\n", "<br>")
719
+ for value in row
720
+ ]
721
+ return f"| {' | '.join(cells)} |"
722
+
723
+ return "\n".join(
724
+ [
725
+ render(header),
726
+ f"| {' | '.join(['---'] * width)} |",
727
+ *(render(row) for row in body),
728
+ ]
729
+ )
730
+
731
+
732
def _parse_image(
    file_name: str,
    media_type: str,
    file_bytes: bytes,
    *,
    describe_images: bool,
    image_describer: ImageDescriber,
) -> ParsedDocument:
    """Wrap a directly uploaded image in a small markdown document.

    When ``describe_images`` is set and the describer returns a perception,
    its description is used and the parser is recorded as ``openrouter_vlm``.
    Otherwise a fallback description is used and the parser stays
    ``image_metadata``. The image is always marked ``include_in_graph=True``;
    the model's own verdict is kept in the metadata only.
    """
    perception = None
    description = None
    parser = "image_metadata"
    if describe_images:
        perception = image_describer.describe(
            file_bytes,
            media_type=media_type,
            file_name=file_name,
            default_include=True,
        )
    if perception:
        description = perception.description
        parser = "openrouter_vlm"
    if not description:
        description = _image_description_fallback(
            describe_images=describe_images,
            has_api_key=bool(image_describer.api_key),
        )

    if perception:
        image_kind = perception.image_kind
        reason = perception.reason
        vlm_include = perception.include_in_graph
    else:
        image_kind = "unknown"
        reason = ""
        vlm_include = True
    vlm_fields = {
        "vlm_include_in_graph": vlm_include,
        "vlm_image_kind": image_kind,
        "vlm_reason": reason,
    }

    title = Path(file_name).stem
    markdown = f"# {title}\n\nImage upload: `{file_name}`.\n\n{description}\n"

    image = ParsedImage(
        index=0,
        file_name=file_name,
        media_type=media_type or _guess_media_type(Path(file_name).suffix.lower()),
        description=description,
        include_in_graph=True,
        image_kind=image_kind,
        reason=reason,
        size_bytes=len(file_bytes),
        source="upload",
        metadata={"parser": parser, **vlm_fields},
        raw_bytes=file_bytes,
    )
    return _parsed_from_markdown(
        file_name=file_name,
        media_type=media_type or _guess_media_type(Path(file_name).suffix.lower()),
        source_type="image",
        markdown=markdown,
        metadata={
            "size_bytes": len(file_bytes),
            "parser": parser,
            "visual_description": description,
            "image_count": 1,
            "graph_image_count": 1,
            "skipped_image_count": 0,
            **vlm_fields,
        },
        images=[image],
    )
798
+
799
+
800
def _parsed_from_markdown(
    *,
    file_name: str,
    media_type: str,
    source_type: str,
    markdown: str,
    page_count: int = 0,
    unreadable_pages: list[int] | None = None,
    metadata: dict[str, Any] | None = None,
    images: list[ParsedImage] | None = None,
) -> ParsedDocument:
    """Split markdown into blocks, chunk them, and assemble the parsed document.

    Falsy ``unreadable_pages``, ``metadata`` and ``images`` are replaced by
    fresh empty containers.

    Raises:
        DocumentParseError: If chunking produces no chunks.
    """
    chunks = _chunk_blocks(_markdown_blocks(markdown))
    if not chunks:
        raise DocumentParseError("No semantic chunks were produced from uploaded file")

    return ParsedDocument(
        file_name=file_name,
        media_type=media_type,
        source_type=source_type,
        markdown=markdown,
        chunks=chunks,
        page_count=page_count,
        unreadable_pages=unreadable_pages if unreadable_pages else [],
        source_metadata=metadata if metadata else {},
        images=images if images else [],
    )
826
+
827
+
828
def _markdown_blocks(markdown: str) -> list[DocumentBlock]:
    """Split markdown into heading and paragraph blocks with source offsets.

    Blank lines end paragraphs, and a paragraph's lines are joined with single
    spaces. Every ATX heading becomes its own block. A heading of the form
    ``Page N`` also sets the page number given to it and to later blocks.

    Offsets are advanced by each line's real length including its terminator.
    ``char_start``/``char_end`` therefore stay exact for ``\\r\\n`` input and
    for text without a trailing newline.

    Args:
        markdown: Normalized markdown text.

    Returns:
        Blocks in document order.
    """
    blocks: list[DocumentBlock] = []
    paragraph: list[str] = []
    offset = 0
    paragraph_start = 0
    current_page = 0

    def flush_paragraph(end_offset: int) -> None:
        """Emit the pending paragraph, if any, ending at ``end_offset``."""
        nonlocal paragraph, paragraph_start
        if not paragraph:
            return
        text = " ".join(part.strip() for part in paragraph if part.strip()).strip()
        if text:
            blocks.append(
                DocumentBlock(
                    text=text,
                    page_number=current_page,
                    char_start=paragraph_start,
                    char_end=end_offset,
                )
            )
        paragraph = []
        paragraph_start = end_offset

    # keepends=True keeps each terminator so offsets count its real length.
    for raw_line in markdown.splitlines(keepends=True):
        line_start = offset
        stripped = raw_line.strip()
        offset += len(raw_line)

        if not stripped:
            flush_paragraph(line_start)
            paragraph_start = offset
            continue

        heading = _HEADING_RE.match(stripped)
        if heading:
            flush_paragraph(line_start)
            level = len(heading.group(1))
            text = heading.group(2).strip()
            page_match = re.match(r"Page\s+(\d+)$", text, flags=re.IGNORECASE)
            if page_match:
                current_page = int(page_match.group(1))
            blocks.append(
                DocumentBlock(
                    text=f"{'#' * level} {text}",
                    page_number=current_page,
                    kind="heading",
                    heading_level=level,
                    char_start=line_start,
                    char_end=offset,
                )
            )
            paragraph_start = offset
            continue

        if not paragraph:
            paragraph_start = line_start
        paragraph.append(raw_line)

    flush_paragraph(offset)
    return blocks
890
+
891
+
892
def _chunk_blocks(
    blocks: list[DocumentBlock],
    *,
    target_chars: int = DEFAULT_TARGET_CHARS,
    max_chars: int = DEFAULT_MAX_CHARS,
) -> list[ParsedChunk]:
    """Group consecutive blocks into chunks of roughly ``target_chars`` characters.

    A chunk is closed when:
      * a heading arrives and the open chunk already holds at least a third of
        ``target_chars``;
      * adding the next block (plus its separator) would exceed ``max_chars``;
      * the open chunk reaches ``target_chars`` on a non-heading block, so a
        heading is never the last block of a chunk closed by this rule.

    Chunk text is the member texts joined by a blank line. Page range covers
    only blocks with a positive page number, or ``(0, 0)`` when there are none.
    """
    separator = "\n\n"
    finished: list[ParsedChunk] = []
    pending: list[tuple[int, DocumentBlock]] = []

    def pending_size() -> int:
        """Length the open chunk would have once joined with separators."""
        if not pending:
            return 0
        text_chars = sum(len(member.text) for _position, member in pending)
        return text_chars + (len(pending) - 1) * len(separator)

    def emit() -> None:
        """Turn the open chunk into a ParsedChunk and start a new one."""
        if not pending:
            return
        first_position, first_block = pending[0]
        last_position, last_block = pending[-1]
        numbered_pages = [
            member.page_number for _position, member in pending if member.page_number > 0
        ]
        if numbered_pages:
            page_span = (min(numbered_pages), max(numbered_pages))
        else:
            page_span = (0, 0)
        joined = separator.join(member.text for _position, member in pending)
        finished.append(
            ParsedChunk(
                index=len(finished),
                text=joined.strip(),
                page_range=page_span,
                element_range=(first_position, last_position),
                char_range=(first_block.char_start, last_block.char_end),
            )
        )
        pending.clear()

    for position, block in enumerate(blocks):
        is_heading = block.kind == "heading"
        if is_heading and pending:
            if pending_size() >= target_chars // 3:
                emit()
        if pending:
            projected = pending_size() + len(block.text) + 2
            if projected > max_chars:
                emit()
        pending.append((position, block))
        if not is_heading and pending_size() >= target_chars:
            emit()

    emit()
    return finished
934
+
935
+
936
def _extract_pdf_images(
    file_name: str,
    file_bytes: bytes,
    *,
    unreadable_pages: list[int],
    describe_images: bool,
    image_describer: ImageDescriber,
) -> list[ParsedImage]:
    """Collect embedded images, plus renders of unreadable pages, from a PDF.

    Pass one walks every page and extracts each embedded image once (keyed by
    xref). Pass two renders, as PNG, every page listed in ``unreadable_pages``
    that is in range and did not already contribute an embedded image.

    When ``describe_images`` is true each image is sent to
    ``image_describer.describe``; if that yields nothing usable, a fixed
    fallback sentence is used as the description instead.

    Returns an empty list when ``fitz`` cannot be imported or the PDF cannot
    be opened.
    """
    try:
        import fitz
    except Exception:
        return []

    try:
        document = fitz.open(stream=file_bytes, filetype="pdf")
    except Exception:
        return []

    collected: list[ParsedImage] = []
    handled_xrefs: set[int] = set()
    illustrated_pages: set[int] = set()

    def describe(payload, *, media_type, name, default_include, parser, lead):
        """Return ``(perception, description, parser label)`` for one image."""
        perception = None
        summary = None
        if describe_images:
            perception = image_describer.describe(
                payload,
                media_type=media_type,
                file_name=name,
                default_include=default_include,
            )
            if perception:
                summary = perception.description
                parser = "openrouter_vlm"
        if not summary:
            # Only consult the describer's API key when a fallback is needed.
            hint = _image_description_fallback(
                describe_images=describe_images,
                has_api_key=bool(image_describer.api_key),
            )
            summary = f"{lead}{hint}"
        return perception, summary, parser

    try:
        # Pass 1: images embedded in the PDF, de-duplicated by xref.
        for page_index, page in enumerate(document, start=1):
            for image_info in page.get_images(full=True):
                xref = int(image_info[0])
                if xref in handled_xrefs:
                    continue
                handled_xrefs.add(xref)

                extracted = document.extract_image(xref)
                payload = extracted.get("image")
                if not (isinstance(payload, bytes) and payload):
                    continue

                ext = str(extracted.get("ext") or "png").lower()
                media_type = _guess_media_type(f".{ext}")
                name = f"{Path(file_name).stem}-page-{page_index}-image-{len(collected) + 1}.{ext}"
                perception, summary, parser = describe(
                    payload,
                    media_type=media_type,
                    name=name,
                    default_include=False,
                    parser="pdf_image_metadata",
                    lead=f"Extracted image from `{file_name}` on page {page_index}. ",
                )
                illustrated_pages.add(page_index)

                if perception:
                    include = perception.include_in_graph
                    vlm_include = perception.include_in_graph
                    kind = perception.image_kind
                    why = perception.reason
                else:
                    # Without a VLM verdict, embedded images are kept only when
                    # description was not requested at all.
                    include = not describe_images
                    vlm_include = None
                    kind = "unknown"
                    why = ""

                collected.append(
                    ParsedImage(
                        index=len(collected),
                        file_name=name,
                        media_type=media_type,
                        description=summary,
                        include_in_graph=include,
                        image_kind=kind,
                        reason=why,
                        page_number=page_index,
                        size_bytes=len(payload),
                        source="pdf",
                        metadata={
                            "parser": parser,
                            "pdf_file_name": file_name,
                            "xref": xref,
                            "vlm_include_in_graph": vlm_include,
                            "vlm_image_kind": kind,
                            "vlm_reason": why,
                        },
                        raw_bytes=payload,
                    )
                )

        # Pass 2: render unreadable pages that yielded no embedded image.
        for page_number in unreadable_pages:
            if page_number in illustrated_pages:
                continue
            if not (1 <= page_number <= document.page_count):
                continue

            page = document[page_number - 1]
            pixmap = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            payload = pixmap.tobytes("png")
            name = f"{Path(file_name).stem}-page-{page_number}.png"
            perception, summary, parser = describe(
                payload,
                media_type="image/png",
                name=name,
                default_include=True,
                parser="pdf_page_render",
                lead=f"Rendered image-only page from `{file_name}` page {page_number}. ",
            )

            if perception:
                include = perception.include_in_graph
                vlm_include = perception.include_in_graph
                kind = perception.image_kind
                why = perception.reason
            else:
                include = True
                vlm_include = None
                kind = "document_page"
                why = ""

            collected.append(
                ParsedImage(
                    index=len(collected),
                    file_name=name,
                    media_type="image/png",
                    description=summary,
                    include_in_graph=include,
                    image_kind=kind,
                    reason=why,
                    page_number=page_number,
                    size_bytes=len(payload),
                    source="pdf_page_render",
                    metadata={
                        "parser": parser,
                        "pdf_file_name": file_name,
                        "vlm_include_in_graph": vlm_include,
                        "vlm_image_kind": kind,
                        "vlm_reason": why,
                    },
                    raw_bytes=payload,
                )
            )
    finally:
        document.close()
    return collected
1061
+
1062
+
1063
+ def _image_data_url(image_bytes: bytes, media_type: str) -> str:
1064
+ encoded = base64.b64encode(image_bytes).decode("ascii")
1065
+ return f"data:{media_type or 'image/png'};base64,{encoded}"
1066
+
1067
+
1068
+ def _image_description_fallback(*, describe_images: bool, has_api_key: bool) -> str:
1069
+ if not describe_images:
1070
+ return "Visual description was skipped because image VLM description is disabled."
1071
+ if not has_api_key:
1072
+ return "Visual description was skipped because OPENROUTER_API_KEY is not configured."
1073
+ return "Visual description was skipped because the image VLM did not return text."
1074
+
1075
+
1076
def _parse_image_perception(content: str, *, default_include: bool) -> ImagePerception | None:
    """Turn a VLM reply into an ``ImagePerception``.

    Returns ``None`` for empty content. A reply that does not contain a JSON
    object is used verbatim as the description with ``default_include``.
    Otherwise the fields are read from the object, falling back to the raw
    reply for a missing description, to ``default_include`` for a missing or
    non-boolean ``include_in_graph``, and to ``"unknown"`` for a missing kind.
    """
    if not content:
        return None

    payload = _loads_json_object(content)
    if not isinstance(payload, dict):
        return ImagePerception(description=content, include_in_graph=default_include)

    summary = str(payload.get("description") or "").strip() or content

    flag = payload.get("include_in_graph", default_include)
    if not isinstance(flag, bool):
        # Values such as "yes" or 1 are not trusted as booleans.
        flag = default_include

    kind = str(payload.get("image_kind") or "unknown").strip() or "unknown"
    why = str(payload.get("reason") or "").strip()

    return ImagePerception(
        description=summary,
        include_in_graph=flag,
        image_kind=kind,
        reason=why,
    )
1095
+
1096
+
1097
+ def _loads_json_object(content: str) -> dict[str, Any] | None:
1098
+ text = content.strip()
1099
+ fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.DOTALL)
1100
+ if fenced:
1101
+ text = fenced.group(1).strip()
1102
+ if not text.startswith("{"):
1103
+ start = text.find("{")
1104
+ end = text.rfind("}")
1105
+ if start >= 0 and end > start:
1106
+ text = text[start : end + 1]
1107
+ try:
1108
+ data = json.loads(text)
1109
+ except ValueError:
1110
+ return None
1111
+ return data if isinstance(data, dict) else None
1112
+
1113
+
1114
+ def _decode_text(file_bytes: bytes, file_name: str) -> str:
1115
+ try:
1116
+ return file_bytes.decode("utf-8")
1117
+ except UnicodeDecodeError:
1118
+ decoded = file_bytes.decode("utf-8", errors="replace")
1119
+ if not decoded.strip():
1120
+ raise DocumentParseError(f"Could not decode text file {file_name}")
1121
+ return decoded
1122
+
1123
+
1124
+ def _guess_media_type(ext: str) -> str:
1125
+ return {
1126
+ ".pdf": "application/pdf",
1127
+ ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
1128
+ ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1129
+ ".xlsm": "application/vnd.ms-excel.sheet.macroEnabled.12",
1130
+ ".xls": "application/vnd.ms-excel",
1131
+ ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
1132
+ ".ppt": "application/vnd.ms-powerpoint",
1133
+ ".md": "text/markdown",
1134
+ ".markdown": "text/markdown",
1135
+ ".txt": "text/plain",
1136
+ ".text": "text/plain",
1137
+ ".csv": "text/csv",
1138
+ ".tsv": "text/tab-separated-values",
1139
+ ".json": "application/json",
1140
+ ".jsonl": "application/x-ndjson",
1141
+ ".yaml": "application/yaml",
1142
+ ".yml": "application/yaml",
1143
+ ".html": "text/html",
1144
+ ".htm": "text/html",
1145
+ ".xml": "application/xml",
1146
+ ".rtf": "application/rtf",
1147
+ ".png": "image/png",
1148
+ ".jpg": "image/jpeg",
1149
+ ".jpeg": "image/jpeg",
1150
+ ".webp": "image/webp",
1151
+ ".gif": "image/gif",
1152
+ }.get(ext.lower(), "application/octet-stream")