docfold 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docfold might be problematic. Click here for more details.

@@ -0,0 +1,215 @@
1
+ """Google Document AI engine adapter — cloud document understanding.
2
+
3
+ Install: ``pip install docfold[google-docai]``
4
+
5
+ Requires Google Cloud credentials and a Document AI processor.
6
+ Set ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable for auth,
7
+ and configure processor via constructor or environment variables:
8
+ ``GOOGLE_DOCAI_PROJECT_ID``, ``GOOGLE_DOCAI_LOCATION``, ``GOOGLE_DOCAI_PROCESSOR_ID``.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import os
15
+ import time
16
+ from typing import Any
17
+
18
+ from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
19
+
20
logger = logging.getLogger(__name__)

# File extension (lower-case, no leading dot) -> MIME type sent with the
# raw document.
_MIME_MAP = dict(
    pdf="application/pdf",
    png="image/png",
    jpg="image/jpeg",
    jpeg="image/jpeg",
    tiff="image/tiff",
    tif="image/tiff",
    gif="image/gif",
    bmp="image/bmp",
    webp="image/webp",
)

# Every extension that has a MIME mapping is advertised as supported.
_SUPPORTED_EXTENSIONS = set(_MIME_MAP)
35
+
36
+
37
class GoogleDocAIEngine(DocumentEngine):
    """Adapter for Google Document AI.

    Sends a local file to a configured Document AI processor and converts
    the response into an ``EngineResult`` carrying the extracted text,
    paragraph bounding boxes, an average paragraph confidence and tables.

    See https://cloud.google.com/document-ai
    """

    def __init__(
        self,
        project_id: str | None = None,
        location: str | None = None,
        processor_id: str | None = None,
    ) -> None:
        """Store the processor coordinates.

        Each argument falls back to an environment variable when omitted.

        Args:
            project_id: Falls back to ``GOOGLE_DOCAI_PROJECT_ID``.
            location: Falls back to ``GOOGLE_DOCAI_LOCATION``, then ``"us"``.
            processor_id: Falls back to ``GOOGLE_DOCAI_PROCESSOR_ID``.
        """
        self._project_id = project_id or os.getenv("GOOGLE_DOCAI_PROJECT_ID")
        # The trailing ``or "us"`` also covers an env var that is set but empty.
        self._location = location or os.getenv("GOOGLE_DOCAI_LOCATION") or "us"
        self._processor_id = processor_id or os.getenv("GOOGLE_DOCAI_PROCESSOR_ID")

    @property
    def name(self) -> str:
        """Engine identifier reported in results."""
        return "google_docai"

    @property
    def supported_extensions(self) -> set[str]:
        """File extensions (without dot) this engine accepts."""
        return _SUPPORTED_EXTENSIONS

    @property
    def capabilities(self) -> EngineCapabilities:
        """Feature flags advertised for this engine."""
        return EngineCapabilities(
            bounding_boxes=True,
            confidence=True,
            table_structure=True,
            heading_detection=True,
            reading_order=True,
        )

    def is_available(self) -> bool:
        """Return True when the client library imports and IDs are configured."""
        try:
            from google.cloud import documentai  # noqa: F401
        except ImportError:
            return False
        return bool(self._project_id and self._processor_id)

    async def process(
        self,
        file_path: str,
        output_format: OutputFormat = OutputFormat.MARKDOWN,
        **kwargs: Any,
    ) -> EngineResult:
        """Process *file_path* and return the engine result.

        The blocking client call runs in the event loop's default executor.
        ``kwargs`` is accepted but not used by this engine.
        """
        import asyncio

        start = time.perf_counter()

        loop = asyncio.get_running_loop()
        content, metadata, boxes, conf, tables = await loop.run_in_executor(
            None, self._process_document, file_path, output_format
        )

        elapsed_ms = int((time.perf_counter() - start) * 1000)

        return EngineResult(
            content=content,
            format=output_format,
            engine_name=self.name,
            processing_time_ms=elapsed_ms,
            metadata=metadata,
            bounding_boxes=boxes,
            confidence=conf,
            tables=tables,
        )

    def _process_document(
        self,
        file_path: str,
        output_format: OutputFormat,
    ) -> tuple[str, dict, list[dict], float | None, list[dict] | None]:
        """Run the synchronous Document AI request and shape its output.

        Returns:
            ``(content, metadata, bounding_boxes, avg_confidence, tables)``.
            ``avg_confidence`` is ``None`` when no paragraph reported a
            confidence; ``tables`` is ``None`` when no table had any rows.
        """
        from google.cloud import documentai

        # Document AI is served from regional endpoints. Google's client
        # documentation requires ``api_endpoint`` to be set for locations
        # other than "us", and its samples set it for every location.
        client = documentai.DocumentProcessorServiceClient(
            client_options={
                "api_endpoint": f"{self._location}-documentai.googleapis.com",
            }
        )
        processor_name = client.processor_path(
            self._project_id, self._location, self._processor_id
        )

        ext = os.path.splitext(file_path)[1].lstrip(".").lower()
        mime_type = _MIME_MAP.get(ext, "application/octet-stream")

        with open(file_path, "rb") as f:
            raw_document = documentai.RawDocument(content=f.read(), mime_type=mime_type)

        request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)
        document = client.process_document(request=request).document

        full_text = document.text or ""

        bounding_boxes: list[dict[str, Any]] = []
        confidences: list[float] = []
        tables: list[dict[str, Any]] = []

        for page in document.pages:
            page_num = page.page_number

            for paragraph in page.paragraphs:
                layout = paragraph.layout
                conf = layout.confidence
                # Falsy confidences are left out of the average.
                if conf:
                    confidences.append(conf)

                vertices = self._get_vertices(layout)
                if vertices:
                    bounding_boxes.append({
                        "type": "paragraph",
                        "text": self._get_text_segment(layout, full_text),
                        "vertices": vertices,
                        "page": page_num,
                        "confidence": conf,
                    })

            for table in page.tables:
                table_data = self._extract_table(table, full_text)
                if table_data:
                    tables.append(table_data)

        avg_conf = sum(confidences) / len(confidences) if confidences else None

        if output_format == OutputFormat.JSON:
            import json

            content = json.dumps(
                {"text": full_text, "page_count": len(document.pages)},
                ensure_ascii=False,
            )
        elif output_format == OutputFormat.HTML:
            import html

            # OCR text may contain "<", ">" or "&"; escape it so it cannot
            # break or inject markup in the generated page.
            content = f"<html><body><pre>{html.escape(full_text)}</pre></body></html>"
        else:
            content = full_text

        metadata = {
            "page_count": len(document.pages),
            "processor_id": self._processor_id,
            "mime_type": mime_type,
        }

        return content, metadata, bounding_boxes, avg_conf, tables or None

    def _get_text_segment(self, layout: Any, full_text: str) -> str:
        """Return the text that *layout*'s text anchor points to in *full_text*.

        Each anchor segment is a ``[start_index, end_index)`` slice; missing
        indices are treated as 0. Slices are concatenated, then stripped.
        """
        segments = layout.text_anchor.text_segments if layout.text_anchor else []
        parts = []
        for segment in segments:
            start = int(segment.start_index) if segment.start_index else 0
            end = int(segment.end_index) if segment.end_index else 0
            parts.append(full_text[start:end])
        return "".join(parts).strip()

    def _get_vertices(self, layout: Any) -> list[dict[str, float]] | None:
        """Return the layout's normalized vertices as ``{"x", "y"}`` dicts.

        Returns ``None`` when the layout has no bounding poly or vertices.
        """
        bp = layout.bounding_poly
        if not bp or not bp.normalized_vertices:
            return None
        return [{"x": v.x, "y": v.y} for v in bp.normalized_vertices]

    def _extract_table(self, table: Any, full_text: str) -> dict[str, Any] | None:
        """Convert a Document AI table into ``{"rows": [...]}``.

        Header rows come first, then body rows; each row is a dict with
        ``"type"`` (``"header"`` or ``"body"``) and ``"cells"`` (cell texts).
        Returns ``None`` for a table with no rows.
        """
        rows_data = []
        for row_type, rows in (("header", table.header_rows), ("body", table.body_rows)):
            for row in rows:
                cells = [self._get_text_segment(cell.layout, full_text) for cell in row.cells]
                rows_data.append({"type": row_type, "cells": cells})

        if not rows_data:
            return None
        return {"rows": rows_data}
@@ -0,0 +1,107 @@
1
+ """LlamaParse engine adapter — LLM-powered document parsing by LlamaIndex.
2
+
3
+ Install: ``pip install docfold[llamaparse]``
4
+
5
+ Requires an API key: https://cloud.llamaindex.ai/
6
+ Set ``LLAMA_CLOUD_API_KEY`` environment variable.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import os
13
+ import time
14
+ from typing import Any
15
+
16
+ from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
17
+
18
logger = logging.getLogger(__name__)

# Extensions (lower-case, no leading dot) that the LlamaParse adapter
# advertises as supported.
_SUPPORTED_EXTENSIONS = {
    # documents
    "pdf", "docx", "doc", "epub",
    # presentations and spreadsheets
    "pptx", "ppt", "xlsx", "xls", "csv",
    # web pages
    "html", "htm",
    # images
    "png", "jpg", "jpeg",
}
24
+
25
+
26
class LlamaParseEngine(DocumentEngine):
    """Adapter for LlamaParse (LlamaIndex Cloud).

    LLM-powered parsing with excellent table and layout understanding.
    Free tier: 1000 pages/day.

    See https://docs.llamaindex.ai/en/stable/llama_cloud/llama_parse/
    """

    def __init__(self, api_key: str | None = None, result_type: str = "markdown") -> None:
        """Store credentials and the fallback result type.

        Args:
            api_key: LlamaCloud API key; falls back to ``LLAMA_CLOUD_API_KEY``.
            result_type: LlamaParse result type used when the requested
                ``OutputFormat`` has no explicit mapping in ``_parse``.
        """
        self._api_key = api_key or os.getenv("LLAMA_CLOUD_API_KEY")
        self._result_type = result_type

    @property
    def name(self) -> str:
        """Engine identifier reported in results."""
        return "llamaparse"

    @property
    def supported_extensions(self) -> set[str]:
        """File extensions (without dot) this engine accepts."""
        return _SUPPORTED_EXTENSIONS

    @property
    def capabilities(self) -> EngineCapabilities:
        """Feature flags advertised for this engine."""
        return EngineCapabilities(table_structure=True, heading_detection=True)

    def is_available(self) -> bool:
        """Return True when ``llama_parse`` imports and an API key is set."""
        try:
            import llama_parse  # noqa: F401
        except ImportError:
            return False
        return bool(self._api_key)

    async def process(
        self,
        file_path: str,
        output_format: OutputFormat = OutputFormat.MARKDOWN,
        **kwargs: Any,
    ) -> EngineResult:
        """Parse *file_path* with LlamaParse and return the engine result.

        ``kwargs`` is accepted but not used by this engine.
        """
        start = time.perf_counter()

        content, metadata = await self._parse(file_path, output_format)

        elapsed_ms = int((time.perf_counter() - start) * 1000)

        return EngineResult(
            content=content,
            format=output_format,
            engine_name=self.name,
            processing_time_ms=elapsed_ms,
            metadata=metadata,
        )

    async def _parse(
        self, file_path: str, output_format: OutputFormat
    ) -> tuple[str, dict]:
        """Call LlamaParse and format the returned documents.

        Returns:
            ``(content, metadata)``. For ``OutputFormat.JSON`` the content is
            a JSON array of ``{"page", "text"}`` objects, one per returned
            document; otherwise the document texts joined by blank lines.
        """
        from llama_parse import LlamaParse

        fmt_map = {
            OutputFormat.MARKDOWN: "markdown",
            OutputFormat.HTML: "html",
            # JSON is assembled locally from the markdown documents below.
            OutputFormat.JSON: "markdown",
            OutputFormat.TEXT: "text",
        }
        # NOTE(review): confirm the installed llama_parse accepts "html" as a
        # result_type; this mapping is kept exactly as it was.
        # Formats without a mapping use the constructor's result_type rather
        # than raising a bare KeyError.
        result_type = fmt_map.get(output_format, self._result_type)

        parser = LlamaParse(api_key=self._api_key, result_type=result_type)
        documents = await parser.aload_data(file_path)

        if output_format == OutputFormat.JSON:
            import json

            data = [{"page": i + 1, "text": doc.text} for i, doc in enumerate(documents)]
            content = json.dumps(data, ensure_ascii=False)
        else:
            content = "\n\n".join(doc.text for doc in documents)

        metadata = {
            "result_type": result_type,
            "document_count": len(documents),
        }
        return content, metadata
@@ -0,0 +1,146 @@
1
+ """Marker API (Datalab) engine adapter.
2
+
3
+ Install: ``pip install docfold[marker]``
4
+
5
+ Requires a Datalab API key: https://www.datalab.to/
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import os
12
+ import time
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
+ from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
17
+
18
logger = logging.getLogger(__name__)

# Extensions (lower-case, no leading dot) that the Marker adapter
# advertises as supported.
_SUPPORTED_EXTENSIONS = {
    # office and OpenDocument formats
    "pdf", "docx", "doc", "pptx", "ppt", "xlsx", "xls", "odt", "odp", "ods",
    # markup and e-books
    "html", "epub",
    # images
    "png", "jpg", "jpeg", "webp", "gif", "tiff",
}

# Endpoint that receives the document upload.
_API_BASE = "https://www.datalab.to/api/v1/marker"
# Pause between status polls (passed to time.sleep, i.e. seconds).
_DEFAULT_POLL_INTERVAL = 2
# Number of status polls attempted before giving up.
_DEFAULT_MAX_POLLS = 300
29
+
30
+
31
class MarkerEngine(DocumentEngine):
    """Adapter for the Marker API (Datalab SaaS).

    Uploads a file, then polls the returned status URL until the conversion
    completes, fails, or the polling window is exhausted.

    See https://documentation.datalab.to/
    """

    def __init__(
        self,
        api_key: str | None = None,
        use_llm: bool = False,
        force_ocr: bool = False,
    ) -> None:
        """Store credentials and conversion options.

        Args:
            api_key: Datalab API key; falls back to ``MARKER_API_KEY`` and
                then ``DATALAB_API_KEY``.
            use_llm: Sent as the ``use_llm`` form field.
            force_ocr: Sent as the ``force_ocr`` form field.
        """
        self._api_key = api_key or os.getenv("MARKER_API_KEY") or os.getenv("DATALAB_API_KEY")
        self._use_llm = use_llm
        self._force_ocr = force_ocr

    @property
    def name(self) -> str:
        """Engine identifier reported in results."""
        return "marker"

    @property
    def supported_extensions(self) -> set[str]:
        """File extensions (without dot) this engine accepts."""
        return _SUPPORTED_EXTENSIONS

    @property
    def capabilities(self) -> EngineCapabilities:
        """Feature flags advertised for this engine."""
        return EngineCapabilities(
            bounding_boxes=True,
            images=True,
            table_structure=True,
            heading_detection=True,
        )

    def is_available(self) -> bool:
        """Return True when ``requests`` imports and an API key is set."""
        try:
            import requests  # noqa: F401
        except ImportError:
            return False
        return bool(self._api_key)

    async def process(
        self,
        file_path: str,
        output_format: OutputFormat = OutputFormat.MARKDOWN,
        **kwargs: Any,
    ) -> EngineResult:
        """Convert *file_path* through the Marker API.

        The blocking upload/poll cycle runs in the event loop's default
        executor. ``kwargs`` is accepted but not used by this engine.
        """
        import asyncio

        start = time.perf_counter()

        loop = asyncio.get_running_loop()
        content, images, meta = await loop.run_in_executor(
            None, self._call_marker, file_path, output_format
        )

        elapsed_ms = int((time.perf_counter() - start) * 1000)

        return EngineResult(
            content=content,
            format=output_format,
            engine_name=self.name,
            images=images,
            pages=meta.get("page_count"),
            processing_time_ms=elapsed_ms,
            metadata=meta,
        )

    def _call_marker(
        self,
        file_path: str,
        output_format: OutputFormat,
    ) -> tuple[str, dict | None, dict]:
        """Upload the file and poll until the conversion finishes.

        Returns:
            ``(content, images, meta)`` taken from the completed response.

        Raises:
            requests.HTTPError: If the upload or a status poll returns an
                HTTP error status.
            RuntimeError: If the upload response has no status URL, or the
                API reports ``status == "failed"``.
            TimeoutError: If no terminal status arrives within
                ``_DEFAULT_MAX_POLLS`` polls.
        """
        import requests

        fmt_map = {
            OutputFormat.MARKDOWN: "markdown",
            OutputFormat.HTML: "html",
            OutputFormat.JSON: "json",
            OutputFormat.TEXT: "markdown",  # Marker doesn't have plain text; use markdown
        }
        marker_fmt = fmt_map[output_format]

        headers = {"X-Api-Key": self._api_key}

        # The handle must stay open while requests streams the multipart body.
        with open(file_path, "rb") as f:
            form_data = {
                "file": (Path(file_path).name, f, "application/octet-stream"),
                "output_format": (None, marker_fmt),
                "use_llm": (None, str(self._use_llm)),
                "force_ocr": (None, str(self._force_ocr)),
                "paginate": (None, "False"),
                "strip_existing_ocr": (None, "False"),
                "disable_image_extraction": (None, "False"),
            }
            resp = requests.post(_API_BASE, files=form_data, headers=headers, timeout=30)
        resp.raise_for_status()
        data = resp.json()

        check_url = data.get("request_check_url")
        if not check_url:
            # Report the API's own error rather than a bare KeyError.
            raise RuntimeError(f"Marker API failed: {data.get('error')}")

        for _ in range(_DEFAULT_MAX_POLLS):
            time.sleep(_DEFAULT_POLL_INTERVAL)
            resp = requests.get(check_url, headers=headers, timeout=30)
            # Fail fast on HTTP errors (bad key, unknown request) rather than
            # polling an error response until the window runs out.
            resp.raise_for_status()
            result = resp.json()

            status = result.get("status")
            if status == "complete":
                content = result.get(marker_fmt)
                if content is None:
                    content = ""
                elif not isinstance(content, str):
                    # A structured (non-string) payload is serialised so the
                    # returned content is always text.
                    import json

                    content = json.dumps(content, ensure_ascii=False)
                meta = {
                    "page_count": result.get("page_count"),
                    "marker_output_format": marker_fmt,
                }
                return content, result.get("images"), meta

            if status == "failed":
                raise RuntimeError(f"Marker API failed: {result.get('error')}")

        raise TimeoutError("Marker API did not complete within the polling window.")
@@ -0,0 +1,102 @@
1
+ """MinerU / PDF-Extract-Kit engine adapter.
2
+
3
+ Install: ``pip install docfold[mineru]``
4
+
5
+ Note: First run downloads model weights (~2-5 GB).
6
+ License: AGPL-3.0 — see https://github.com/opendatalab/MinerU
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import time
13
+ from typing import Any
14
+
15
+ from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
16
+
17
logger = logging.getLogger(__name__)

# The MinerU adapter only advertises PDF input.
_SUPPORTED_EXTENSIONS = set(("pdf",))
20
+
21
+
22
class MinerUEngine(DocumentEngine):
    """Adapter for MinerU (magic-pdf), the PDF structuring tool built on
    PDF-Extract-Kit.

    The processing step is still a placeholder: ``_run_mineru`` always
    raises ``NotImplementedError``.

    See https://github.com/opendatalab/MinerU
    """

    def __init__(self, config_path: str | None = None, gpu: bool = False) -> None:
        """Remember the configuration path and GPU flag.

        Both values are only stored; nothing in this class reads them yet.
        """
        self._gpu = gpu
        self._config_path = config_path

    @property
    def name(self) -> str:
        """Engine identifier reported in results."""
        return "mineru"

    @property
    def supported_extensions(self) -> set[str]:
        """File extensions (without dot) this engine accepts."""
        return _SUPPORTED_EXTENSIONS

    @property
    def capabilities(self) -> EngineCapabilities:
        """Feature flags advertised for this engine."""
        return EngineCapabilities(
            table_structure=True,
            heading_detection=True,
            reading_order=True,
        )

    def is_available(self) -> bool:
        """Return True when the ``magic_pdf`` package can be imported."""
        try:
            import magic_pdf  # noqa: F401
        except ImportError:
            return False
        return True

    async def process(
        self,
        file_path: str,
        output_format: OutputFormat = OutputFormat.MARKDOWN,
        **kwargs: Any,
    ) -> EngineResult:
        """Run MinerU on *file_path* in the default executor and wrap the output.

        ``kwargs`` is accepted but not used by this engine.
        """
        import asyncio

        started = time.perf_counter()

        pending = asyncio.get_running_loop().run_in_executor(
            None, self._run_mineru, file_path, output_format
        )
        content, metadata = await pending

        duration_ms = int((time.perf_counter() - started) * 1000)

        return EngineResult(
            content=content,
            format=output_format,
            engine_name=self.name,
            processing_time_ms=duration_ms,
            metadata=metadata,
        )

    def _run_mineru(self, file_path: str, output_format: OutputFormat) -> tuple[str, dict]:
        """Synchronous MinerU processing (placeholder).

        Not implemented yet: the real integration depends on MinerU's Python
        API, which may change between versions. The import paths and calls
        must be adapted to whichever version is installed.

        Raises:
            NotImplementedError: Always.
        """
        # TODO: Replace with actual MinerU API calls once version is pinned.
        # Sketch of the expected flow:
        #
        #   from magic_pdf.pipe.UNIPipe import UNIPipe
        #   from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
        #
        #   reader = DiskReaderWriter(parent_dir)
        #   pipe = UNIPipe(pdf_bytes, model_list, reader)
        #   pipe.pipe_classify()
        #   pipe.pipe_analyze()
        #   pipe.pipe_parse()
        #   md_content = pipe.pipe_mk_markdown(...)

        raise NotImplementedError(
            "MinerU adapter requires magic-pdf to be installed and configured. "
            "Install with: pip install docfold[mineru]"
        )
@@ -0,0 +1,128 @@
1
+ """Mistral OCR engine adapter — Vision LLM-powered document understanding.
2
+
3
+ Install: ``pip install docfold[mistral-ocr]``
4
+
5
+ Requires an API key: https://console.mistral.ai/
6
+ Set ``MISTRAL_API_KEY`` environment variable.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import os
13
+ import time
14
+ from typing import Any
15
+
16
+ from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
17
+
18
logger = logging.getLogger(__name__)

# Extensions (lower-case, no leading dot) that the Mistral OCR adapter
# advertises as supported.
_SUPPORTED_EXTENSIONS = {
    "pdf",
    # raster images
    "png", "jpg", "jpeg", "tiff", "tif", "webp", "bmp",
}
21
+
22
+
23
class MistralOCREngine(DocumentEngine):
    """Adapter for Mistral's OCR API.

    Uploads the file through the Mistral client, runs ``client.ocr.process``
    on the uploaded file and combines the per-page markdown into one result.

    See https://docs.mistral.ai/capabilities/document/
    """

    def __init__(
        self,
        api_key: str | None = None,
        model: str = "mistral-ocr-latest",
    ) -> None:
        """Store credentials and the OCR model name.

        Args:
            api_key: Mistral API key; falls back to ``MISTRAL_API_KEY``.
            model: Model identifier passed to ``client.ocr.process``.
        """
        self._api_key = api_key or os.getenv("MISTRAL_API_KEY")
        self._model = model

    @property
    def name(self) -> str:
        """Engine identifier reported in results."""
        return "mistral_ocr"

    @property
    def supported_extensions(self) -> set[str]:
        """File extensions (without dot) this engine accepts."""
        return _SUPPORTED_EXTENSIONS

    @property
    def capabilities(self) -> EngineCapabilities:
        """Feature flags advertised for this engine."""
        return EngineCapabilities(table_structure=True, heading_detection=True)

    def is_available(self) -> bool:
        """Return True when ``mistralai`` imports and an API key is set."""
        try:
            import mistralai  # noqa: F401
        except ImportError:
            return False
        return bool(self._api_key)

    async def process(
        self,
        file_path: str,
        output_format: OutputFormat = OutputFormat.MARKDOWN,
        **kwargs: Any,
    ) -> EngineResult:
        """OCR *file_path* with Mistral and return the engine result.

        The blocking client calls run in the event loop's default executor.
        ``kwargs`` is accepted but not used by this engine.
        """
        import asyncio

        start = time.perf_counter()

        loop = asyncio.get_running_loop()
        content, metadata = await loop.run_in_executor(
            None, self._call_ocr, file_path, output_format
        )

        elapsed_ms = int((time.perf_counter() - start) * 1000)

        return EngineResult(
            content=content,
            format=output_format,
            engine_name=self.name,
            processing_time_ms=elapsed_ms,
            metadata=metadata,
        )

    def _call_ocr(self, file_path: str, output_format: OutputFormat) -> tuple[str, dict]:
        """Upload the file, run OCR and format the pages.

        Returns:
            ``(content, metadata)``. Content is a JSON array of
            ``{"page", "text"}`` objects for ``OutputFormat.JSON``, one
            ``<div class='page'>`` per page for ``OutputFormat.HTML``, and
            the page markdown joined by blank lines otherwise.
        """
        from mistralai import Mistral

        client = Mistral(api_key=self._api_key)

        # NOTE(review): Mistral's OCR guide uploads with purpose="ocr";
        # confirm whether it is required for the installed SDK version.
        with open(file_path, "rb") as f:
            file_data = {"file_name": os.path.basename(file_path), "content": f}
            uploaded = client.files.upload(file=file_data)

        # NOTE(review): verify this document chunk shape against the SDK's
        # accepted types; it is kept exactly as it was.
        ocr_response = client.ocr.process(
            model=self._model,
            document={"type": "file_id", "file_id": uploaded.id},
        )

        pages_md = [page.markdown for page in ocr_response.pages]

        if output_format == OutputFormat.JSON:
            import json

            data = [{"page": i, "text": md} for i, md in enumerate(pages_md, start=1)]
            content = json.dumps(data, ensure_ascii=False)
        elif output_format == OutputFormat.HTML:
            import html

            # The markdown is not rendered, only embedded as text; escape it
            # so "<", ">" and "&" cannot break or inject markup.
            html_parts = [
                f"<div class='page' data-page='{i}'><p>{html.escape(md)}</p></div>"
                for i, md in enumerate(pages_md, start=1)
            ]
            content = "<html><body>" + "\n".join(html_parts) + "</body></html>"
        else:
            content = "\n\n".join(pages_md)

        metadata = {
            "model": self._model,
            "page_count": len(pages_md),
            "file_id": uploaded.id,
        }
        return content, metadata