docling 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,399 @@
1
+ """Backend for GBS Google Books schema."""
2
+
3
+ import logging
4
+ import tarfile
5
+ from collections.abc import Iterable
6
+ from dataclasses import dataclass
7
+ from enum import Enum
8
+ from io import BytesIO
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
11
+
12
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
13
+ from docling_core.types.doc.page import (
14
+ BoundingRectangle,
15
+ PdfPageBoundaryType,
16
+ PdfPageGeometry,
17
+ SegmentedPdfPage,
18
+ TextCell,
19
+ )
20
+ from lxml import etree
21
+ from PIL import Image
22
+ from PIL.Image import Image as PILImage
23
+
24
+ from docling.backend.abstract_backend import PaginatedDocumentBackend
25
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
26
+ from docling.datamodel.base_models import InputFormat
27
+
28
+ if TYPE_CHECKING:
29
+ from docling.datamodel.document import InputDocument
30
+
31
+ _log = logging.getLogger(__name__)
32
+
33
+
34
def _get_pdf_page_geometry(
    size: Size,
) -> PdfPageGeometry:
    """Build a PdfPageGeometry in which every PDF box equals the full page."""
    full_page = BoundingBox.from_tuple(
        (0, 0, size.width, size.height), CoordOrigin.TOPLEFT
    )
    page_rect = BoundingRectangle.from_bounding_box(full_page)
    return PdfPageGeometry(
        angle=0.0,
        rect=page_rect,
        boundary_type=PdfPageBoundaryType.CROP_BOX,
        art_bbox=full_page,
        bleed_bbox=full_page,
        crop_bbox=full_page,
        media_bbox=full_page,
        trim_bbox=full_page,
    )
52
+
53
+
54
class MetsGbsPageBackend(PdfPageBackend):
    """Page backend for one page of a METS GBS (Google Books) document.

    Wraps the OCR-derived ``SegmentedPdfPage`` and the page image produced
    by ``MetsGbsDocumentBackend._parse_page``.
    """

    def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
        self._im = page_im
        self._dpage = parsed_page
        self.valid = parsed_page is not None

    def is_valid(self) -> bool:
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the concatenated text of OCR lines mostly inside *bbox*."""
        text_piece = ""
        page_size = self.get_size()

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        # FIX: iterate directly; the previous enumerate() index was never used.
        for cell in self._dpage.textline_cells:
            cell_bbox = (
                cell.rect.to_bounding_box()
                .to_top_left_origin(page_height=page_size.height)
                .scaled(scale)
            )

            overlap_frac = cell_bbox.intersection_over_self(bbox)

            # Keep cells that are at least half covered by the query box.
            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell.text

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield top-left-origin boxes of bitmap resources, scaled by *scale*."""
        AREA_THRESHOLD = 0  # 32 * 32

        images = self._dpage.bitmap_resources

        for img in images:
            cropbox = img.rect.to_bounding_box().to_top_left_origin(
                self.get_size().height
            )

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return the page image, optionally scaled and cropped to *cropbox*."""
        page_size = self.get_size()
        # The stored image must match the OCR-declared page dimensions.
        assert (
            page_size.width == self._im.size[0] and page_size.height == self._im.size[1]
        )

        if not cropbox:
            # Default to the whole page.
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )

        image = self._im.resize(
            size=(round(page_size.width * scale), round(page_size.height * scale))
        ).crop(cropbox.scaled(scale=scale).as_tuple())
        return image

    def get_size(self) -> Size:
        return Size(
            width=self._dpage.dimension.width, height=self._dpage.dimension.height
        )

    def unload(self) -> None:
        # Drop references so the image and parsed page can be reclaimed.
        if hasattr(self, "_im"):
            delattr(self, "_im")
        if hasattr(self, "_dpage"):
            delattr(self, "_dpage")
141
+
142
+
143
+ class _UseType(str, Enum):
144
+ IMAGE = "image"
145
+ OCR = "OCR"
146
+ COORD_OCR = "coordOCR"
147
+
148
+
149
@dataclass
class _FileInfo:
    """One ``<mets:file>`` entry: id, mimetype, tar-internal path, USE type."""

    file_id: str
    mimetype: str
    path: str
    use: _UseType
155
+
156
+
157
@dataclass
class _PageFiles:
    """The files belonging to one page, slotted by their USE type."""

    image: Optional[_FileInfo] = None
    ocr: Optional[_FileInfo] = None
    coordOCR: Optional[_FileInfo] = None
162
+
163
+
164
def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
    """Parse an hOCR ``title`` attribute and return its ``bbox`` entry.

    Example input: ``'bbox 279 177 306 214;x_wconf 97'``. Returns None when
    no well-formed ``bbox`` entry is present.
    """
    for fragment in title_str.split(";"):
        fragment = fragment.strip()
        if not fragment.startswith("bbox "):
            continue
        try:
            coords = tuple(int(v) for v in fragment.split()[1:])
            return BoundingRectangle.from_bounding_box(
                bbox=BoundingBox.from_tuple(coords, origin=CoordOrigin.TOPLEFT)
            )
        except Exception:
            # Malformed coordinates: treat as "no bbox".
            return None
    return None
183
+
184
+
185
+ def _extract_confidence(title_str) -> float:
186
+ """Extracts x_wconf (OCR confidence) value from title string."""
187
+ for part in title_str.split(";"):
188
+ part = part.strip()
189
+ if part.startswith("x_wconf"):
190
+ try:
191
+ return float(part.split()[1]) / 100.0
192
+ except Exception:
193
+ return 1
194
+ return 1
195
+
196
+
197
class MetsGbsDocumentBackend(PdfDocumentBackend):
    """Backend reading Google Books METS tarballs (``.tar.gz``).

    The archive holds a METS XML manifest (PROFILE="gbs") that maps each
    page to an image file and hOCR OCR files; each page is served as a
    ``MetsGbsPageBackend``.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

        self._tar: tarfile.TarFile = (
            tarfile.open(name=self.path_or_stream, mode="r:gz")
            if isinstance(self.path_or_stream, Path)
            else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
        )
        self.root_mets: Optional[etree._Element] = None
        self.page_map: Dict[int, _PageFiles] = {}

        # The manifest is the first XML member that validates as GBS METS.
        for member in self._tar.getmembers():
            if member.name.endswith(".xml"):
                file = self._tar.extractfile(member)
                if file is not None:
                    content = file.read()
                    self.root_mets = self._validate_mets_xml(content)
                    if self.root_mets is not None:
                        break

        if self.root_mets is None:
            raise RuntimeError(
                f"METS GBS backend could not load document {self.document_hash}."
            )

        ns = {
            "mets": "http://www.loc.gov/METS/",
            "xlink": "http://www.w3.org/1999/xlink",
            "xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "gbs": "http://books.google.com/gbs",
            "premis": "info:lc/xmlns/premis-v2",
            "marc": "http://www.loc.gov/MARC21/slim",
        }

        # Index every <mets:file> by its ID, keeping only known USE types.
        file_info_by_id: Dict[str, _FileInfo] = {}

        for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns):
            use_raw = filegrp.get("USE")
            try:
                use = _UseType(use_raw)
            except ValueError:
                continue  # Ignore unknown USE types

            for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
                file_id = file_elem.get("ID")
                mimetype = file_elem.get("MIMETYPE")
                flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
                href = (
                    flocat_elem.get("{http://www.w3.org/1999/xlink}href")
                    if flocat_elem is not None
                    else None
                )
                if href is None:
                    continue

                file_info_by_id[file_id] = _FileInfo(
                    file_id=file_id, mimetype=mimetype, path=href, use=use
                )

        USE_TO_ATTR = {
            _UseType.IMAGE: "image",
            _UseType.OCR: "ocr",
            _UseType.COORD_OCR: "coordOCR",
        }

        # Map each <mets:div TYPE="page"> to the files referenced by its fptrs.
        for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
            order_str = div.get("ORDER")
            if not order_str:
                continue
            try:
                page_no = int(order_str) - 1  # make 0-index pages
            except ValueError:
                continue

            page_files = _PageFiles()

            for fptr in div.xpath("./mets:fptr", namespaces=ns):
                file_id = fptr.get("FILEID")
                file_info = file_info_by_id.get(file_id)

                if file_info:
                    attr = USE_TO_ATTR.get(file_info.use)
                    if attr:
                        setattr(page_files, attr, file_info)

            self.page_map[page_no] = page_files

    def _validate_mets_xml(self, xml_string: bytes) -> Optional[etree._Element]:
        """Parse *xml_string*; return the root iff it is a GBS METS manifest.

        Returns None (rather than raising) for malformed XML so that the
        constructor can keep probing the other XML members of the archive.
        """
        try:
            root: etree._Element = etree.fromstring(xml_string)
        except etree.XMLSyntaxError as err:
            # FIX: a non-parsable XML member previously aborted the whole
            # load; skip it and let the caller try the next candidate.
            _log.warning(f"Skipping XML member that failed to parse: {err}")
            return None
        if (
            root.tag == "{http://www.loc.gov/METS/}mets"
            and root.get("PROFILE") == "gbs"
        ):
            return root

        _log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
        return None

    def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
        """Build the segmented page and page image for 0-indexed *page_no*."""
        # TODO: use better fallbacks...
        image_info = self.page_map[page_no].image
        assert image_info is not None
        ocr_info = self.page_map[page_no].coordOCR
        assert ocr_info is not None

        image_file = self._tar.extractfile(image_info.path)
        assert image_file is not None
        buf = BytesIO(image_file.read())
        im: PILImage = Image.open(buf)
        ocr_file = self._tar.extractfile(ocr_info.path)
        assert ocr_file is not None
        ocr_content = ocr_file.read()
        parser = etree.HTMLParser()
        ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)

        line_cells: List[TextCell] = []
        word_cells: List[TextCell] = []

        page_div = ocr_root.xpath("//div[@class='ocr_page']")

        # Prefer the hOCR-declared page box; fall back to the image size.
        size = Size(width=im.size[0], height=im.size[1])
        if page_div:
            title = page_div[0].attrib.get("title", "")
            rect = _extract_rect(title)
            if rect:
                size = Size(width=rect.width, height=rect.height)
        else:
            _log.error(f"Could not find ocr_page for page {page_no}")

        # Resize the image to the OCR coordinate space so boxes line up.
        im = im.resize(size=(round(size.width), round(size.height)))
        im = im.convert("RGB")

        # Extract all ocrx_word spans
        for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
            text = "".join(word.itertext()).strip()
            title = word.attrib.get("title", "")
            rect = _extract_rect(title)
            conf = _extract_confidence(title)
            if rect:
                word_cells.append(
                    TextCell(
                        index=ix,
                        text=text,
                        orig=text,
                        rect=rect,
                        from_ocr=True,
                        confidence=conf,
                    )
                )

        # Extract all ocr_line spans
        for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
            text = "".join(line.itertext()).strip()
            title = line.attrib.get("title", "")
            rect = _extract_rect(title)
            conf = _extract_confidence(title)
            if rect:
                line_cells.append(
                    TextCell(
                        index=ix,
                        text=text,
                        orig=text,
                        rect=rect,
                        from_ocr=True,
                        confidence=conf,
                    )
                )

        page = SegmentedPdfPage(
            dimension=_get_pdf_page_geometry(size),
            textline_cells=line_cells,
            char_cells=[],
            word_cells=word_cells,
            has_textlines=True,
            has_words=True,
            has_chars=False,
        )
        return page, im

    def page_count(self) -> int:
        return len(self.page_map)

    def load_page(self, page_no: int) -> MetsGbsPageBackend:
        # TODO: is this thread-safe?
        page, im = self._parse_page(page_no)
        return MetsGbsPageBackend(parsed_page=page, page_im=im)

    def is_valid(self) -> bool:
        return self.root_mets is not None and self.page_count() > 0

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.METS_GBS}

    @classmethod
    def supports_pagination(cls) -> bool:
        return True

    def unload(self) -> None:
        super().unload()
        self._tar.close()
@@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
84
84
 
85
85
  buf.seek(0)
86
86
  self.path_or_stream = buf
87
- else:
87
+ elif self.input_format not in self.supported_formats():
88
88
  raise RuntimeError(
89
- f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
89
+ f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
90
90
  )
91
91
 
92
92
  @abstractmethod
@@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
99
99
 
100
100
  @classmethod
101
101
  def supported_formats(cls) -> Set[InputFormat]:
102
- return {InputFormat.PDF}
102
+ return {InputFormat.PDF, InputFormat.IMAGE}
103
103
 
104
104
  @classmethod
105
105
  def supports_pagination(cls) -> bool:
docling/cli/main.py CHANGED
@@ -26,6 +26,7 @@ from rich.console import Console
26
26
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
27
27
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
28
28
  from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
29
+ from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
29
30
  from docling.backend.pdf_backend import PdfDocumentBackend
30
31
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
31
32
  from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -262,6 +263,12 @@ def export_documents(
262
263
 
263
264
  else:
264
265
  _log.warning(f"Document {conv_res.input.file} failed to convert.")
266
+ if _log.isEnabledFor(logging.INFO):
267
+ for err in conv_res.errors:
268
+ _log.info(
269
+ f" [Failure Detail] Component: {err.component_type}, "
270
+ f"Module: {err.module_name}, Message: {err.error_message}"
271
+ )
265
272
  failure_count += 1
266
273
 
267
274
  _log.info(
@@ -601,9 +608,18 @@ def convert( # noqa: C901
601
608
  backend=backend, # pdf_backend
602
609
  )
603
610
 
611
+ # METS GBS options
612
+ mets_gbs_options = pipeline_options.model_copy()
613
+ mets_gbs_options.do_ocr = False
614
+ mets_gbs_format_option = PdfFormatOption(
615
+ pipeline_options=mets_gbs_options,
616
+ backend=MetsGbsDocumentBackend,
617
+ )
618
+
604
619
  format_options = {
605
620
  InputFormat.PDF: pdf_format_option,
606
621
  InputFormat.IMAGE: pdf_format_option,
622
+ InputFormat.METS_GBS: mets_gbs_format_option,
607
623
  }
608
624
 
609
625
  elif pipeline == ProcessingPipeline.VLM:
@@ -56,6 +56,7 @@ class InputFormat(str, Enum):
56
56
  XLSX = "xlsx"
57
57
  XML_USPTO = "xml_uspto"
58
58
  XML_JATS = "xml_jats"
59
+ METS_GBS = "mets_gbs"
59
60
  JSON_DOCLING = "json_docling"
60
61
  AUDIO = "audio"
61
62
 
@@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
81
82
  InputFormat.CSV: ["csv"],
82
83
  InputFormat.XLSX: ["xlsx", "xlsm"],
83
84
  InputFormat.XML_USPTO: ["xml", "txt"],
85
+ InputFormat.METS_GBS: ["tar.gz"],
84
86
  InputFormat.JSON_DOCLING: ["json"],
85
87
  InputFormat.AUDIO: ["wav", "mp3"],
86
88
  }
@@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
113
115
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
114
116
  ],
115
117
  InputFormat.XML_USPTO: ["application/xml", "text/plain"],
118
+ InputFormat.METS_GBS: ["application/mets+xml"],
116
119
  InputFormat.JSON_DOCLING: ["application/json"],
117
120
  InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
118
121
  }
@@ -1,6 +1,7 @@
1
1
  import csv
2
2
  import logging
3
3
  import re
4
+ import tarfile
4
5
  from collections.abc import Iterable
5
6
  from enum import Enum
6
7
  from io import BytesIO
@@ -314,6 +315,10 @@ class _DocumentConversionInput(BaseModel):
314
315
  elif objname.endswith(".pptx"):
315
316
  mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
316
317
 
318
+ if mime is not None and mime.lower() == "application/gzip":
319
+ if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
320
+ mime = detected_mime
321
+
317
322
  mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
318
323
  mime = mime or _DocumentConversionInput._detect_csv(content)
319
324
  mime = mime or "text/plain"
@@ -457,3 +462,24 @@ class _DocumentConversionInput(BaseModel):
457
462
  return None
458
463
 
459
464
  return None
465
+
466
+ @staticmethod
467
+ def _detect_mets_gbs(
468
+ obj: Union[Path, DocumentStream],
469
+ ) -> Optional[Literal["application/mets+xml"]]:
470
+ content = obj if isinstance(obj, Path) else obj.stream
471
+ tar: tarfile.TarFile
472
+ member: tarfile.TarInfo
473
+ with tarfile.open(
474
+ name=content if isinstance(content, Path) else None,
475
+ fileobj=content if isinstance(content, BytesIO) else None,
476
+ mode="r:gz",
477
+ ) as tar:
478
+ for member in tar.getmembers():
479
+ if member.name.endswith(".xml"):
480
+ file = tar.extractfile(member)
481
+ if file is not None:
482
+ content_str = file.read().decode(errors="ignore")
483
+ if "http://www.loc.gov/METS/" in content_str:
484
+ return "application/mets+xml"
485
+ return None
@@ -1,5 +1,5 @@
1
1
  from enum import Enum
2
- from typing import Any, Callable, Dict, List, Literal, Optional, Union
2
+ from typing import Any, Dict, List, Literal, Optional
3
3
 
4
4
  from docling_core.types.doc.page import SegmentedPage
5
5
  from pydantic import AnyUrl, BaseModel
@@ -10,11 +10,17 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
10
10
 
11
11
  class BaseVlmOptions(BaseModel):
12
12
  kind: str
13
- prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
13
+ prompt: str
14
14
  scale: float = 2.0
15
15
  max_size: Optional[int] = None
16
16
  temperature: float = 0.0
17
17
 
18
+ def build_prompt(self, page: Optional[SegmentedPage]) -> str:
19
+ return self.prompt
20
+
21
+ def decode_response(self, text: str) -> str:
22
+ return text
23
+
18
24
 
19
25
  class ResponseFormat(str, Enum):
20
26
  DOCTAGS = "doctags"
@@ -5,7 +5,9 @@ import threading
5
5
  import time
6
6
  from collections.abc import Iterable, Iterator
7
7
  from concurrent.futures import ThreadPoolExecutor
8
+ from datetime import datetime
8
9
  from functools import partial
10
+ from io import BytesIO
9
11
  from pathlib import Path
10
12
  from typing import Dict, List, Optional, Tuple, Type, Union
11
13
 
@@ -18,6 +20,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
18
20
  from docling.backend.html_backend import HTMLDocumentBackend
19
21
  from docling.backend.json.docling_json_backend import DoclingJSONBackend
20
22
  from docling.backend.md_backend import MarkdownDocumentBackend
23
+ from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
21
24
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
22
25
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
23
26
  from docling.backend.msword_backend import MsWordDocumentBackend
@@ -157,6 +160,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
157
160
  InputFormat.XML_JATS: FormatOption(
158
161
  pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
159
162
  ),
163
+ InputFormat.METS_GBS: FormatOption(
164
+ pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
165
+ ),
160
166
  InputFormat.IMAGE: FormatOption(
161
167
  pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
162
168
  ),
@@ -275,6 +281,34 @@ class DocumentConverter:
275
281
  "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
276
282
  )
277
283
 
284
+ @validate_call(config=ConfigDict(strict=True))
285
+ def convert_string(
286
+ self,
287
+ content: str,
288
+ format: InputFormat,
289
+ name: Optional[str],
290
+ ) -> ConversionResult:
291
+ name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
292
+
293
+ if format == InputFormat.MD:
294
+ if not name.endswith(".md"):
295
+ name += ".md"
296
+
297
+ buff = BytesIO(content.encode("utf-8"))
298
+ doc_stream = DocumentStream(name=name, stream=buff)
299
+
300
+ return self.convert(doc_stream)
301
+ elif format == InputFormat.HTML:
302
+ if not name.endswith(".html"):
303
+ name += ".html"
304
+
305
+ buff = BytesIO(content.encode("utf-8"))
306
+ doc_stream = DocumentStream(name=name, stream=buff)
307
+
308
+ return self.convert(doc_stream)
309
+ else:
310
+ raise ValueError(f"format {format} is not supported in `convert_string`")
311
+
278
312
  def _convert(
279
313
  self, conv_input: _DocumentConversionInput, raises_on_error: bool
280
314
  ) -> Iterator[ConversionResult]:
@@ -53,11 +53,7 @@ class ApiVlmModel(BasePageModel):
53
53
  if hi_res_image.mode != "RGB":
54
54
  hi_res_image = hi_res_image.convert("RGB")
55
55
 
56
- if callable(self.vlm_options.prompt):
57
- prompt = self.vlm_options.prompt(page.parsed_page)
58
- else:
59
- prompt = self.vlm_options.prompt
60
-
56
+ prompt = self.vlm_options.build_prompt(page.parsed_page)
61
57
  page_tags = api_image_request(
62
58
  image=hi_res_image,
63
59
  prompt=prompt,
@@ -67,6 +63,7 @@ class ApiVlmModel(BasePageModel):
67
63
  **self.params,
68
64
  )
69
65
 
66
+ page_tags = self.vlm_options.decode_response(page_tags)
70
67
  page.predictions.vlm_response = VlmPrediction(text=page_tags)
71
68
 
72
69
  return page
@@ -135,10 +135,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
135
135
  )
136
136
 
137
137
  # Define prompt structure
138
- if callable(self.vlm_options.prompt):
139
- user_prompt = self.vlm_options.prompt(page.parsed_page)
140
- else:
141
- user_prompt = self.vlm_options.prompt
138
+ user_prompt = self.vlm_options.build_prompt(page.parsed_page)
142
139
  prompt = self.formulate_prompt(user_prompt)
143
140
 
144
141
  inputs = self.processor(
@@ -166,6 +163,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
166
163
  _log.debug(
167
164
  f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
168
165
  )
166
+ generated_texts = self.vlm_options.decode_response(generated_texts)
169
167
  page.predictions.vlm_response = VlmPrediction(
170
168
  text=generated_texts,
171
169
  generation_time=generation_time,
@@ -35,9 +35,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
35
35
 
36
36
  if self.enabled:
37
37
  try:
38
- from mlx_vlm import generate, load # type: ignore
38
+ from mlx_vlm import generate, load, stream_generate # type: ignore
39
39
  from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
40
- from mlx_vlm.utils import load_config, stream_generate # type: ignore
40
+ from mlx_vlm.utils import load_config # type: ignore
41
41
  except ImportError:
42
42
  raise ImportError(
43
43
  "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
@@ -84,10 +84,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
84
84
  if hi_res_image.mode != "RGB":
85
85
  hi_res_image = hi_res_image.convert("RGB")
86
86
 
87
- if callable(self.vlm_options.prompt):
88
- user_prompt = self.vlm_options.prompt(page.parsed_page)
89
- else:
90
- user_prompt = self.vlm_options.prompt
87
+ user_prompt = self.vlm_options.build_prompt(page.parsed_page)
91
88
  prompt = self.apply_chat_template(
92
89
  self.processor, self.config, user_prompt, num_images=1
93
90
  )
@@ -142,6 +139,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
142
139
  _log.debug(
143
140
  f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
144
141
  )
142
+ page_tags = self.vlm_options.decode_response(page_tags)
145
143
  page.predictions.vlm_response = VlmPrediction(
146
144
  text=page_tags,
147
145
  generation_time=generation_time,