docling-2.69.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,399 @@
1
+ """Backend for GBS Google Books schema."""
2
+
3
+ import logging
4
+ import tarfile
5
+ from collections.abc import Iterable
6
+ from dataclasses import dataclass
7
+ from enum import Enum
8
+ from io import BytesIO
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
11
+
12
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
13
+ from docling_core.types.doc.page import (
14
+ BoundingRectangle,
15
+ PdfPageBoundaryType,
16
+ PdfPageGeometry,
17
+ SegmentedPdfPage,
18
+ TextCell,
19
+ )
20
+ from lxml import etree
21
+ from PIL import Image
22
+ from PIL.Image import Image as PILImage
23
+
24
+ from docling.backend.abstract_backend import PaginatedDocumentBackend
25
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
26
+ from docling.datamodel.base_models import InputFormat
27
+
28
+ if TYPE_CHECKING:
29
+ from docling.datamodel.document import InputDocument
30
+
31
+ _log = logging.getLogger(__name__)
32
+
33
+
34
def _get_pdf_page_geometry(
    size: Size,
) -> PdfPageGeometry:
    """Build a PDF-style page geometry for a page of the given size.

    All boundary boxes (art, bleed, crop, media, trim) are set to the same
    rectangle spanning ``(0, 0)`` to ``(size.width, size.height)`` with a
    top-left coordinate origin. The angle is ``0.0`` and the boundary type is
    the crop box.
    """
    full_page = BoundingBox.from_tuple(
        (0, 0, size.width, size.height), CoordOrigin.TOPLEFT
    )
    page_rect = BoundingRectangle.from_bounding_box(full_page)

    return PdfPageGeometry(
        angle=0.0,
        rect=page_rect,
        boundary_type=PdfPageBoundaryType.CROP_BOX,
        art_bbox=full_page,
        bleed_bbox=full_page,
        crop_bbox=full_page,
        media_bbox=full_page,
        trim_bbox=full_page,
    )
52
+
53
+
54
class MetsGbsPageBackend(PdfPageBackend):
    """Page backend backed by an already parsed page and its page image."""

    def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
        self._dpage = parsed_page
        self._im = page_im
        # The page counts as valid as soon as a parsed page object was given.
        self.valid = parsed_page is not None

    def is_valid(self) -> bool:
        """Return the validity flag computed at construction time."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text of the line cells that lie mostly inside ``bbox``.

        A cell is included when more than half of its own area intersects
        ``bbox``. Included texts are separated by single spaces.
        """
        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )
        page_height = self.get_size().height
        collected = ""

        for cell in self._dpage.textline_cells:
            top_left_box = cell.rect.to_bounding_box().to_top_left_origin(
                page_height=page_height
            )
            overlap = top_left_box.scaled(scale).intersection_over_self(bbox)

            if overlap > 0.5:
                # Only separate from previously collected text, never lead
                # with a space.
                if collected:
                    collected += " "
                collected += cell.text

        return collected

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return the parsed page this backend wraps."""
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text line cells of the parsed page."""
        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield scaled top-left-origin boxes of the page's bitmap resources.

        Only boxes whose area exceeds ``AREA_THRESHOLD`` are yielded.
        """
        AREA_THRESHOLD = 0  # 32 * 32

        for bitmap in self._dpage.bitmap_resources:
            box = bitmap.rect.to_bounding_box().to_top_left_origin(
                self.get_size().height
            )

            if box.area() > AREA_THRESHOLD:
                yield box.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return the page image resized by ``scale`` and cropped to ``cropbox``.

        Without a ``cropbox`` the full page is returned. The stored image is
        expected to have exactly the page dimensions.
        """
        page_size = self.get_size()
        im_width, im_height = self._im.size
        assert page_size.width == im_width and page_size.height == im_height

        cropbox = cropbox or BoundingBox(
            l=0,
            r=page_size.width,
            t=0,
            b=page_size.height,
            coord_origin=CoordOrigin.TOPLEFT,
        )

        target_size = (
            round(page_size.width * scale),
            round(page_size.height * scale),
        )
        resized = self._im.resize(size=target_size)
        return resized.crop(cropbox.scaled(scale=scale).as_tuple())

    def get_size(self) -> Size:
        """Return the page size taken from the parsed page's dimension."""
        geometry = self._dpage.dimension
        return Size(width=geometry.width, height=geometry.height)

    def unload(self) -> None:
        """Drop the references to the page image and the parsed page."""
        for attr_name in ("_im", "_dpage"):
            if hasattr(self, attr_name):
                delattr(self, attr_name)
141
+
142
+
143
+ class _UseType(str, Enum):
144
+ IMAGE = "image"
145
+ OCR = "OCR"
146
+ COORD_OCR = "coordOCR"
147
+
148
+
149
@dataclass
class _FileInfo:
    """One file entry collected from a METS file group."""

    # Value of the file element's ID attribute.
    file_id: str
    # Value of the file element's MIMETYPE attribute.
    mimetype: str
    # xlink:href of the file's FLocat element; used as archive member name.
    path: str
    # USE type of the file group the entry belongs to.
    use: _UseType
155
+
156
+
157
@dataclass
class _PageFiles:
    """Files referenced by one page, one optional slot per USE type."""

    # File from the "image" file group, if the page references one.
    image: Optional[_FileInfo] = None
    # File from the "OCR" file group, if the page references one.
    ocr: Optional[_FileInfo] = None
    # File from the "coordOCR" file group, if the page references one.
    coordOCR: Optional[_FileInfo] = None
162
+
163
+
164
def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
    """
    Extracts the bbox from a title string like 'bbox 279 177 306 214;x_wconf 97'.

    Only the first ';'-separated entry starting with 'bbox ' is considered.
    Returns the rectangle built from its integer coordinates (top-left
    origin), or None when there is no such entry or it cannot be converted.
    """
    entries = (entry.strip() for entry in title_str.split(";"))
    bbox_entry = next((entry for entry in entries if entry.startswith("bbox ")), None)
    if bbox_entry is None:
        return None

    try:
        numbers = bbox_entry.split()[1:]
        return BoundingRectangle.from_bounding_box(
            bbox=BoundingBox.from_tuple(
                tuple(map(int, numbers)), origin=CoordOrigin.TOPLEFT
            )
        )
    except Exception:
        return None
183
+
184
+
185
+ def _extract_confidence(title_str) -> float:
186
+ """Extracts x_wconf (OCR confidence) value from title string."""
187
+ for part in title_str.split(";"):
188
+ part = part.strip()
189
+ if part.startswith("x_wconf"):
190
+ try:
191
+ return float(part.split()[1]) / 100.0
192
+ except Exception:
193
+ return 1
194
+ return 1
195
+
196
+
197
class MetsGbsDocumentBackend(PdfDocumentBackend):
    """Document backend for METS archives using the Google Books (GBS) profile.

    The input is a gzip-compressed tar archive. One of its ``.xml`` members
    must be a METS manifest (root ``<mets:mets>`` with ``PROFILE="gbs"``); the
    manifest references the per-page image and OCR files stored in the same
    archive.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the archive, locate the METS manifest and index its pages.

        Raises:
            RuntimeError: If no ``.xml`` member is a valid GBS METS manifest.
        """
        super().__init__(in_doc, path_or_stream)

        self._tar: tarfile.TarFile = (
            tarfile.open(name=self.path_or_stream, mode="r:gz")
            if isinstance(self.path_or_stream, Path)
            else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
        )
        self.root_mets: Optional[etree._Element] = None
        self.page_map: Dict[int, _PageFiles] = {}

        try:
            self.root_mets = self._find_mets_root()
            if self.root_mets is None:
                raise RuntimeError(
                    f"METS GBS backend could not load document {self.document_hash}."
                )
            self._build_page_map(self.root_mets)
        except BaseException:
            # Construction failed, so nobody will call unload(): close the
            # archive here instead of leaking the open file handle.
            self._tar.close()
            raise

    def _find_mets_root(self) -> Optional[etree._Element]:
        """Return the root of the first ``.xml`` member that is a GBS METS file."""
        for member in self._tar.getmembers():
            if not member.name.endswith(".xml"):
                continue
            file = self._tar.extractfile(member)
            if file is None:
                continue
            root = self._validate_mets_xml(file.read())
            if root is not None:
                return root
        return None

    def _build_page_map(self, root_mets: etree._Element) -> None:
        """Fill ``self.page_map`` with the files referenced by each page div."""
        ns = {
            "mets": "http://www.loc.gov/METS/",
            "xlink": "http://www.w3.org/1999/xlink",
            "xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "gbs": "http://books.google.com/gbs",
            "premis": "info:lc/xmlns/premis-v2",
            "marc": "http://www.loc.gov/MARC21/slim",
        }

        file_info_by_id: Dict[str, _FileInfo] = {}

        for filegrp in root_mets.xpath(".//mets:fileGrp", namespaces=ns):
            use_raw = filegrp.get("USE")
            try:
                use = _UseType(use_raw)
            except ValueError:
                continue  # Ignore unknown USE types

            for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
                file_id = file_elem.get("ID")
                mimetype = file_elem.get("MIMETYPE")
                flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
                href = (
                    flocat_elem.get("{http://www.w3.org/1999/xlink}href")
                    if flocat_elem is not None
                    else None
                )
                if href is None:
                    continue

                file_info_by_id[file_id] = _FileInfo(
                    file_id=file_id, mimetype=mimetype, path=href, use=use
                )

        # Name of the _PageFiles attribute that stores a file of each USE type.
        USE_TO_ATTR = {
            _UseType.IMAGE: "image",
            _UseType.OCR: "ocr",
            _UseType.COORD_OCR: "coordOCR",
        }

        for div in root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
            order_str = div.get("ORDER")
            if not order_str:
                continue
            try:
                page_no = int(order_str) - 1  # make 0-index pages
            except ValueError:
                continue

            page_files = _PageFiles()

            for fptr in div.xpath("./mets:fptr", namespaces=ns):
                file_id = fptr.get("FILEID")
                file_info = file_info_by_id.get(file_id)

                if file_info:
                    attr = USE_TO_ATTR.get(file_info.use)
                    if attr:
                        setattr(page_files, attr, file_info)

            self.page_map[page_no] = page_files

    def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
        """Parse ``xml_string`` and return its root if it is a GBS METS manifest.

        Returns None (after logging a warning) when the content is not
        well-formed XML or when the root is not ``<mets:mets PROFILE="gbs">``.
        """
        # The archive is external input: do not expand entities and do not
        # fetch anything over the network while parsing it.
        parser = etree.XMLParser(resolve_entities=False, no_network=True)
        try:
            root: etree._Element = etree.fromstring(xml_string, parser=parser)
        except etree.XMLSyntaxError as exc:
            # A broken .xml member must not abort the search for the manifest.
            _log.warning("Skipping archive member that is not well-formed XML: %s", exc)
            return None

        if (
            root.tag == "{http://www.loc.gov/METS/}mets"
            and root.get("PROFILE") == "gbs"
        ):
            return root

        _log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
        return None

    @staticmethod
    def _cells_from_spans(ocr_root: etree._Element, span_class: str) -> List[TextCell]:
        """Build text cells from all ``<span>`` elements of the given class.

        Spans whose title carries no usable bbox are skipped; the cell index
        is the span's position among all spans of that class.
        """
        cells: List[TextCell] = []
        for ix, span in enumerate(ocr_root.xpath(f"//span[@class='{span_class}']")):
            text = "".join(span.itertext()).strip()
            title = span.attrib.get("title", "")
            rect = _extract_rect(title)
            conf = _extract_confidence(title)
            if rect:
                cells.append(
                    TextCell(
                        index=ix,
                        text=text,
                        orig=text,
                        rect=rect,
                        from_ocr=True,
                        confidence=conf,
                    )
                )
        return cells

    def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
        """Load the image and coordOCR file of a page and build its parsed form.

        Args:
            page_no: 0-based page number, a key of ``self.page_map``.

        Returns:
            The segmented page (line and word cells) and the page image,
            resized to the page size and converted to RGB.

        Raises:
            KeyError: If ``page_no`` is not in ``self.page_map``.
            RuntimeError: If the page lacks an image or coordOCR entry, or the
                referenced archive member cannot be read as a file.
        """
        # TODO: use better fallbacks...
        page_files = self.page_map[page_no]
        image_info = page_files.image
        ocr_info = page_files.coordOCR
        # Explicit checks instead of asserts: asserts vanish under ``-O``.
        if image_info is None:
            raise RuntimeError(f"METS GBS page {page_no} has no image file.")
        if ocr_info is None:
            raise RuntimeError(f"METS GBS page {page_no} has no coordOCR file.")

        image_file = self._tar.extractfile(image_info.path)
        if image_file is None:
            raise RuntimeError(
                f"METS GBS page {page_no}: cannot read image {image_info.path}."
            )
        im: PILImage = Image.open(BytesIO(image_file.read()))

        ocr_file = self._tar.extractfile(ocr_info.path)
        if ocr_file is None:
            raise RuntimeError(
                f"METS GBS page {page_no}: cannot read OCR file {ocr_info.path}."
            )
        ocr_root: etree._Element = etree.fromstring(
            ocr_file.read(), parser=etree.HTMLParser()
        )

        # Default to the image size; the bbox of the ocr_page div, when
        # present, takes precedence so that cell coordinates match the page.
        size = Size(width=im.size[0], height=im.size[1])
        page_div = ocr_root.xpath("//div[@class='ocr_page']")
        if page_div:
            rect = _extract_rect(page_div[0].attrib.get("title", ""))
            if rect:
                size = Size(width=rect.width, height=rect.height)
        else:
            _log.error(f"Could not find ocr_page for page {page_no}")

        im = im.resize(size=(round(size.width), round(size.height)))
        im = im.convert("RGB")

        word_cells = self._cells_from_spans(ocr_root, "ocrx_word")
        line_cells = self._cells_from_spans(ocr_root, "ocr_line")

        page = SegmentedPdfPage(
            dimension=_get_pdf_page_geometry(size),
            textline_cells=line_cells,
            char_cells=[],
            word_cells=word_cells,
            has_textlines=True,
            has_words=True,
            has_chars=False,
        )
        return page, im

    def page_count(self) -> int:
        """Return the number of pages indexed from the METS manifest."""
        return len(self.page_map)

    def load_page(self, page_no: int) -> MetsGbsPageBackend:
        """Parse page ``page_no`` (0-based) and wrap it in a page backend."""
        # TODO: is this thread-safe?
        page, im = self._parse_page(page_no)
        return MetsGbsPageBackend(parsed_page=page, page_im=im)

    def is_valid(self) -> bool:
        """Return True when a manifest was loaded and it lists at least one page."""
        return self.root_mets is not None and self.page_count() > 0

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the input formats handled by this backend."""
        return {InputFormat.METS_GBS}

    @classmethod
    def supports_pagination(cls) -> bool:
        """This backend serves documents page by page."""
        return True

    def unload(self) -> None:
        """Run the base-class unload and close the tar archive."""
        super().unload()
        self._tar.close()