docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,188 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Optional, Union
5
+
6
+ from docling_core.types.doc import BoundingBox, CoordOrigin
7
+ from docling_core.types.doc.page import (
8
+ BoundingRectangle,
9
+ PdfPageBoundaryType,
10
+ PdfPageGeometry,
11
+ SegmentedPdfPage,
12
+ TextCell,
13
+ )
14
+ from PIL import Image
15
+
16
+ from docling.backend.abstract_backend import AbstractDocumentBackend
17
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
18
+ from docling.datamodel.backend_options import PdfBackendOptions
19
+ from docling.datamodel.base_models import InputFormat, Size
20
+ from docling.datamodel.document import InputDocument
21
+
22
+ _log = logging.getLogger(__name__)
23
+
24
+
25
class _ImagePageBackend(PdfPageBackend):
    """Page backend that serves a single in-memory PIL image as one page.

    The page carries no native text: text queries return empty results and
    the whole page is reported as a single bitmap region.
    """

    def __init__(self, image: Image.Image):
        self._image: Optional[Image.Image] = image
        # Validity is decided once, at construction time.
        self.valid: bool = self._image is not None

    def is_valid(self) -> bool:
        """Return the validity flag computed in ``__init__``."""
        return self.valid

    def _page_bbox(self, origin: CoordOrigin) -> BoundingBox:
        """Build a box spanning (0, 0) to (width, height) in *origin* coords."""
        size = self.get_size()
        return BoundingBox(
            l=0.0,
            t=0.0,
            r=float(size.width),
            b=float(size.height),
            coord_origin=origin,
        )

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return an empty string: raw images have no text without OCR."""
        return ""

    def get_segmented_page(self) -> SegmentedPdfPage:
        """Return a segmented page with page-sized geometry and no cells."""
        assert self._image is not None
        # NOTE(review): t=0 / b=height is paired with BOTTOMLEFT here, exactly
        # as in the original code -- confirm this is the intended orientation.
        page_box = self._page_bbox(CoordOrigin.BOTTOMLEFT)
        geometry = PdfPageGeometry(
            angle=0.0,
            rect=BoundingRectangle.from_bounding_box(page_box),
            boundary_type=PdfPageBoundaryType.CROP_BOX,
            art_bbox=page_box,
            bleed_bbox=page_box,
            crop_bbox=page_box,
            media_bbox=page_box,
            trim_bbox=page_box,
        )
        return SegmentedPdfPage(
            dimension=geometry,
            char_cells=[],
            word_cells=[],
            textline_cells=[],
            has_chars=False,
            has_words=False,
            has_lines=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return an empty list: there are no text cells on a raw image."""
        return []

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield one TOPLEFT box covering the full page, scaled by *scale*."""
        assert self._image is not None
        whole_page = self._page_bbox(CoordOrigin.TOPLEFT)
        yield whole_page if scale == 1 else whole_page.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return the page image, optionally cropped and then resized.

        Args:
            scale: Factor applied to the (possibly cropped) image size.
            cropbox: Region to cut out; converted to TOPLEFT coordinates
                when it is given in another origin.

        Returns:
            The stored image itself when no crop or scaling is requested,
            otherwise a new cropped and/or resized image.
        """
        assert self._image is not None
        result = self._image

        if cropbox is not None:
            # The crop is done in TOPLEFT coordinates; convert otherwise,
            # relative to the current image height.
            if cropbox.coord_origin == CoordOrigin.TOPLEFT:
                box = cropbox
            else:
                box = cropbox.to_top_left_origin(result.height)
            x0, y0, x1, y1 = box.as_tuple()
            # Clamp the rounded box to the image bounds before cropping.
            clamped = (
                max(0, round(x0)),
                max(0, round(y0)),
                min(result.width, round(x1)),
                min(result.height, round(y1)),
            )
            result = result.crop(clamped)

        if scale != 1:
            # Never let a dimension collapse below one pixel.
            target_size = (
                max(1, round(result.width * scale)),
                max(1, round(result.height * scale)),
            )
            result = result.resize(target_size)

        return result

    def get_size(self) -> Size:
        """Return the image's pixel width and height as a ``Size``."""
        assert self._image is not None
        return Size(width=self._image.width, height=self._image.height)

    def unload(self):
        """Drop the image reference so its memory can be reclaimed."""
        self._image = None
119
+
120
+
121
class ImageDocumentBackend(PdfDocumentBackend):
    """Image-native backend that bypasses pypdfium2.

    Notes:
        - Subclasses PdfDocumentBackend to satisfy pipeline type checks.
        - Intentionally avoids calling PdfDocumentBackend.__init__ to skip
          the image→PDF conversion and any pypdfium2 usage.
        - Handles multi-page TIFF by extracting frames eagerly to separate
          Image objects to keep thread-safety when pages process in parallel.
    """

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        options: PdfBackendOptions = PdfBackendOptions(),
    ):
        """Load every frame of the input image into memory as RGB.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: File path or in-memory stream holding the image.
            options: PDF backend options, stored on ``self.options``.

        Raises:
            RuntimeError: If the input format is not ``InputFormat.IMAGE``,
                or if the image cannot be opened or decoded.
        """
        # Bypass PdfDocumentBackend.__init__ to avoid image→PDF conversion
        AbstractDocumentBackend.__init__(self, in_doc, path_or_stream, options)
        self.options: PdfBackendOptions = options

        if self.input_format not in {InputFormat.IMAGE}:
            raise RuntimeError(
                f"Incompatible file format {self.input_format} was passed to ImageDocumentBackend."
            )

        # Load frames eagerly for thread-safety across pages
        self._frames: List[Image.Image] = []
        try:
            # Open inside a context manager so the underlying file handle is
            # released once all frames are extracted. Each stored frame is an
            # independent converted image, so closing the source is safe.
            # Per Pillow's file-handling rules, a caller-supplied stream is
            # not closed by this.
            with Image.open(self.path_or_stream) as img:  # type: ignore[arg-type]
                # Handle multi-frame and single-frame images
                # - multiframe formats: TIFF, GIF, ICO
                # - singleframe formats: JPEG (.jpg, .jpeg), PNG (.png), BMP,
                #   WEBP (unless animated), HEIC
                frame_count = getattr(img, "n_frames", 1)

                if frame_count > 1:
                    for i in range(frame_count):
                        img.seek(i)
                        self._frames.append(img.copy().convert("RGB"))
                else:
                    self._frames.append(img.convert("RGB"))
        except Exception as e:
            raise RuntimeError(f"Could not load image for document {self.file}") from e

    def is_valid(self) -> bool:
        """Return True when at least one frame was loaded."""
        return len(self._frames) > 0

    def page_count(self) -> int:
        """Return the number of loaded frames; each frame is one page."""
        return len(self._frames)

    def load_page(self, page_no: int) -> _ImagePageBackend:
        """Return a page backend for the zero-based frame *page_no*.

        Raises:
            IndexError: If *page_no* is outside ``[0, page_count())``.
        """
        if not (0 <= page_no < len(self._frames)):
            raise IndexError(f"Page index out of range: {page_no}")
        return _ImagePageBackend(self._frames[page_no])

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the formats handled by this backend."""
        # Only IMAGE here; PDF handling remains in PDF-oriented backends
        return {InputFormat.IMAGE}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return True: multi-frame images are exposed as multiple pages."""
        return True

    def unload(self):
        """Run the parent's unload, then drop all cached frames."""
        super().unload()
        self._frames = []
File without changes
@@ -0,0 +1,58 @@
1
+ from io import BytesIO
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ from docling_core.types.doc import DoclingDocument
6
+ from typing_extensions import override
7
+
8
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
9
+ from docling.datamodel.base_models import InputFormat
10
+ from docling.datamodel.document import InputDocument
11
+
12
+
13
class DoclingJSONBackend(DeclarativeDocumentBackend):
    """Backend that loads a DoclingDocument from its JSON serialization.

    Parsing happens once, in the constructor. The outcome (either the
    parsed document or the exception raised while parsing) is kept so
    that ``is_valid()`` can report it and ``convert()`` can return the
    document or re-raise the original error.
    """

    @override
    def __init__(
        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
    ) -> None:
        super().__init__(in_doc, path_or_stream)

        # Holds exactly one of: the successfully parsed document, or the
        # exception that convert() must raise later.
        self._doc_or_err = self._get_doc_or_err()

    @override
    def is_valid(self) -> bool:
        """Return True when the JSON parsed into a DoclingDocument."""
        return isinstance(self._doc_or_err, DoclingDocument)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not paginate."""
        return False

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the single supported format, ``JSON_DOCLING``."""
        return {InputFormat.JSON_DOCLING}

    def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
        """Parse the input and return the document, or the error raised.

        Any exception raised while reading or validating is returned
        rather than propagated.
        """
        try:
            source = self.path_or_stream
            payload: Union[str, bytes]
            if isinstance(source, BytesIO):
                payload = source.getvalue()
            elif isinstance(source, Path):
                payload = source.read_text(encoding="utf-8")
            else:
                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
            return DoclingDocument.model_validate_json(json_data=payload)
        except Exception as e:
            return e

    @override
    def convert(self) -> DoclingDocument:
        """Return the parsed document, or raise the stored parsing error."""
        outcome = self._doc_or_err
        if not isinstance(outcome, DoclingDocument):
            raise outcome
        return outcome