docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,262 @@
1
+ import logging
2
+ import warnings
3
+ from pathlib import Path
4
+ from typing import Optional, cast
5
+
6
+ import numpy as np
7
+ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
8
+
9
+ from docling.backend.abstract_backend import AbstractDocumentBackend
10
+ from docling.backend.pdf_backend import PdfDocumentBackend
11
+ from docling.datamodel.base_models import AssembledUnit, Page
12
+ from docling.datamodel.document import ConversionResult
13
+ from docling.datamodel.layout_model_specs import LayoutModelConfig
14
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
15
+ from docling.datamodel.settings import settings
16
+ from docling.models.base_ocr_model import BaseOcrModel
17
+ from docling.models.factories import (
18
+ get_layout_factory,
19
+ get_ocr_factory,
20
+ get_table_structure_factory,
21
+ )
22
+ from docling.models.stages.code_formula.code_formula_model import (
23
+ CodeFormulaModel,
24
+ CodeFormulaModelOptions,
25
+ )
26
+ from docling.models.stages.page_assemble.page_assemble_model import (
27
+ PageAssembleModel,
28
+ PageAssembleOptions,
29
+ )
30
+ from docling.models.stages.page_preprocessing.page_preprocessing_model import (
31
+ PagePreprocessingModel,
32
+ PagePreprocessingOptions,
33
+ )
34
+ from docling.models.stages.reading_order.readingorder_model import (
35
+ ReadingOrderModel,
36
+ ReadingOrderOptions,
37
+ )
38
+ from docling.pipeline.base_pipeline import PaginatedPipeline
39
+ from docling.utils.model_downloader import download_models
40
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
41
+
42
+ _log = logging.getLogger(__name__)
43
+
44
+
45
class LegacyStandardPdfPipeline(PaginatedPipeline):
    """PDF conversion pipeline built from per-page models plus enrichment.

    The page-level ``build_pipe`` runs, in order: page preprocessing, OCR,
    layout, table structure and page assembly. The ``enrichment_pipe`` is
    extended with a code/formula enrichment model placed in front of the
    models already registered by the base class.
    """

    def __init__(self, pipeline_options: PdfPipelineOptions):
        """Instantiate all models according to ``pipeline_options``.

        Args:
            pipeline_options: Options selecting and configuring the OCR,
                layout, table-structure and enrichment models, and the
                image generation flags.
        """
        super().__init__(pipeline_options)
        self.pipeline_options: PdfPipelineOptions

        # Page images must be kept if any kind of image output is requested.
        with warnings.catch_warnings():  # deprecated generate_table_images
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            self.keep_images = (
                self.pipeline_options.generate_page_images
                or self.pipeline_options.generate_picture_images
                or self.pipeline_options.generate_table_images
            )

        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())

        ocr_model = self.get_ocr_model(artifacts_path=self.artifacts_path)

        layout_factory = get_layout_factory(
            allow_external_plugins=self.pipeline_options.allow_external_plugins
        )
        layout_model = layout_factory.create_instance(
            options=pipeline_options.layout_options,
            artifacts_path=self.artifacts_path,
            accelerator_options=pipeline_options.accelerator_options,
        )
        table_factory = get_table_structure_factory(
            allow_external_plugins=self.pipeline_options.allow_external_plugins
        )
        table_model = table_factory.create_instance(
            options=pipeline_options.table_structure_options,
            enabled=pipeline_options.do_table_structure,
            artifacts_path=self.artifacts_path,
            accelerator_options=pipeline_options.accelerator_options,
        )

        self.build_pipe = [
            # Pre-processing
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale,
                )
            ),
            # OCR
            ocr_model,
            # Layout model
            layout_model,
            # Table structure model
            table_model,
            # Page assemble
            PageAssembleModel(options=PageAssembleOptions()),
        ]

        self.enrichment_pipe = [
            # Code Formula Enrichment Model
            CodeFormulaModel(
                enabled=pipeline_options.do_code_enrichment
                or pipeline_options.do_formula_enrichment,
                artifacts_path=self.artifacts_path,
                options=CodeFormulaModelOptions(
                    do_code_enrichment=pipeline_options.do_code_enrichment,
                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
                ),
                accelerator_options=pipeline_options.accelerator_options,
            ),
            # Keep whatever enrichment models the base class registered.
            *self.enrichment_pipe,
        ]

        # Any enabled enrichment step requires the backend to stay available.
        if (
            self.pipeline_options.do_formula_enrichment
            or self.pipeline_options.do_code_enrichment
            or self.pipeline_options.do_picture_classification
            or self.pipeline_options.do_picture_description
        ):
            self.keep_backend = True

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        """Deprecated wrapper around ``download_models``.

        Emits a ``DeprecationWarning`` and forwards to
        ``docling.utils.model_downloader.download_models`` with
        ``progress=False``.

        Args:
            local_dir: Passed through as ``output_dir``.
            force: Passed through as ``force``.

        Returns:
            The path returned by ``download_models``.
        """
        warnings.warn(
            "The usage of LegacyStandardPdfPipeline.download_models_hf() is deprecated "
            "use instead the utility `docling-tools models download`, or "
            # Point at the module/function this file actually imports and calls.
            "the upstream method docling.utils.model_downloader.download_models()",
            DeprecationWarning,
            stacklevel=3,
        )

        output_dir = download_models(output_dir=local_dir, force=force, progress=False)
        return output_dir

    def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
        """Create the OCR model selected by ``pipeline_options.ocr_options``.

        Args:
            artifacts_path: Forwarded to the OCR factory.

        Returns:
            The OCR model instance created by the factory; it is enabled only
            when ``pipeline_options.do_ocr`` is set.
        """
        factory = get_ocr_factory(
            allow_external_plugins=self.pipeline_options.allow_external_plugins
        )
        return factory.create_instance(
            options=self.pipeline_options.ocr_options,
            enabled=self.pipeline_options.do_ocr,
            artifacts_path=artifacts_path,
            accelerator_options=self.pipeline_options.accelerator_options,
        )

    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        """Attach the page backend to ``page`` and record its size.

        The backend is addressed with ``page.page_no - 1``, i.e. the backend
        index is one less than ``Page.page_no``.
        """
        with TimeRecorder(conv_res, "page_init"):
            page._backend = conv_res.input._backend.load_page(page.page_no - 1)  # type: ignore
            if page._backend is not None and page._backend.is_valid():
                page.size = page._backend.get_size()

        return page

    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Merge per-page results into the document and finalize outputs.

        Steps: concatenate the assembled units of all pages, run the reading
        order model to produce ``conv_res.document``, optionally attach page
        images and picture/table crops, and aggregate per-page confidence
        scores into document-level scores.
        """
        all_elements = []
        all_headers = []
        all_body = []

        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
            for p in conv_res.pages:
                if p.assembled is not None:
                    all_body.extend(p.assembled.body)
                    all_headers.extend(p.assembled.headers)
                    all_elements.extend(p.assembled.elements)

            conv_res.assembled = AssembledUnit(
                elements=all_elements, headers=all_headers, body=all_body
            )

            conv_res.document = self.reading_order_model(conv_res)

            # Generate page images in the output
            if self.pipeline_options.generate_page_images:
                for page in conv_res.pages:
                    assert page.image is not None
                    page_no = page.page_no
                    conv_res.document.pages[page_no].image = ImageRef.from_pil(
                        page.image, dpi=int(72 * self.pipeline_options.images_scale)
                    )

            # Generate images of the requested element types
            with warnings.catch_warnings():  # deprecated generate_table_images
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                if (
                    self.pipeline_options.generate_picture_images
                    or self.pipeline_options.generate_table_images
                ):
                    scale = self.pipeline_options.images_scale
                    for element, _level in conv_res.document.iterate_items():
                        if not isinstance(element, DocItem) or not element.prov:
                            continue
                        if (
                            isinstance(element, PictureItem)
                            and self.pipeline_options.generate_picture_images
                        ) or (
                            isinstance(element, TableItem)
                            and self.pipeline_options.generate_table_images
                        ):
                            # Page.page_no is used directly as the key into
                            # conv_res.document.pages above, and only the
                            # backend index is shifted by one in
                            # initialize_page. The provenance page number must
                            # therefore be matched without any offset;
                            # subtracting 1 would pick the previous page (or
                            # no page at all for the first one).
                            prov_page_no = element.prov[0].page_no
                            page = next(
                                (
                                    p
                                    for p in conv_res.pages
                                    if p.page_no == prov_page_no
                                ),
                                cast("Page", None),
                            )
                            assert page is not None
                            assert page.size is not None
                            assert page.image is not None

                            # Scale the bbox to image pixels and convert it to
                            # a top-left origin before cropping.
                            crop_bbox = (
                                element.prov[0]
                                .bbox.scaled(scale=scale)
                                .to_top_left_origin(
                                    page_height=page.size.height * scale
                                )
                            )

                            cropped_im = page.image.crop(crop_bbox.as_tuple())
                            element.image = ImageRef.from_pil(
                                cropped_im, dpi=int(72 * scale)
                            )

            # Aggregate confidence values for document:
            if conv_res.pages:
                page_scores = list(conv_res.confidence.pages.values())
                with warnings.catch_warnings():
                    # NaN-aware reductions warn on empty / all-NaN input;
                    # those warnings are silenced here.
                    warnings.filterwarnings(
                        "ignore",
                        category=RuntimeWarning,
                        message="Mean of empty slice|All-NaN slice encountered",
                    )
                    conv_res.confidence.layout_score = float(
                        np.nanmean([c.layout_score for c in page_scores])
                    )
                    conv_res.confidence.parse_score = float(
                        np.nanquantile(
                            [c.parse_score for c in page_scores],
                            q=0.1,  # parse score should relate to worst 10% of pages.
                        )
                    )
                    conv_res.confidence.table_score = float(
                        np.nanmean([c.table_score for c in page_scores])
                    )
                    conv_res.confidence.ocr_score = float(
                        np.nanmean([c.ocr_score for c in page_scores])
                    )

        return conv_res

    @classmethod
    def get_default_options(cls) -> PdfPipelineOptions:
        """Return a default-constructed ``PdfPipelineOptions``."""
        return PdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Return True when ``backend`` is a ``PdfDocumentBackend``."""
        return isinstance(backend, PdfDocumentBackend)
@@ -0,0 +1,55 @@
1
+ import logging
2
+
3
+ from docling.backend.abstract_backend import (
4
+ AbstractDocumentBackend,
5
+ DeclarativeDocumentBackend,
6
+ )
7
+ from docling.datamodel.base_models import ConversionStatus
8
+ from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.pipeline_options import ConvertPipelineOptions
10
+ from docling.pipeline.base_pipeline import ConvertPipeline
11
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
12
+
13
+ _log = logging.getLogger(__name__)
14
+
15
+
16
class SimplePipeline(ConvertPipeline):
    """Pipeline for backends that hand back a finished document.

    No page-level processing happens here: the input backend must be a
    ``DeclarativeDocumentBackend`` and its ``convert()`` result is used
    directly as the conversion result's document.
    """

    def __init__(self, pipeline_options: ConvertPipelineOptions):
        """Forward ``pipeline_options`` unchanged to the base pipeline."""
        super().__init__(pipeline_options)

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Populate ``conv_res.document`` from the declarative backend.

        Raises:
            RuntimeError: If the input's backend is not a
                ``DeclarativeDocumentBackend``.
        """
        backend = conv_res.input._backend

        if isinstance(backend, DeclarativeDocumentBackend):
            # The backend produces the document in one step, so the only
            # work to time is the convert() call itself.
            with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
                conv_res.document = backend.convert()
            return conv_res

        raise RuntimeError(
            f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
            f"Can not convert this with simple pipeline. "
            f"Please check your format configuration on DocumentConverter."
        )

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        """Report success unconditionally.

        Per the original note, this runs only when the earlier steps did not
        raise, and there is nothing further to evaluate.
        """
        return ConversionStatus.SUCCESS

    @classmethod
    def get_default_options(cls) -> ConvertPipelineOptions:
        """Return a default-constructed ``ConvertPipelineOptions``."""
        return ConvertPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Return True when ``backend`` is a ``DeclarativeDocumentBackend``."""
        return isinstance(backend, DeclarativeDocumentBackend)