docling 2.28.0__tar.gz → 2.28.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.28.0 → docling-2.28.2}/PKG-INFO +1 -1
- {docling-2.28.0 → docling-2.28.2}/docling/backend/html_backend.py +3 -3
- {docling-2.28.0 → docling-2.28.2}/docling/backend/md_backend.py +15 -5
- {docling-2.28.0 → docling-2.28.2}/docling/backend/msword_backend.py +51 -12
- {docling-2.28.0 → docling-2.28.2}/docling/document_converter.py +29 -17
- {docling-2.28.0 → docling-2.28.2}/docling/models/page_preprocessing_model.py +7 -1
- {docling-2.28.0 → docling-2.28.2}/pyproject.toml +1 -1
- {docling-2.28.0 → docling-2.28.2}/LICENSE +0 -0
- {docling-2.28.0 → docling-2.28.2}/README.md +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/abstract_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/csv_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/docx/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/json/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/pdf_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/xml/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/chunking/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/cli/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/cli/main.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/cli/models.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/cli/tools.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/datamodel/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/datamodel/base_models.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/datamodel/document.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/datamodel/settings.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/exceptions.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/base_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/base_ocr_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/code_formula_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/easyocr_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/factories/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/factories/base_factory.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/hf_mlx_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/hf_vlm_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/layout_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/page_assemble_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/plugins/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/plugins/defaults.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/readingorder_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/table_structure_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/pipeline/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/py.typed +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/__init__.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/export.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/glm_utils.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/locks.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/model_downloader.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/ocr_utils.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/profiling.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/utils.py +0 -0
- {docling-2.28.0 → docling-2.28.2}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.28.0
|
3
|
+
Version: 2.28.2
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -206,9 +206,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
206
206
|
hlevel = int(element.name.replace("h", ""))
|
207
207
|
text = element.text.strip()
|
208
208
|
|
209
|
-
|
210
|
-
self.content_layer = ContentLayer.BODY
|
209
|
+
self.content_layer = ContentLayer.BODY
|
211
210
|
|
211
|
+
if hlevel == 1:
|
212
212
|
for key in self.parents.keys():
|
213
213
|
self.parents[key] = None
|
214
214
|
|
@@ -243,7 +243,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
243
243
|
self.parents[hlevel] = doc.add_heading(
|
244
244
|
parent=self.parents[hlevel - 1],
|
245
245
|
text=text,
|
246
|
-
level=hlevel,
|
246
|
+
level=hlevel - 1,
|
247
247
|
content_layer=self.content_layer,
|
248
248
|
)
|
249
249
|
|
@@ -212,9 +212,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
212
212
|
traverse(element)
|
213
213
|
snippet_text = "".join(strings)
|
214
214
|
if len(snippet_text) > 0:
|
215
|
-
|
216
|
-
|
217
|
-
|
215
|
+
if doc_label == DocItemLabel.SECTION_HEADER:
|
216
|
+
parent_item = doc.add_heading(
|
217
|
+
text=snippet_text,
|
218
|
+
level=element.level - 1,
|
219
|
+
parent=parent_item,
|
220
|
+
)
|
221
|
+
else:
|
222
|
+
parent_item = doc.add_text(
|
223
|
+
label=doc_label, parent=parent_item, text=snippet_text
|
224
|
+
)
|
218
225
|
|
219
226
|
elif isinstance(element, marko.block.List):
|
220
227
|
has_non_empty_list_items = False
|
@@ -232,12 +239,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
232
239
|
label=label, name=f"list", parent=parent_item
|
233
240
|
)
|
234
241
|
|
235
|
-
elif isinstance(element, marko.block.ListItem):
|
242
|
+
elif (
|
243
|
+
isinstance(element, marko.block.ListItem)
|
244
|
+
and len(element.children) > 0
|
245
|
+
and isinstance((first_child := element.children[0]), marko.block.Paragraph)
|
246
|
+
):
|
236
247
|
self._close_table(doc)
|
237
248
|
self._process_inline_text(parent_item, doc)
|
238
249
|
_log.debug(" - List item")
|
239
250
|
|
240
|
-
first_child = element.children[0]
|
241
251
|
snippet_text = str(first_child.children[0].children) # type: ignore
|
242
252
|
is_numbered = False
|
243
253
|
if (
|
@@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
53
53
|
self.max_levels: int = 10
|
54
54
|
self.level_at_new_list: Optional[int] = None
|
55
55
|
self.parents: dict[int, Optional[NodeItem]] = {}
|
56
|
+
self.numbered_headers: dict[int, int] = {}
|
56
57
|
for i in range(-1, self.max_levels):
|
57
58
|
self.parents[i] = None
|
58
59
|
|
@@ -346,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
346
347
|
parent=None, label=DocItemLabel.TITLE, text=text
|
347
348
|
)
|
348
349
|
elif "Heading" in p_style_id:
|
349
|
-
|
350
|
+
style_element = getattr(paragraph.style, "element", None)
|
351
|
+
if style_element:
|
352
|
+
is_numbered_style = (
|
353
|
+
"<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
|
354
|
+
)
|
355
|
+
else:
|
356
|
+
is_numbered_style = False
|
357
|
+
self.add_header(doc, p_level, text, is_numbered_style)
|
350
358
|
|
351
359
|
elif len(equations) > 0:
|
352
360
|
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
@@ -415,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
415
423
|
return
|
416
424
|
|
417
425
|
def add_header(
|
418
|
-
self,
|
426
|
+
self,
|
427
|
+
doc: DoclingDocument,
|
428
|
+
curr_level: Optional[int],
|
429
|
+
text: str,
|
430
|
+
is_numbered_style: bool = False,
|
419
431
|
) -> None:
|
420
432
|
level = self.get_level()
|
421
433
|
if isinstance(curr_level, int):
|
@@ -433,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
433
445
|
if key >= curr_level:
|
434
446
|
self.parents[key] = None
|
435
447
|
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
level=curr_level,
|
440
|
-
)
|
448
|
+
current_level = curr_level
|
449
|
+
parent_level = curr_level - 1
|
450
|
+
add_level = curr_level
|
441
451
|
else:
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
452
|
+
current_level = self.level
|
453
|
+
parent_level = self.level - 1
|
454
|
+
add_level = 1
|
455
|
+
|
456
|
+
if is_numbered_style:
|
457
|
+
if add_level in self.numbered_headers:
|
458
|
+
self.numbered_headers[add_level] += 1
|
459
|
+
else:
|
460
|
+
self.numbered_headers[add_level] = 1
|
461
|
+
text = f"{self.numbered_headers[add_level]} {text}"
|
462
|
+
|
463
|
+
# Reset deeper levels
|
464
|
+
next_level = add_level + 1
|
465
|
+
while next_level in self.numbered_headers:
|
466
|
+
self.numbered_headers[next_level] = 0
|
467
|
+
next_level += 1
|
468
|
+
|
469
|
+
# Scan upper levels
|
470
|
+
previous_level = add_level - 1
|
471
|
+
while previous_level in self.numbered_headers:
|
472
|
+
# MSWord convention: no empty sublevels
|
473
|
+
# I.e., sub-sub section (2.0.1) without a sub-section (2.1)
|
474
|
+
# is processed as 2.1.1
|
475
|
+
if self.numbered_headers[previous_level] == 0:
|
476
|
+
self.numbered_headers[previous_level] += 1
|
477
|
+
|
478
|
+
text = f"{self.numbered_headers[previous_level]}.{text}"
|
479
|
+
previous_level -= 1
|
480
|
+
|
481
|
+
self.parents[current_level] = doc.add_heading(
|
482
|
+
parent=self.parents[parent_level],
|
483
|
+
text=text,
|
484
|
+
level=add_level,
|
485
|
+
)
|
447
486
|
return
|
448
487
|
|
449
488
|
def add_listitem(
|
@@ -1,3 +1,4 @@
|
|
1
|
+
import hashlib
|
1
2
|
import logging
|
2
3
|
import math
|
3
4
|
import sys
|
@@ -181,7 +182,14 @@ class DocumentConverter:
|
|
181
182
|
)
|
182
183
|
for format in self.allowed_formats
|
183
184
|
}
|
184
|
-
self.initialized_pipelines: Dict[
|
185
|
+
self.initialized_pipelines: Dict[
|
186
|
+
Tuple[Type[BasePipeline], str], BasePipeline
|
187
|
+
] = {}
|
188
|
+
|
189
|
+
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
|
190
|
+
"""Generate a hash of pipeline options to use as part of the cache key."""
|
191
|
+
options_str = str(pipeline_options.model_dump())
|
192
|
+
return hashlib.md5(options_str.encode("utf-8")).hexdigest()
|
185
193
|
|
186
194
|
def initialize_pipeline(self, format: InputFormat):
|
187
195
|
"""Initialize the conversion pipeline for the selected format."""
|
@@ -279,31 +287,36 @@ class DocumentConverter:
|
|
279
287
|
yield item
|
280
288
|
|
281
289
|
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
290
|
+
"""Retrieve or initialize a pipeline, reusing instances based on class and options."""
|
282
291
|
fopt = self.format_to_options.get(doc_format)
|
283
292
|
|
284
|
-
if fopt is None:
|
293
|
+
if fopt is None or fopt.pipeline_options is None:
|
285
294
|
return None
|
286
|
-
else:
|
287
|
-
pipeline_class = fopt.pipeline_cls
|
288
|
-
pipeline_options = fopt.pipeline_options
|
289
295
|
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
296
|
+
pipeline_class = fopt.pipeline_cls
|
297
|
+
pipeline_options = fopt.pipeline_options
|
298
|
+
options_hash = self._get_pipeline_options_hash(pipeline_options)
|
299
|
+
|
300
|
+
# Use a composite key to cache pipelines
|
301
|
+
cache_key = (pipeline_class, options_hash)
|
302
|
+
|
303
|
+
if cache_key not in self.initialized_pipelines:
|
304
|
+
_log.info(
|
305
|
+
f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
306
|
+
)
|
307
|
+
self.initialized_pipelines[cache_key] = pipeline_class(
|
299
308
|
pipeline_options=pipeline_options
|
300
309
|
)
|
301
|
-
|
310
|
+
else:
|
311
|
+
_log.debug(
|
312
|
+
f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
313
|
+
)
|
314
|
+
|
315
|
+
return self.initialized_pipelines[cache_key]
|
302
316
|
|
303
317
|
def _process_document(
|
304
318
|
self, in_doc: InputDocument, raises_on_error: bool
|
305
319
|
) -> ConversionResult:
|
306
|
-
|
307
320
|
valid = (
|
308
321
|
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
309
322
|
)
|
@@ -345,7 +358,6 @@ class DocumentConverter:
|
|
345
358
|
else:
|
346
359
|
if raises_on_error:
|
347
360
|
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
348
|
-
|
349
361
|
else:
|
350
362
|
# invalid doc or not of desired format
|
351
363
|
conv_res = ConversionResult(
|
@@ -63,7 +63,13 @@ class PagePreprocessingModel(BasePageModel):
|
|
63
63
|
def draw_text_boxes(image, cells, show: bool = False):
|
64
64
|
draw = ImageDraw.Draw(image)
|
65
65
|
for c in cells:
|
66
|
-
x0, y0, x1, y1 =
|
66
|
+
x0, y0, x1, y1 = (
|
67
|
+
c.to_bounding_box().l,
|
68
|
+
c.to_bounding_box().t,
|
69
|
+
c.to_bounding_box().r,
|
70
|
+
c.to_bounding_box().b,
|
71
|
+
)
|
72
|
+
|
67
73
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
68
74
|
if show:
|
69
75
|
image.show()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.28.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.28.2" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = [
|
6
6
|
"Christoph Auer <cau@zurich.ibm.com>",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|