docling 2.32.0__tar.gz → 2.33.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.32.0 → docling-2.33.0}/PKG-INFO +2 -2
- {docling-2.32.0 → docling-2.33.0}/docling/backend/msword_backend.py +269 -12
- {docling-2.32.0 → docling-2.33.0}/docling/backend/pypdfium2_backend.py +6 -1
- {docling-2.32.0 → docling-2.33.0}/docling/datamodel/document.py +8 -1
- {docling-2.32.0 → docling-2.33.0}/docling/pipeline/vlm_pipeline.py +19 -21
- {docling-2.32.0 → docling-2.33.0}/pyproject.toml +2 -2
- {docling-2.32.0 → docling-2.33.0}/LICENSE +0 -0
- {docling-2.32.0 → docling-2.33.0}/README.md +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/html_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/md_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/chunking/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/cli/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/cli/main.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/cli/models.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/cli/tools.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/datamodel/settings.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/document_converter.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/exceptions.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/base_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/hf_mlx_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/hf_vlm_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/layout_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/py.typed +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/__init__.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/export.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/locks.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/profiling.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/utils.py +0 -0
- {docling-2.32.0 → docling-2.33.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.32.0
|
3
|
+
Version: 2.33.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
|
|
29
29
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
30
30
|
Requires-Dist: certifi (>=2024.7.4)
|
31
31
|
Requires-Dist: click (<8.2.0)
|
32
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
32
|
+
Requires-Dist: docling-core[chunking] (>=2.29.0,<3.0.0)
|
33
33
|
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
34
34
|
Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
|
35
35
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -2,7 +2,7 @@ import logging
|
|
2
2
|
import re
|
3
3
|
from io import BytesIO
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import Any, Optional, Union
|
5
|
+
from typing import Any, List, Optional, Union
|
6
6
|
|
7
7
|
from docling_core.types.doc import (
|
8
8
|
DocItemLabel,
|
@@ -24,7 +24,6 @@ from docx.text.hyperlink import Hyperlink
|
|
24
24
|
from docx.text.paragraph import Paragraph
|
25
25
|
from docx.text.run import Run
|
26
26
|
from lxml import etree
|
27
|
-
from lxml.etree import XPath
|
28
27
|
from PIL import Image, UnidentifiedImageError
|
29
28
|
from pydantic import AnyUrl
|
30
29
|
from typing_extensions import override
|
@@ -59,6 +58,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
59
58
|
self.parents: dict[int, Optional[NodeItem]] = {}
|
60
59
|
self.numbered_headers: dict[int, int] = {}
|
61
60
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
61
|
+
# Track processed textbox elements to avoid duplication
|
62
|
+
self.processed_textbox_elements: List[int] = []
|
63
|
+
# Track content hash of processed paragraphs to avoid duplicate content
|
64
|
+
self.processed_paragraph_content: List[str] = []
|
65
|
+
|
62
66
|
for i in range(-1, self.max_levels):
|
63
67
|
self.parents[i] = None
|
64
68
|
|
@@ -175,10 +179,74 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
175
179
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
176
180
|
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
177
181
|
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
182
|
+
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
|
183
|
+
"mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
|
184
|
+
"v": "urn:schemas-microsoft-com:vml",
|
185
|
+
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
|
186
|
+
"w10": "urn:schemas-microsoft-com:office:word",
|
187
|
+
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
|
178
188
|
}
|
179
|
-
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
|
189
|
+
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
|
180
190
|
drawing_blip = xpath_expr(element)
|
181
191
|
|
192
|
+
# Check for textbox content - check multiple textbox formats
|
193
|
+
# Only process if the element hasn't been processed before
|
194
|
+
element_id = id(element)
|
195
|
+
if element_id not in self.processed_textbox_elements:
|
196
|
+
# Modern Word textboxes
|
197
|
+
txbx_xpath = etree.XPath(
|
198
|
+
".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
|
199
|
+
)
|
200
|
+
textbox_elements = txbx_xpath(element)
|
201
|
+
|
202
|
+
# No modern textboxes found, check for alternate/legacy textbox formats
|
203
|
+
if not textbox_elements and tag_name in ["drawing", "pict"]:
|
204
|
+
# Additional checks for textboxes in DrawingML and VML formats
|
205
|
+
alt_txbx_xpath = etree.XPath(
|
206
|
+
".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
|
207
|
+
namespaces=namespaces,
|
208
|
+
)
|
209
|
+
textbox_elements = alt_txbx_xpath(element)
|
210
|
+
|
211
|
+
# Check for shape text that's not in a standard textbox
|
212
|
+
if not textbox_elements:
|
213
|
+
shape_text_xpath = etree.XPath(
|
214
|
+
".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
|
215
|
+
namespaces=namespaces,
|
216
|
+
)
|
217
|
+
shape_text_elements = shape_text_xpath(element)
|
218
|
+
if shape_text_elements:
|
219
|
+
# Create custom text elements from shape text
|
220
|
+
text_content = " ".join(
|
221
|
+
[t.text for t in shape_text_elements if t.text]
|
222
|
+
)
|
223
|
+
if text_content.strip():
|
224
|
+
_log.debug(f"Found shape text: {text_content[:50]}...")
|
225
|
+
# Create a paragraph-like element to process with standard handler
|
226
|
+
level = self._get_level()
|
227
|
+
shape_group = doc.add_group(
|
228
|
+
label=GroupLabel.SECTION,
|
229
|
+
parent=self.parents[level - 1],
|
230
|
+
name="shape-text",
|
231
|
+
)
|
232
|
+
doc.add_text(
|
233
|
+
label=DocItemLabel.PARAGRAPH,
|
234
|
+
parent=shape_group,
|
235
|
+
text=text_content,
|
236
|
+
)
|
237
|
+
|
238
|
+
if textbox_elements:
|
239
|
+
# Mark the parent element as processed
|
240
|
+
self.processed_textbox_elements.append(element_id)
|
241
|
+
# Also mark all found textbox elements as processed
|
242
|
+
for tb_element in textbox_elements:
|
243
|
+
self.processed_textbox_elements.append(id(tb_element))
|
244
|
+
|
245
|
+
_log.debug(
|
246
|
+
f"Found textbox content with {len(textbox_elements)} elements"
|
247
|
+
)
|
248
|
+
self._handle_textbox_content(textbox_elements, docx_obj, doc)
|
249
|
+
|
182
250
|
# Check for Tables
|
183
251
|
if element.tag.endswith("tbl"):
|
184
252
|
try:
|
@@ -291,15 +359,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
291
359
|
|
292
360
|
@classmethod
|
293
361
|
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
362
|
+
# The .bold and .italic properties are booleans, but .underline can be an enum
|
363
|
+
# like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
|
364
|
+
has_bold = run.bold or False
|
365
|
+
has_italic = run.italic or False
|
366
|
+
# Convert any non-None underline value to True
|
367
|
+
has_underline = bool(run.underline is not None and run.underline)
|
368
|
+
|
369
|
+
return Formatting(
|
370
|
+
bold=has_bold,
|
371
|
+
italic=has_italic,
|
372
|
+
underline=has_underline,
|
303
373
|
)
|
304
374
|
|
305
375
|
def _get_paragraph_elements(self, paragraph: Paragraph):
|
@@ -355,6 +425,182 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
355
425
|
|
356
426
|
return paragraph_elements
|
357
427
|
|
428
|
+
def _get_paragraph_position(self, paragraph_element):
|
429
|
+
"""Extract vertical position information from paragraph element."""
|
430
|
+
# First try to directly get the index from w:p element that has an order-related attribute
|
431
|
+
if (
|
432
|
+
hasattr(paragraph_element, "getparent")
|
433
|
+
and paragraph_element.getparent() is not None
|
434
|
+
):
|
435
|
+
parent = paragraph_element.getparent()
|
436
|
+
# Get all paragraph siblings
|
437
|
+
paragraphs = [
|
438
|
+
p for p in parent.getchildren() if etree.QName(p).localname == "p"
|
439
|
+
]
|
440
|
+
# Find index of current paragraph within its siblings
|
441
|
+
try:
|
442
|
+
paragraph_index = paragraphs.index(paragraph_element)
|
443
|
+
return paragraph_index # Use index as position for consistent ordering
|
444
|
+
except ValueError:
|
445
|
+
pass
|
446
|
+
|
447
|
+
# Look for position hints in element attributes and ancestor elements
|
448
|
+
for elem in (*[paragraph_element], *paragraph_element.iterancestors()):
|
449
|
+
# Check for direct position attributes
|
450
|
+
for attr_name in ["y", "top", "positionY", "y-position", "position"]:
|
451
|
+
value = elem.get(attr_name)
|
452
|
+
if value:
|
453
|
+
try:
|
454
|
+
# Remove any non-numeric characters (like 'pt', 'px', etc.)
|
455
|
+
clean_value = re.sub(r"[^0-9.]", "", value)
|
456
|
+
if clean_value:
|
457
|
+
return float(clean_value)
|
458
|
+
except (ValueError, TypeError):
|
459
|
+
pass
|
460
|
+
|
461
|
+
# Check for position in transform attribute
|
462
|
+
transform = elem.get("transform")
|
463
|
+
if transform:
|
464
|
+
# Extract translation component from transform matrix
|
465
|
+
match = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
|
466
|
+
if match:
|
467
|
+
try:
|
468
|
+
return float(match.group(1))
|
469
|
+
except ValueError:
|
470
|
+
pass
|
471
|
+
|
472
|
+
# Check for anchors or relative position indicators in Word format
|
473
|
+
# 'dist' attributes can indicate relative positioning
|
474
|
+
for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
|
475
|
+
if elem.get(attr_name) is not None:
|
476
|
+
return elem.sourceline # Use the XML source line number as fallback
|
477
|
+
|
478
|
+
# For VML shapes, look for specific attributes
|
479
|
+
for ns_uri in paragraph_element.nsmap.values():
|
480
|
+
if "vml" in ns_uri:
|
481
|
+
# Try to extract position from style attribute
|
482
|
+
style = paragraph_element.get("style")
|
483
|
+
if style:
|
484
|
+
match = re.search(r"top:([0-9.]+)pt", style)
|
485
|
+
if match:
|
486
|
+
try:
|
487
|
+
return float(match.group(1))
|
488
|
+
except ValueError:
|
489
|
+
pass
|
490
|
+
|
491
|
+
# If no better position indicator found, use XML source line number as proxy for order
|
492
|
+
return (
|
493
|
+
paragraph_element.sourceline
|
494
|
+
if hasattr(paragraph_element, "sourceline")
|
495
|
+
else None
|
496
|
+
)
|
497
|
+
|
498
|
+
def _collect_textbox_paragraphs(self, textbox_elements):
|
499
|
+
"""Collect and organize paragraphs from textbox elements."""
|
500
|
+
processed_paragraphs = []
|
501
|
+
container_paragraphs = {}
|
502
|
+
|
503
|
+
for element in textbox_elements:
|
504
|
+
element_id = id(element)
|
505
|
+
# Skip if we've already processed this exact element
|
506
|
+
if element_id in processed_paragraphs:
|
507
|
+
continue
|
508
|
+
|
509
|
+
tag_name = etree.QName(element).localname
|
510
|
+
processed_paragraphs.append(element_id)
|
511
|
+
|
512
|
+
# Handle paragraphs directly found (VML textboxes)
|
513
|
+
if tag_name == "p":
|
514
|
+
# Find the containing textbox or shape element
|
515
|
+
container_id = None
|
516
|
+
for ancestor in element.iterancestors():
|
517
|
+
if any(ns in ancestor.tag for ns in ["textbox", "shape", "txbx"]):
|
518
|
+
container_id = id(ancestor)
|
519
|
+
break
|
520
|
+
|
521
|
+
if container_id not in container_paragraphs:
|
522
|
+
container_paragraphs[container_id] = []
|
523
|
+
container_paragraphs[container_id].append(
|
524
|
+
(element, self._get_paragraph_position(element))
|
525
|
+
)
|
526
|
+
|
527
|
+
# Handle txbxContent elements (Word DrawingML textboxes)
|
528
|
+
elif tag_name == "txbxContent":
|
529
|
+
paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
|
530
|
+
container_id = id(element)
|
531
|
+
if container_id not in container_paragraphs:
|
532
|
+
container_paragraphs[container_id] = []
|
533
|
+
|
534
|
+
for p in paragraphs:
|
535
|
+
p_id = id(p)
|
536
|
+
if p_id not in processed_paragraphs:
|
537
|
+
processed_paragraphs.append(p_id)
|
538
|
+
container_paragraphs[container_id].append(
|
539
|
+
(p, self._get_paragraph_position(p))
|
540
|
+
)
|
541
|
+
else:
|
542
|
+
# Try to extract any paragraphs from unknown elements
|
543
|
+
paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
|
544
|
+
container_id = id(element)
|
545
|
+
if container_id not in container_paragraphs:
|
546
|
+
container_paragraphs[container_id] = []
|
547
|
+
|
548
|
+
for p in paragraphs:
|
549
|
+
p_id = id(p)
|
550
|
+
if p_id not in processed_paragraphs:
|
551
|
+
processed_paragraphs.append(p_id)
|
552
|
+
container_paragraphs[container_id].append(
|
553
|
+
(p, self._get_paragraph_position(p))
|
554
|
+
)
|
555
|
+
|
556
|
+
return container_paragraphs
|
557
|
+
|
558
|
+
def _handle_textbox_content(
|
559
|
+
self,
|
560
|
+
textbox_elements: list,
|
561
|
+
docx_obj: DocxDocument,
|
562
|
+
doc: DoclingDocument,
|
563
|
+
) -> None:
|
564
|
+
"""Process textbox content and add it to the document structure."""
|
565
|
+
level = self._get_level()
|
566
|
+
# Create a textbox group to contain all text from the textbox
|
567
|
+
textbox_group = doc.add_group(
|
568
|
+
label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
|
569
|
+
)
|
570
|
+
|
571
|
+
# Set this as the current parent to ensure textbox content
|
572
|
+
# is properly nested in document structure
|
573
|
+
original_parent = self.parents[level]
|
574
|
+
self.parents[level] = textbox_group
|
575
|
+
|
576
|
+
# Collect and organize paragraphs
|
577
|
+
container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)
|
578
|
+
|
579
|
+
# Process all paragraphs
|
580
|
+
all_paragraphs = []
|
581
|
+
|
582
|
+
# Sort paragraphs within each container, then process containers
|
583
|
+
for container_id, paragraphs in container_paragraphs.items():
|
584
|
+
# Sort by vertical position within each container
|
585
|
+
sorted_container_paragraphs = sorted(
|
586
|
+
paragraphs,
|
587
|
+
key=lambda x: (
|
588
|
+
x[1] is None,
|
589
|
+
x[1] if x[1] is not None else float("inf"),
|
590
|
+
),
|
591
|
+
)
|
592
|
+
|
593
|
+
# Add the sorted paragraphs to our processing list
|
594
|
+
all_paragraphs.extend(sorted_container_paragraphs)
|
595
|
+
|
596
|
+
# Process all the paragraphs
|
597
|
+
for p, _ in all_paragraphs:
|
598
|
+
self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
|
599
|
+
|
600
|
+
# Restore original parent
|
601
|
+
self.parents[level] = original_parent
|
602
|
+
return
|
603
|
+
|
358
604
|
def _handle_equations_in_text(self, element, text):
|
359
605
|
only_texts = []
|
360
606
|
only_equations = []
|
@@ -423,10 +669,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
423
669
|
element: BaseOxmlElement,
|
424
670
|
docx_obj: DocxDocument,
|
425
671
|
doc: DoclingDocument,
|
672
|
+
is_from_textbox: bool = False,
|
426
673
|
) -> None:
|
427
674
|
paragraph = Paragraph(element, docx_obj)
|
428
675
|
|
676
|
+
# Skip if from a textbox and this exact paragraph content was already processed
|
677
|
+
# Skip if from a textbox and this exact paragraph content was already processed
|
429
678
|
raw_text = paragraph.text
|
679
|
+
if is_from_textbox and raw_text:
|
680
|
+
# Create a simple hash of content to detect duplicates
|
681
|
+
content_hash = f"{len(raw_text)}:{raw_text[:50]}"
|
682
|
+
if content_hash in self.processed_paragraph_content:
|
683
|
+
_log.debug(f"Skipping duplicate paragraph content: {content_hash}")
|
684
|
+
return
|
685
|
+
self.processed_paragraph_content.append(content_hash)
|
686
|
+
|
430
687
|
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
|
431
688
|
|
432
689
|
if text is None:
|
@@ -175,13 +175,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
175
175
|
if len(group) == 1:
|
176
176
|
return group[0]
|
177
177
|
|
178
|
-
merged_text = "".join(cell.text for cell in group)
|
179
178
|
merged_bbox = BoundingBox(
|
180
179
|
l=min(cell.rect.to_bounding_box().l for cell in group),
|
181
180
|
t=min(cell.rect.to_bounding_box().t for cell in group),
|
182
181
|
r=max(cell.rect.to_bounding_box().r for cell in group),
|
183
182
|
b=max(cell.rect.to_bounding_box().b for cell in group),
|
184
183
|
)
|
184
|
+
|
185
|
+
assert self._ppage is not None
|
186
|
+
self.text_page = self._ppage.get_textpage()
|
187
|
+
bbox = merged_bbox.to_bottom_left_origin(page_size.height)
|
188
|
+
merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
|
189
|
+
|
185
190
|
return TextCell(
|
186
191
|
index=group[0].index,
|
187
192
|
text=merged_text,
|
@@ -302,7 +302,7 @@ class _DocumentConversionInput(BaseModel):
|
|
302
302
|
if ("." in obj.name and not obj.name.startswith("."))
|
303
303
|
else ""
|
304
304
|
)
|
305
|
-
mime = _DocumentConversionInput._mime_from_extension(ext)
|
305
|
+
mime = _DocumentConversionInput._mime_from_extension(ext.lower())
|
306
306
|
if mime is not None and mime.lower() == "application/zip":
|
307
307
|
objname = obj.name.lower()
|
308
308
|
if objname.endswith(".xlsx"):
|
@@ -376,6 +376,13 @@ class _DocumentConversionInput(BaseModel):
|
|
376
376
|
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
|
377
377
|
elif ext in FormatToExtensions[InputFormat.PDF]:
|
378
378
|
mime = FormatToMimeType[InputFormat.PDF][0]
|
379
|
+
elif ext in FormatToExtensions[InputFormat.DOCX]:
|
380
|
+
mime = FormatToMimeType[InputFormat.DOCX][0]
|
381
|
+
elif ext in FormatToExtensions[InputFormat.PPTX]:
|
382
|
+
mime = FormatToMimeType[InputFormat.PPTX][0]
|
383
|
+
elif ext in FormatToExtensions[InputFormat.XLSX]:
|
384
|
+
mime = FormatToMimeType[InputFormat.XLSX][0]
|
385
|
+
|
379
386
|
return mime
|
380
387
|
|
381
388
|
@staticmethod
|
@@ -3,7 +3,7 @@ from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
4
4
|
from typing import List, Optional, Union, cast
|
5
5
|
|
6
|
-
|
6
|
+
from docling_core.types import DoclingDocument
|
7
7
|
from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
|
8
8
|
from docling_core.types.doc.document import DocTagsDocument
|
9
9
|
from PIL import Image as PILImage
|
@@ -133,28 +133,26 @@ class VlmPipeline(PaginatedPipeline):
|
|
133
133
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
134
134
|
doctags_list_c, image_list_c
|
135
135
|
)
|
136
|
-
conv_res.document.load_from_doctags(doctags_doc)
|
136
|
+
conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
|
137
137
|
|
138
138
|
# If forced backend text, replace model predicted text with backend one
|
139
|
-
if
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
element.text = txt
|
157
|
-
element.orig = txt
|
139
|
+
if self.force_backend_text:
|
140
|
+
scale = self.pipeline_options.images_scale
|
141
|
+
for element, _level in conv_res.document.iterate_items():
|
142
|
+
if not isinstance(element, TextItem) or len(element.prov) == 0:
|
143
|
+
continue
|
144
|
+
page_ix = element.prov[0].page_no - 1
|
145
|
+
page = conv_res.pages[page_ix]
|
146
|
+
if not page.size:
|
147
|
+
continue
|
148
|
+
crop_bbox = (
|
149
|
+
element.prov[0]
|
150
|
+
.bbox.scaled(scale=scale)
|
151
|
+
.to_top_left_origin(page_height=page.size.height * scale)
|
152
|
+
)
|
153
|
+
txt = self.extract_text_from_backend(page, crop_bbox)
|
154
|
+
element.text = txt
|
155
|
+
element.orig = txt
|
158
156
|
elif (
|
159
157
|
self.pipeline_options.vlm_options.response_format
|
160
158
|
== ResponseFormat.MARKDOWN
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.32.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.33.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = [
|
6
6
|
"Christoph Auer <cau@zurich.ibm.com>",
|
@@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
|
|
46
46
|
######################
|
47
47
|
python = "^3.9"
|
48
48
|
pydantic = "^2.0.0"
|
49
|
-
docling-core = {version = "^2.
|
49
|
+
docling-core = {version = "^2.29.0", extras = ["chunking"]}
|
50
50
|
docling-ibm-models = "^3.4.0"
|
51
51
|
docling-parse = "^4.0.0"
|
52
52
|
filetype = "^1.2.0"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|