docling 2.32.0__tar.gz → 2.34.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. {docling-2.32.0 → docling-2.34.0}/PKG-INFO +2 -2
  2. {docling-2.32.0 → docling-2.34.0}/docling/backend/docling_parse_backend.py +1 -1
  3. {docling-2.32.0 → docling-2.34.0}/docling/backend/docling_parse_v2_backend.py +1 -1
  4. {docling-2.32.0 → docling-2.34.0}/docling/backend/docling_parse_v4_backend.py +1 -1
  5. {docling-2.32.0 → docling-2.34.0}/docling/backend/msword_backend.py +269 -12
  6. {docling-2.32.0 → docling-2.34.0}/docling/backend/pypdfium2_backend.py +6 -1
  7. {docling-2.32.0 → docling-2.34.0}/docling/datamodel/base_models.py +99 -2
  8. {docling-2.32.0 → docling-2.34.0}/docling/datamodel/document.py +11 -2
  9. {docling-2.32.0 → docling-2.34.0}/docling/models/layout_model.py +9 -0
  10. {docling-2.32.0 → docling-2.34.0}/docling/models/page_assemble_model.py +1 -0
  11. {docling-2.32.0 → docling-2.34.0}/docling/models/page_preprocessing_model.py +50 -1
  12. {docling-2.32.0 → docling-2.34.0}/docling/models/tesseract_ocr_cli_model.py +85 -41
  13. {docling-2.32.0 → docling-2.34.0}/docling/models/tesseract_ocr_model.py +52 -30
  14. {docling-2.32.0 → docling-2.34.0}/docling/pipeline/standard_pdf_pipeline.py +28 -3
  15. {docling-2.32.0 → docling-2.34.0}/docling/pipeline/vlm_pipeline.py +19 -21
  16. {docling-2.32.0 → docling-2.34.0}/docling/utils/layout_postprocessor.py +10 -22
  17. docling-2.34.0/docling/utils/ocr_utils.py +69 -0
  18. docling-2.34.0/docling/utils/orientation.py +71 -0
  19. {docling-2.32.0 → docling-2.34.0}/pyproject.toml +2 -2
  20. docling-2.32.0/docling/utils/ocr_utils.py +0 -9
  21. {docling-2.32.0 → docling-2.34.0}/LICENSE +0 -0
  22. {docling-2.32.0 → docling-2.34.0}/README.md +0 -0
  23. {docling-2.32.0 → docling-2.34.0}/docling/__init__.py +0 -0
  24. {docling-2.32.0 → docling-2.34.0}/docling/backend/__init__.py +0 -0
  25. {docling-2.32.0 → docling-2.34.0}/docling/backend/abstract_backend.py +0 -0
  26. {docling-2.32.0 → docling-2.34.0}/docling/backend/asciidoc_backend.py +0 -0
  27. {docling-2.32.0 → docling-2.34.0}/docling/backend/csv_backend.py +0 -0
  28. {docling-2.32.0 → docling-2.34.0}/docling/backend/docx/__init__.py +0 -0
  29. {docling-2.32.0 → docling-2.34.0}/docling/backend/docx/latex/__init__.py +0 -0
  30. {docling-2.32.0 → docling-2.34.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  31. {docling-2.32.0 → docling-2.34.0}/docling/backend/docx/latex/omml.py +0 -0
  32. {docling-2.32.0 → docling-2.34.0}/docling/backend/html_backend.py +0 -0
  33. {docling-2.32.0 → docling-2.34.0}/docling/backend/json/__init__.py +0 -0
  34. {docling-2.32.0 → docling-2.34.0}/docling/backend/json/docling_json_backend.py +0 -0
  35. {docling-2.32.0 → docling-2.34.0}/docling/backend/md_backend.py +0 -0
  36. {docling-2.32.0 → docling-2.34.0}/docling/backend/msexcel_backend.py +0 -0
  37. {docling-2.32.0 → docling-2.34.0}/docling/backend/mspowerpoint_backend.py +0 -0
  38. {docling-2.32.0 → docling-2.34.0}/docling/backend/pdf_backend.py +0 -0
  39. {docling-2.32.0 → docling-2.34.0}/docling/backend/xml/__init__.py +0 -0
  40. {docling-2.32.0 → docling-2.34.0}/docling/backend/xml/jats_backend.py +0 -0
  41. {docling-2.32.0 → docling-2.34.0}/docling/backend/xml/uspto_backend.py +0 -0
  42. {docling-2.32.0 → docling-2.34.0}/docling/chunking/__init__.py +0 -0
  43. {docling-2.32.0 → docling-2.34.0}/docling/cli/__init__.py +0 -0
  44. {docling-2.32.0 → docling-2.34.0}/docling/cli/main.py +0 -0
  45. {docling-2.32.0 → docling-2.34.0}/docling/cli/models.py +0 -0
  46. {docling-2.32.0 → docling-2.34.0}/docling/cli/tools.py +0 -0
  47. {docling-2.32.0 → docling-2.34.0}/docling/datamodel/__init__.py +0 -0
  48. {docling-2.32.0 → docling-2.34.0}/docling/datamodel/pipeline_options.py +0 -0
  49. {docling-2.32.0 → docling-2.34.0}/docling/datamodel/settings.py +0 -0
  50. {docling-2.32.0 → docling-2.34.0}/docling/document_converter.py +0 -0
  51. {docling-2.32.0 → docling-2.34.0}/docling/exceptions.py +0 -0
  52. {docling-2.32.0 → docling-2.34.0}/docling/models/__init__.py +0 -0
  53. {docling-2.32.0 → docling-2.34.0}/docling/models/api_vlm_model.py +0 -0
  54. {docling-2.32.0 → docling-2.34.0}/docling/models/base_model.py +0 -0
  55. {docling-2.32.0 → docling-2.34.0}/docling/models/base_ocr_model.py +0 -0
  56. {docling-2.32.0 → docling-2.34.0}/docling/models/code_formula_model.py +0 -0
  57. {docling-2.32.0 → docling-2.34.0}/docling/models/document_picture_classifier.py +0 -0
  58. {docling-2.32.0 → docling-2.34.0}/docling/models/easyocr_model.py +0 -0
  59. {docling-2.32.0 → docling-2.34.0}/docling/models/factories/__init__.py +0 -0
  60. {docling-2.32.0 → docling-2.34.0}/docling/models/factories/base_factory.py +0 -0
  61. {docling-2.32.0 → docling-2.34.0}/docling/models/factories/ocr_factory.py +0 -0
  62. {docling-2.32.0 → docling-2.34.0}/docling/models/factories/picture_description_factory.py +0 -0
  63. {docling-2.32.0 → docling-2.34.0}/docling/models/hf_mlx_model.py +0 -0
  64. {docling-2.32.0 → docling-2.34.0}/docling/models/hf_vlm_model.py +0 -0
  65. {docling-2.32.0 → docling-2.34.0}/docling/models/ocr_mac_model.py +0 -0
  66. {docling-2.32.0 → docling-2.34.0}/docling/models/picture_description_api_model.py +0 -0
  67. {docling-2.32.0 → docling-2.34.0}/docling/models/picture_description_base_model.py +0 -0
  68. {docling-2.32.0 → docling-2.34.0}/docling/models/picture_description_vlm_model.py +0 -0
  69. {docling-2.32.0 → docling-2.34.0}/docling/models/plugins/__init__.py +0 -0
  70. {docling-2.32.0 → docling-2.34.0}/docling/models/plugins/defaults.py +0 -0
  71. {docling-2.32.0 → docling-2.34.0}/docling/models/rapid_ocr_model.py +0 -0
  72. {docling-2.32.0 → docling-2.34.0}/docling/models/readingorder_model.py +0 -0
  73. {docling-2.32.0 → docling-2.34.0}/docling/models/table_structure_model.py +0 -0
  74. {docling-2.32.0 → docling-2.34.0}/docling/pipeline/__init__.py +0 -0
  75. {docling-2.32.0 → docling-2.34.0}/docling/pipeline/base_pipeline.py +0 -0
  76. {docling-2.32.0 → docling-2.34.0}/docling/pipeline/simple_pipeline.py +0 -0
  77. {docling-2.32.0 → docling-2.34.0}/docling/py.typed +0 -0
  78. {docling-2.32.0 → docling-2.34.0}/docling/utils/__init__.py +0 -0
  79. {docling-2.32.0 → docling-2.34.0}/docling/utils/accelerator_utils.py +0 -0
  80. {docling-2.32.0 → docling-2.34.0}/docling/utils/api_image_request.py +0 -0
  81. {docling-2.32.0 → docling-2.34.0}/docling/utils/export.py +0 -0
  82. {docling-2.32.0 → docling-2.34.0}/docling/utils/glm_utils.py +0 -0
  83. {docling-2.32.0 → docling-2.34.0}/docling/utils/locks.py +0 -0
  84. {docling-2.32.0 → docling-2.34.0}/docling/utils/model_downloader.py +0 -0
  85. {docling-2.32.0 → docling-2.34.0}/docling/utils/profiling.py +0 -0
  86. {docling-2.32.0 → docling-2.34.0}/docling/utils/utils.py +0 -0
  87. {docling-2.32.0 → docling-2.34.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.32.0
3
+ Version: 2.34.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
31
  Requires-Dist: click (<8.2.0)
32
- Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
32
+ Requires-Dist: docling-core[chunking] (>=2.29.0,<3.0.0)
33
33
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
34
34
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
35
35
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
60
60
  coord_origin=CoordOrigin.BOTTOMLEFT,
61
61
  ).to_top_left_origin(page_height=page_size.height * scale)
62
62
 
63
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
63
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
64
64
 
65
65
  if overlap_frac > 0.5:
66
66
  if len(text_piece) > 0:
@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
71
71
  coord_origin=CoordOrigin.BOTTOMLEFT,
72
72
  ).to_top_left_origin(page_height=page_size.height * scale)
73
73
 
74
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
74
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
75
75
 
76
76
  if overlap_frac > 0.5:
77
77
  if len(text_piece) > 0:
@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
46
46
  .scaled(scale)
47
47
  )
48
48
 
49
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
49
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
50
50
 
51
51
  if overlap_frac > 0.5:
52
52
  if len(text_piece) > 0:
@@ -2,7 +2,7 @@ import logging
2
2
  import re
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Any, Optional, Union
5
+ from typing import Any, List, Optional, Union
6
6
 
7
7
  from docling_core.types.doc import (
8
8
  DocItemLabel,
@@ -24,7 +24,6 @@ from docx.text.hyperlink import Hyperlink
24
24
  from docx.text.paragraph import Paragraph
25
25
  from docx.text.run import Run
26
26
  from lxml import etree
27
- from lxml.etree import XPath
28
27
  from PIL import Image, UnidentifiedImageError
29
28
  from pydantic import AnyUrl
30
29
  from typing_extensions import override
@@ -59,6 +58,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
59
58
  self.parents: dict[int, Optional[NodeItem]] = {}
60
59
  self.numbered_headers: dict[int, int] = {}
61
60
  self.equation_bookends: str = "<eq>{EQ}</eq>"
61
+ # Track processed textbox elements to avoid duplication
62
+ self.processed_textbox_elements: List[int] = []
63
+ # Track content hash of processed paragraphs to avoid duplicate content
64
+ self.processed_paragraph_content: List[str] = []
65
+
62
66
  for i in range(-1, self.max_levels):
63
67
  self.parents[i] = None
64
68
 
@@ -175,10 +179,74 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
175
179
  "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
176
180
  "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
177
181
  "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
182
+ "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
183
+ "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
184
+ "v": "urn:schemas-microsoft-com:vml",
185
+ "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
186
+ "w10": "urn:schemas-microsoft-com:office:word",
187
+ "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
178
188
  }
179
- xpath_expr = XPath(".//a:blip", namespaces=namespaces)
189
+ xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
180
190
  drawing_blip = xpath_expr(element)
181
191
 
192
+ # Check for textbox content - check multiple textbox formats
193
+ # Only process if the element hasn't been processed before
194
+ element_id = id(element)
195
+ if element_id not in self.processed_textbox_elements:
196
+ # Modern Word textboxes
197
+ txbx_xpath = etree.XPath(
198
+ ".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
199
+ )
200
+ textbox_elements = txbx_xpath(element)
201
+
202
+ # No modern textboxes found, check for alternate/legacy textbox formats
203
+ if not textbox_elements and tag_name in ["drawing", "pict"]:
204
+ # Additional checks for textboxes in DrawingML and VML formats
205
+ alt_txbx_xpath = etree.XPath(
206
+ ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
207
+ namespaces=namespaces,
208
+ )
209
+ textbox_elements = alt_txbx_xpath(element)
210
+
211
+ # Check for shape text that's not in a standard textbox
212
+ if not textbox_elements:
213
+ shape_text_xpath = etree.XPath(
214
+ ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
215
+ namespaces=namespaces,
216
+ )
217
+ shape_text_elements = shape_text_xpath(element)
218
+ if shape_text_elements:
219
+ # Create custom text elements from shape text
220
+ text_content = " ".join(
221
+ [t.text for t in shape_text_elements if t.text]
222
+ )
223
+ if text_content.strip():
224
+ _log.debug(f"Found shape text: {text_content[:50]}...")
225
+ # Create a paragraph-like element to process with standard handler
226
+ level = self._get_level()
227
+ shape_group = doc.add_group(
228
+ label=GroupLabel.SECTION,
229
+ parent=self.parents[level - 1],
230
+ name="shape-text",
231
+ )
232
+ doc.add_text(
233
+ label=DocItemLabel.PARAGRAPH,
234
+ parent=shape_group,
235
+ text=text_content,
236
+ )
237
+
238
+ if textbox_elements:
239
+ # Mark the parent element as processed
240
+ self.processed_textbox_elements.append(element_id)
241
+ # Also mark all found textbox elements as processed
242
+ for tb_element in textbox_elements:
243
+ self.processed_textbox_elements.append(id(tb_element))
244
+
245
+ _log.debug(
246
+ f"Found textbox content with {len(textbox_elements)} elements"
247
+ )
248
+ self._handle_textbox_content(textbox_elements, docx_obj, doc)
249
+
182
250
  # Check for Tables
183
251
  if element.tag.endswith("tbl"):
184
252
  try:
@@ -291,15 +359,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
291
359
 
292
360
  @classmethod
293
361
  def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
294
- has_any_formatting = run.bold or run.italic or run.underline
295
- return (
296
- Formatting(
297
- bold=run.bold or False,
298
- italic=run.italic or False,
299
- underline=run.underline or False,
300
- )
301
- if has_any_formatting
302
- else None
362
+ # The .bold and .italic properties are booleans, but .underline can be an enum
363
+ # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
364
+ has_bold = run.bold or False
365
+ has_italic = run.italic or False
366
+ # Convert any non-None underline value to True
367
+ has_underline = bool(run.underline is not None and run.underline)
368
+
369
+ return Formatting(
370
+ bold=has_bold,
371
+ italic=has_italic,
372
+ underline=has_underline,
303
373
  )
304
374
 
305
375
  def _get_paragraph_elements(self, paragraph: Paragraph):
@@ -355,6 +425,182 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
355
425
 
356
426
  return paragraph_elements
357
427
 
428
+ def _get_paragraph_position(self, paragraph_element):
429
+ """Extract vertical position information from paragraph element."""
430
+ # First try to directly get the index from w:p element that has an order-related attribute
431
+ if (
432
+ hasattr(paragraph_element, "getparent")
433
+ and paragraph_element.getparent() is not None
434
+ ):
435
+ parent = paragraph_element.getparent()
436
+ # Get all paragraph siblings
437
+ paragraphs = [
438
+ p for p in parent.getchildren() if etree.QName(p).localname == "p"
439
+ ]
440
+ # Find index of current paragraph within its siblings
441
+ try:
442
+ paragraph_index = paragraphs.index(paragraph_element)
443
+ return paragraph_index # Use index as position for consistent ordering
444
+ except ValueError:
445
+ pass
446
+
447
+ # Look for position hints in element attributes and ancestor elements
448
+ for elem in (*[paragraph_element], *paragraph_element.iterancestors()):
449
+ # Check for direct position attributes
450
+ for attr_name in ["y", "top", "positionY", "y-position", "position"]:
451
+ value = elem.get(attr_name)
452
+ if value:
453
+ try:
454
+ # Remove any non-numeric characters (like 'pt', 'px', etc.)
455
+ clean_value = re.sub(r"[^0-9.]", "", value)
456
+ if clean_value:
457
+ return float(clean_value)
458
+ except (ValueError, TypeError):
459
+ pass
460
+
461
+ # Check for position in transform attribute
462
+ transform = elem.get("transform")
463
+ if transform:
464
+ # Extract translation component from transform matrix
465
+ match = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
466
+ if match:
467
+ try:
468
+ return float(match.group(1))
469
+ except ValueError:
470
+ pass
471
+
472
+ # Check for anchors or relative position indicators in Word format
473
+ # 'dist' attributes can indicate relative positioning
474
+ for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
475
+ if elem.get(attr_name) is not None:
476
+ return elem.sourceline # Use the XML source line number as fallback
477
+
478
+ # For VML shapes, look for specific attributes
479
+ for ns_uri in paragraph_element.nsmap.values():
480
+ if "vml" in ns_uri:
481
+ # Try to extract position from style attribute
482
+ style = paragraph_element.get("style")
483
+ if style:
484
+ match = re.search(r"top:([0-9.]+)pt", style)
485
+ if match:
486
+ try:
487
+ return float(match.group(1))
488
+ except ValueError:
489
+ pass
490
+
491
+ # If no better position indicator found, use XML source line number as proxy for order
492
+ return (
493
+ paragraph_element.sourceline
494
+ if hasattr(paragraph_element, "sourceline")
495
+ else None
496
+ )
497
+
498
+ def _collect_textbox_paragraphs(self, textbox_elements):
499
+ """Collect and organize paragraphs from textbox elements."""
500
+ processed_paragraphs = []
501
+ container_paragraphs = {}
502
+
503
+ for element in textbox_elements:
504
+ element_id = id(element)
505
+ # Skip if we've already processed this exact element
506
+ if element_id in processed_paragraphs:
507
+ continue
508
+
509
+ tag_name = etree.QName(element).localname
510
+ processed_paragraphs.append(element_id)
511
+
512
+ # Handle paragraphs directly found (VML textboxes)
513
+ if tag_name == "p":
514
+ # Find the containing textbox or shape element
515
+ container_id = None
516
+ for ancestor in element.iterancestors():
517
+ if any(ns in ancestor.tag for ns in ["textbox", "shape", "txbx"]):
518
+ container_id = id(ancestor)
519
+ break
520
+
521
+ if container_id not in container_paragraphs:
522
+ container_paragraphs[container_id] = []
523
+ container_paragraphs[container_id].append(
524
+ (element, self._get_paragraph_position(element))
525
+ )
526
+
527
+ # Handle txbxContent elements (Word DrawingML textboxes)
528
+ elif tag_name == "txbxContent":
529
+ paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
530
+ container_id = id(element)
531
+ if container_id not in container_paragraphs:
532
+ container_paragraphs[container_id] = []
533
+
534
+ for p in paragraphs:
535
+ p_id = id(p)
536
+ if p_id not in processed_paragraphs:
537
+ processed_paragraphs.append(p_id)
538
+ container_paragraphs[container_id].append(
539
+ (p, self._get_paragraph_position(p))
540
+ )
541
+ else:
542
+ # Try to extract any paragraphs from unknown elements
543
+ paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
544
+ container_id = id(element)
545
+ if container_id not in container_paragraphs:
546
+ container_paragraphs[container_id] = []
547
+
548
+ for p in paragraphs:
549
+ p_id = id(p)
550
+ if p_id not in processed_paragraphs:
551
+ processed_paragraphs.append(p_id)
552
+ container_paragraphs[container_id].append(
553
+ (p, self._get_paragraph_position(p))
554
+ )
555
+
556
+ return container_paragraphs
557
+
558
+ def _handle_textbox_content(
559
+ self,
560
+ textbox_elements: list,
561
+ docx_obj: DocxDocument,
562
+ doc: DoclingDocument,
563
+ ) -> None:
564
+ """Process textbox content and add it to the document structure."""
565
+ level = self._get_level()
566
+ # Create a textbox group to contain all text from the textbox
567
+ textbox_group = doc.add_group(
568
+ label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
569
+ )
570
+
571
+ # Set this as the current parent to ensure textbox content
572
+ # is properly nested in document structure
573
+ original_parent = self.parents[level]
574
+ self.parents[level] = textbox_group
575
+
576
+ # Collect and organize paragraphs
577
+ container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)
578
+
579
+ # Process all paragraphs
580
+ all_paragraphs = []
581
+
582
+ # Sort paragraphs within each container, then process containers
583
+ for container_id, paragraphs in container_paragraphs.items():
584
+ # Sort by vertical position within each container
585
+ sorted_container_paragraphs = sorted(
586
+ paragraphs,
587
+ key=lambda x: (
588
+ x[1] is None,
589
+ x[1] if x[1] is not None else float("inf"),
590
+ ),
591
+ )
592
+
593
+ # Add the sorted paragraphs to our processing list
594
+ all_paragraphs.extend(sorted_container_paragraphs)
595
+
596
+ # Process all the paragraphs
597
+ for p, _ in all_paragraphs:
598
+ self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
599
+
600
+ # Restore original parent
601
+ self.parents[level] = original_parent
602
+ return
603
+
358
604
  def _handle_equations_in_text(self, element, text):
359
605
  only_texts = []
360
606
  only_equations = []
@@ -423,10 +669,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
423
669
  element: BaseOxmlElement,
424
670
  docx_obj: DocxDocument,
425
671
  doc: DoclingDocument,
672
+ is_from_textbox: bool = False,
426
673
  ) -> None:
427
674
  paragraph = Paragraph(element, docx_obj)
428
675
 
676
+ # Skip if from a textbox and this exact paragraph content was already processed
677
+ # Skip if from a textbox and this exact paragraph content was already processed
429
678
  raw_text = paragraph.text
679
+ if is_from_textbox and raw_text:
680
+ # Create a simple hash of content to detect duplicates
681
+ content_hash = f"{len(raw_text)}:{raw_text[:50]}"
682
+ if content_hash in self.processed_paragraph_content:
683
+ _log.debug(f"Skipping duplicate paragraph content: {content_hash}")
684
+ return
685
+ self.processed_paragraph_content.append(content_hash)
686
+
430
687
  text, equations = self._handle_equations_in_text(element=element, text=raw_text)
431
688
 
432
689
  if text is None:
@@ -175,13 +175,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
175
175
  if len(group) == 1:
176
176
  return group[0]
177
177
 
178
- merged_text = "".join(cell.text for cell in group)
179
178
  merged_bbox = BoundingBox(
180
179
  l=min(cell.rect.to_bounding_box().l for cell in group),
181
180
  t=min(cell.rect.to_bounding_box().t for cell in group),
182
181
  r=max(cell.rect.to_bounding_box().r for cell in group),
183
182
  b=max(cell.rect.to_bounding_box().b for cell in group),
184
183
  )
184
+
185
+ assert self._ppage is not None
186
+ self.text_page = self._ppage.get_textpage()
187
+ bbox = merged_bbox.to_bottom_left_origin(page_size.height)
188
+ merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
189
+
185
190
  return TextCell(
186
191
  index=group[0].index,
187
192
  text=merged_text,
@@ -1,6 +1,9 @@
1
+ import math
2
+ from collections import defaultdict
1
3
  from enum import Enum
2
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
+ from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
3
5
 
6
+ import numpy as np
4
7
  from docling_core.types.doc import (
5
8
  BoundingBox,
6
9
  DocItemLabel,
@@ -16,7 +19,7 @@ from docling_core.types.io import (
16
19
  DocumentStream,
17
20
  )
18
21
  from PIL.Image import Image
19
- from pydantic import BaseModel, ConfigDict
22
+ from pydantic import BaseModel, ConfigDict, Field, computed_field
20
23
 
21
24
  if TYPE_CHECKING:
22
25
  from docling.backend.pdf_backend import PdfPageBackend
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
298
301
  choices: List[OpenAiResponseChoice]
299
302
  created: int
300
303
  usage: OpenAiResponseUsage
304
+
305
+
306
+ # Create a type alias for score values
307
+ ScoreValue = float
308
+
309
+
310
+ class QualityGrade(str, Enum):
311
+ POOR = "poor"
312
+ FAIR = "fair"
313
+ GOOD = "good"
314
+ EXCELLENT = "excellent"
315
+ UNSPECIFIED = "unspecified"
316
+
317
+
318
+ class PageConfidenceScores(BaseModel):
319
+ parse_score: ScoreValue = np.nan
320
+ layout_score: ScoreValue = np.nan
321
+ table_score: ScoreValue = np.nan
322
+ ocr_score: ScoreValue = np.nan
323
+
324
+ def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
325
+ if score < 0.5:
326
+ return QualityGrade.POOR
327
+ elif score < 0.8:
328
+ return QualityGrade.FAIR
329
+ elif score < 0.9:
330
+ return QualityGrade.GOOD
331
+ elif score >= 0.9:
332
+ return QualityGrade.EXCELLENT
333
+
334
+ return QualityGrade.UNSPECIFIED
335
+
336
+ @computed_field # type: ignore
337
+ @property
338
+ def mean_grade(self) -> QualityGrade:
339
+ return self._score_to_grade(self.mean_score)
340
+
341
+ @computed_field # type: ignore
342
+ @property
343
+ def low_grade(self) -> QualityGrade:
344
+ return self._score_to_grade(self.low_score)
345
+
346
+ @computed_field # type: ignore
347
+ @property
348
+ def mean_score(self) -> ScoreValue:
349
+ return ScoreValue(
350
+ np.nanmean(
351
+ [
352
+ self.ocr_score,
353
+ self.table_score,
354
+ self.layout_score,
355
+ self.parse_score,
356
+ ]
357
+ )
358
+ )
359
+
360
+ @computed_field # type: ignore
361
+ @property
362
+ def low_score(self) -> ScoreValue:
363
+ return ScoreValue(
364
+ np.nanquantile(
365
+ [
366
+ self.ocr_score,
367
+ self.table_score,
368
+ self.layout_score,
369
+ self.parse_score,
370
+ ],
371
+ q=0.05,
372
+ )
373
+ )
374
+
375
+
376
+ class ConfidenceReport(PageConfidenceScores):
377
+ pages: Dict[int, PageConfidenceScores] = Field(
378
+ default_factory=lambda: defaultdict(PageConfidenceScores)
379
+ )
380
+
381
+ @computed_field # type: ignore
382
+ @property
383
+ def mean_score(self) -> ScoreValue:
384
+ return ScoreValue(
385
+ np.nanmean(
386
+ [c.mean_score for c in self.pages.values()],
387
+ )
388
+ )
389
+
390
+ @computed_field # type: ignore
391
+ @property
392
+ def low_score(self) -> ScoreValue:
393
+ return ScoreValue(
394
+ np.nanmean(
395
+ [c.low_score for c in self.pages.values()],
396
+ )
397
+ )
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
47
47
  )
48
48
  from docling_core.utils.file import resolve_source_to_stream
49
49
  from docling_core.utils.legacy import docling_document_to_legacy
50
- from pydantic import BaseModel
50
+ from pydantic import BaseModel, Field
51
51
  from typing_extensions import deprecated
52
52
 
53
53
  from docling.backend.abstract_backend import (
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
56
56
  )
57
57
  from docling.datamodel.base_models import (
58
58
  AssembledUnit,
59
+ ConfidenceReport,
59
60
  ConversionStatus,
60
61
  DocumentStream,
61
62
  ErrorItem,
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
201
202
  pages: List[Page] = []
202
203
  assembled: AssembledUnit = AssembledUnit()
203
204
  timings: Dict[str, ProfilingItem] = {}
205
+ confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
204
206
 
205
207
  document: DoclingDocument = _EMPTY_DOCLING_DOC
206
208
 
@@ -302,7 +304,7 @@ class _DocumentConversionInput(BaseModel):
302
304
  if ("." in obj.name and not obj.name.startswith("."))
303
305
  else ""
304
306
  )
305
- mime = _DocumentConversionInput._mime_from_extension(ext)
307
+ mime = _DocumentConversionInput._mime_from_extension(ext.lower())
306
308
  if mime is not None and mime.lower() == "application/zip":
307
309
  objname = obj.name.lower()
308
310
  if objname.endswith(".xlsx"):
@@ -376,6 +378,13 @@ class _DocumentConversionInput(BaseModel):
376
378
  mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
377
379
  elif ext in FormatToExtensions[InputFormat.PDF]:
378
380
  mime = FormatToMimeType[InputFormat.PDF][0]
381
+ elif ext in FormatToExtensions[InputFormat.DOCX]:
382
+ mime = FormatToMimeType[InputFormat.DOCX][0]
383
+ elif ext in FormatToExtensions[InputFormat.PPTX]:
384
+ mime = FormatToMimeType[InputFormat.PPTX][0]
385
+ elif ext in FormatToExtensions[InputFormat.XLSX]:
386
+ mime = FormatToMimeType[InputFormat.XLSX][0]
387
+
379
388
  return mime
380
389
 
381
390
  @staticmethod
@@ -5,6 +5,7 @@ from collections.abc import Iterable
5
5
  from pathlib import Path
6
6
  from typing import Optional
7
7
 
8
+ import numpy as np
8
9
  from docling_core.types.doc import DocItemLabel
9
10
  from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
10
11
  from PIL import Image
@@ -184,6 +185,14 @@ class LayoutModel(BasePageModel):
184
185
  ).postprocess()
185
186
  # processed_clusters, processed_cells = clusters, page.cells
186
187
 
188
+ conv_res.confidence.pages[page.page_no].layout_score = float(
189
+ np.mean([c.confidence for c in processed_clusters])
190
+ )
191
+
192
+ conv_res.confidence.pages[page.page_no].ocr_score = float(
193
+ np.mean([c.confidence for c in processed_cells if c.from_ocr])
194
+ )
195
+
187
196
  page.cells = processed_cells
188
197
  page.predictions.layout = LayoutPrediction(
189
198
  clusters=processed_clusters
@@ -3,6 +3,7 @@ import re
3
3
  from collections.abc import Iterable
4
4
  from typing import List
5
5
 
6
+ import numpy as np
6
7
  from pydantic import BaseModel
7
8
 
8
9
  from docling.datamodel.base_models import (
@@ -1,11 +1,13 @@
1
+ import re
1
2
  from collections.abc import Iterable
2
3
  from pathlib import Path
3
4
  from typing import Optional
4
5
 
6
+ import numpy as np
5
7
  from PIL import ImageDraw
6
8
  from pydantic import BaseModel
7
9
 
8
- from docling.datamodel.base_models import Page
10
+ from docling.datamodel.base_models import Page, ScoreValue
9
11
  from docling.datamodel.document import ConversionResult
10
12
  from docling.datamodel.settings import settings
11
13
  from docling.models.base_model import BasePageModel
@@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel):
21
23
  def __init__(self, options: PagePreprocessingOptions):
22
24
  self.options = options
23
25
 
26
+ # Pre-compiled regex patterns for efficiency
27
+ self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
28
+ self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
29
+ self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
30
+ self.SLASH_NUMBER_GARBAGE_RE = re.compile(
31
+ r"(?:/\w+\s*){2,}"
32
+ ) # Two or more "/token " sequences
33
+
24
34
  def __call__(
25
35
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
26
36
  ) -> Iterable[Page]:
@@ -60,6 +70,18 @@ class PagePreprocessingModel(BasePageModel):
60
70
  if self.options.create_parsed_page:
61
71
  page.parsed_page = page._backend.get_segmented_page()
62
72
 
73
+ # Rate the text quality from the PDF parser, and aggregate on page
74
+ text_scores = []
75
+ for c in page.cells:
76
+ score = self.rate_text_quality(c.text)
77
+ text_scores.append(score)
78
+
79
+ conv_res.confidence.pages[page.page_no].parse_score = float(
80
+ np.nanquantile(
81
+ text_scores, q=0.10
82
+ ) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
83
+ )
84
+
63
85
  # DEBUG code:
64
86
  def draw_text_boxes(image, cells, show: bool = False):
65
87
  draw = ImageDraw.Draw(image)
@@ -88,3 +110,30 @@ class PagePreprocessingModel(BasePageModel):
88
110
  draw_text_boxes(page.get_image(scale=1.0), page.cells)
89
111
 
90
112
  return page
113
+
114
+ def rate_text_quality(self, text: str) -> float:
115
+ # Hard errors: if any of these patterns are found, return 0.0 immediately.
116
+ blacklist_chars = ["�"]
117
+ if (
118
+ any(text.find(c) >= 0 for c in blacklist_chars)
119
+ or self.GLYPH_RE.search(text)
120
+ or self.SLASH_G_RE.search(text)
121
+ or self.SLASH_NUMBER_GARBAGE_RE.match(
122
+ text
123
+ ) # Check if text is mostly slash-number pattern
124
+ ):
125
+ return 0.0
126
+
127
+ penalty = 0.0
128
+
129
+ # Apply a penalty only if the fragmented words pattern occurs at least three times.
130
+ frag_matches = self.FRAG_RE.findall(text)
131
+ if len(frag_matches) >= 3:
132
+ penalty += 0.1 * len(frag_matches)
133
+
134
+ # Additional heuristic: if the average token length is below 2, add a penalty.
135
+ # tokens = text.split()
136
+ # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
137
+ # penalty += 0.2
138
+
139
+ return max(1.0 - penalty, 0.0)