docling 2.31.2__tar.gz → 2.33.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {docling-2.31.2 → docling-2.33.0}/PKG-INFO +2 -2
  2. {docling-2.31.2 → docling-2.33.0}/docling/backend/msword_backend.py +269 -12
  3. {docling-2.31.2 → docling-2.33.0}/docling/backend/pypdfium2_backend.py +6 -1
  4. {docling-2.31.2 → docling-2.33.0}/docling/datamodel/base_models.py +1 -0
  5. {docling-2.31.2 → docling-2.33.0}/docling/datamodel/document.py +8 -1
  6. {docling-2.31.2 → docling-2.33.0}/docling/datamodel/pipeline_options.py +2 -0
  7. {docling-2.31.2 → docling-2.33.0}/docling/datamodel/settings.py +6 -4
  8. {docling-2.31.2 → docling-2.33.0}/docling/models/api_vlm_model.py +8 -3
  9. {docling-2.31.2 → docling-2.33.0}/docling/models/picture_description_api_model.py +7 -2
  10. {docling-2.31.2 → docling-2.33.0}/docling/models/tesseract_ocr_cli_model.py +1 -1
  11. {docling-2.31.2 → docling-2.33.0}/docling/pipeline/vlm_pipeline.py +19 -21
  12. {docling-2.31.2 → docling-2.33.0}/pyproject.toml +2 -2
  13. {docling-2.31.2 → docling-2.33.0}/LICENSE +0 -0
  14. {docling-2.31.2 → docling-2.33.0}/README.md +0 -0
  15. {docling-2.31.2 → docling-2.33.0}/docling/__init__.py +0 -0
  16. {docling-2.31.2 → docling-2.33.0}/docling/backend/__init__.py +0 -0
  17. {docling-2.31.2 → docling-2.33.0}/docling/backend/abstract_backend.py +0 -0
  18. {docling-2.31.2 → docling-2.33.0}/docling/backend/asciidoc_backend.py +0 -0
  19. {docling-2.31.2 → docling-2.33.0}/docling/backend/csv_backend.py +0 -0
  20. {docling-2.31.2 → docling-2.33.0}/docling/backend/docling_parse_backend.py +0 -0
  21. {docling-2.31.2 → docling-2.33.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  22. {docling-2.31.2 → docling-2.33.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  23. {docling-2.31.2 → docling-2.33.0}/docling/backend/docx/__init__.py +0 -0
  24. {docling-2.31.2 → docling-2.33.0}/docling/backend/docx/latex/__init__.py +0 -0
  25. {docling-2.31.2 → docling-2.33.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  26. {docling-2.31.2 → docling-2.33.0}/docling/backend/docx/latex/omml.py +0 -0
  27. {docling-2.31.2 → docling-2.33.0}/docling/backend/html_backend.py +0 -0
  28. {docling-2.31.2 → docling-2.33.0}/docling/backend/json/__init__.py +0 -0
  29. {docling-2.31.2 → docling-2.33.0}/docling/backend/json/docling_json_backend.py +0 -0
  30. {docling-2.31.2 → docling-2.33.0}/docling/backend/md_backend.py +0 -0
  31. {docling-2.31.2 → docling-2.33.0}/docling/backend/msexcel_backend.py +0 -0
  32. {docling-2.31.2 → docling-2.33.0}/docling/backend/mspowerpoint_backend.py +0 -0
  33. {docling-2.31.2 → docling-2.33.0}/docling/backend/pdf_backend.py +0 -0
  34. {docling-2.31.2 → docling-2.33.0}/docling/backend/xml/__init__.py +0 -0
  35. {docling-2.31.2 → docling-2.33.0}/docling/backend/xml/jats_backend.py +0 -0
  36. {docling-2.31.2 → docling-2.33.0}/docling/backend/xml/uspto_backend.py +0 -0
  37. {docling-2.31.2 → docling-2.33.0}/docling/chunking/__init__.py +0 -0
  38. {docling-2.31.2 → docling-2.33.0}/docling/cli/__init__.py +0 -0
  39. {docling-2.31.2 → docling-2.33.0}/docling/cli/main.py +0 -0
  40. {docling-2.31.2 → docling-2.33.0}/docling/cli/models.py +0 -0
  41. {docling-2.31.2 → docling-2.33.0}/docling/cli/tools.py +0 -0
  42. {docling-2.31.2 → docling-2.33.0}/docling/datamodel/__init__.py +0 -0
  43. {docling-2.31.2 → docling-2.33.0}/docling/document_converter.py +0 -0
  44. {docling-2.31.2 → docling-2.33.0}/docling/exceptions.py +0 -0
  45. {docling-2.31.2 → docling-2.33.0}/docling/models/__init__.py +0 -0
  46. {docling-2.31.2 → docling-2.33.0}/docling/models/base_model.py +0 -0
  47. {docling-2.31.2 → docling-2.33.0}/docling/models/base_ocr_model.py +0 -0
  48. {docling-2.31.2 → docling-2.33.0}/docling/models/code_formula_model.py +0 -0
  49. {docling-2.31.2 → docling-2.33.0}/docling/models/document_picture_classifier.py +0 -0
  50. {docling-2.31.2 → docling-2.33.0}/docling/models/easyocr_model.py +0 -0
  51. {docling-2.31.2 → docling-2.33.0}/docling/models/factories/__init__.py +0 -0
  52. {docling-2.31.2 → docling-2.33.0}/docling/models/factories/base_factory.py +0 -0
  53. {docling-2.31.2 → docling-2.33.0}/docling/models/factories/ocr_factory.py +0 -0
  54. {docling-2.31.2 → docling-2.33.0}/docling/models/factories/picture_description_factory.py +0 -0
  55. {docling-2.31.2 → docling-2.33.0}/docling/models/hf_mlx_model.py +0 -0
  56. {docling-2.31.2 → docling-2.33.0}/docling/models/hf_vlm_model.py +0 -0
  57. {docling-2.31.2 → docling-2.33.0}/docling/models/layout_model.py +0 -0
  58. {docling-2.31.2 → docling-2.33.0}/docling/models/ocr_mac_model.py +0 -0
  59. {docling-2.31.2 → docling-2.33.0}/docling/models/page_assemble_model.py +0 -0
  60. {docling-2.31.2 → docling-2.33.0}/docling/models/page_preprocessing_model.py +0 -0
  61. {docling-2.31.2 → docling-2.33.0}/docling/models/picture_description_base_model.py +0 -0
  62. {docling-2.31.2 → docling-2.33.0}/docling/models/picture_description_vlm_model.py +0 -0
  63. {docling-2.31.2 → docling-2.33.0}/docling/models/plugins/__init__.py +0 -0
  64. {docling-2.31.2 → docling-2.33.0}/docling/models/plugins/defaults.py +0 -0
  65. {docling-2.31.2 → docling-2.33.0}/docling/models/rapid_ocr_model.py +0 -0
  66. {docling-2.31.2 → docling-2.33.0}/docling/models/readingorder_model.py +0 -0
  67. {docling-2.31.2 → docling-2.33.0}/docling/models/table_structure_model.py +0 -0
  68. {docling-2.31.2 → docling-2.33.0}/docling/models/tesseract_ocr_model.py +0 -0
  69. {docling-2.31.2 → docling-2.33.0}/docling/pipeline/__init__.py +0 -0
  70. {docling-2.31.2 → docling-2.33.0}/docling/pipeline/base_pipeline.py +0 -0
  71. {docling-2.31.2 → docling-2.33.0}/docling/pipeline/simple_pipeline.py +0 -0
  72. {docling-2.31.2 → docling-2.33.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  73. {docling-2.31.2 → docling-2.33.0}/docling/py.typed +0 -0
  74. {docling-2.31.2 → docling-2.33.0}/docling/utils/__init__.py +0 -0
  75. {docling-2.31.2 → docling-2.33.0}/docling/utils/accelerator_utils.py +0 -0
  76. {docling-2.31.2 → docling-2.33.0}/docling/utils/api_image_request.py +0 -0
  77. {docling-2.31.2 → docling-2.33.0}/docling/utils/export.py +0 -0
  78. {docling-2.31.2 → docling-2.33.0}/docling/utils/glm_utils.py +0 -0
  79. {docling-2.31.2 → docling-2.33.0}/docling/utils/layout_postprocessor.py +0 -0
  80. {docling-2.31.2 → docling-2.33.0}/docling/utils/locks.py +0 -0
  81. {docling-2.31.2 → docling-2.33.0}/docling/utils/model_downloader.py +0 -0
  82. {docling-2.31.2 → docling-2.33.0}/docling/utils/ocr_utils.py +0 -0
  83. {docling-2.31.2 → docling-2.33.0}/docling/utils/profiling.py +0 -0
  84. {docling-2.31.2 → docling-2.33.0}/docling/utils/utils.py +0 -0
  85. {docling-2.31.2 → docling-2.33.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.31.2
3
+ Version: 2.33.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
31
  Requires-Dist: click (<8.2.0)
32
- Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
32
+ Requires-Dist: docling-core[chunking] (>=2.29.0,<3.0.0)
33
33
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
34
34
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
35
35
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -2,7 +2,7 @@ import logging
2
2
  import re
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Any, Optional, Union
5
+ from typing import Any, List, Optional, Union
6
6
 
7
7
  from docling_core.types.doc import (
8
8
  DocItemLabel,
@@ -24,7 +24,6 @@ from docx.text.hyperlink import Hyperlink
24
24
  from docx.text.paragraph import Paragraph
25
25
  from docx.text.run import Run
26
26
  from lxml import etree
27
- from lxml.etree import XPath
28
27
  from PIL import Image, UnidentifiedImageError
29
28
  from pydantic import AnyUrl
30
29
  from typing_extensions import override
@@ -59,6 +58,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
59
58
  self.parents: dict[int, Optional[NodeItem]] = {}
60
59
  self.numbered_headers: dict[int, int] = {}
61
60
  self.equation_bookends: str = "<eq>{EQ}</eq>"
61
+ # Track processed textbox elements to avoid duplication
62
+ self.processed_textbox_elements: List[int] = []
63
+ # Track content hash of processed paragraphs to avoid duplicate content
64
+ self.processed_paragraph_content: List[str] = []
65
+
62
66
  for i in range(-1, self.max_levels):
63
67
  self.parents[i] = None
64
68
 
@@ -175,10 +179,74 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
175
179
  "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
176
180
  "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
177
181
  "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
182
+ "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
183
+ "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
184
+ "v": "urn:schemas-microsoft-com:vml",
185
+ "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
186
+ "w10": "urn:schemas-microsoft-com:office:word",
187
+ "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
178
188
  }
179
- xpath_expr = XPath(".//a:blip", namespaces=namespaces)
189
+ xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
180
190
  drawing_blip = xpath_expr(element)
181
191
 
192
+ # Check for textbox content - check multiple textbox formats
193
+ # Only process if the element hasn't been processed before
194
+ element_id = id(element)
195
+ if element_id not in self.processed_textbox_elements:
196
+ # Modern Word textboxes
197
+ txbx_xpath = etree.XPath(
198
+ ".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
199
+ )
200
+ textbox_elements = txbx_xpath(element)
201
+
202
+ # No modern textboxes found, check for alternate/legacy textbox formats
203
+ if not textbox_elements and tag_name in ["drawing", "pict"]:
204
+ # Additional checks for textboxes in DrawingML and VML formats
205
+ alt_txbx_xpath = etree.XPath(
206
+ ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
207
+ namespaces=namespaces,
208
+ )
209
+ textbox_elements = alt_txbx_xpath(element)
210
+
211
+ # Check for shape text that's not in a standard textbox
212
+ if not textbox_elements:
213
+ shape_text_xpath = etree.XPath(
214
+ ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
215
+ namespaces=namespaces,
216
+ )
217
+ shape_text_elements = shape_text_xpath(element)
218
+ if shape_text_elements:
219
+ # Create custom text elements from shape text
220
+ text_content = " ".join(
221
+ [t.text for t in shape_text_elements if t.text]
222
+ )
223
+ if text_content.strip():
224
+ _log.debug(f"Found shape text: {text_content[:50]}...")
225
+ # Create a paragraph-like element to process with standard handler
226
+ level = self._get_level()
227
+ shape_group = doc.add_group(
228
+ label=GroupLabel.SECTION,
229
+ parent=self.parents[level - 1],
230
+ name="shape-text",
231
+ )
232
+ doc.add_text(
233
+ label=DocItemLabel.PARAGRAPH,
234
+ parent=shape_group,
235
+ text=text_content,
236
+ )
237
+
238
+ if textbox_elements:
239
+ # Mark the parent element as processed
240
+ self.processed_textbox_elements.append(element_id)
241
+ # Also mark all found textbox elements as processed
242
+ for tb_element in textbox_elements:
243
+ self.processed_textbox_elements.append(id(tb_element))
244
+
245
+ _log.debug(
246
+ f"Found textbox content with {len(textbox_elements)} elements"
247
+ )
248
+ self._handle_textbox_content(textbox_elements, docx_obj, doc)
249
+
182
250
  # Check for Tables
183
251
  if element.tag.endswith("tbl"):
184
252
  try:
@@ -291,15 +359,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
291
359
 
292
360
  @classmethod
293
361
  def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
294
- has_any_formatting = run.bold or run.italic or run.underline
295
- return (
296
- Formatting(
297
- bold=run.bold or False,
298
- italic=run.italic or False,
299
- underline=run.underline or False,
300
- )
301
- if has_any_formatting
302
- else None
362
+ # The .bold and .italic properties are booleans, but .underline can be an enum
363
+ # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
364
+ has_bold = run.bold or False
365
+ has_italic = run.italic or False
366
+ # Convert any non-None underline value to True
367
+ has_underline = bool(run.underline is not None and run.underline)
368
+
369
+ return Formatting(
370
+ bold=has_bold,
371
+ italic=has_italic,
372
+ underline=has_underline,
303
373
  )
304
374
 
305
375
  def _get_paragraph_elements(self, paragraph: Paragraph):
@@ -355,6 +425,182 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
355
425
 
356
426
  return paragraph_elements
357
427
 
428
+ def _get_paragraph_position(self, paragraph_element):
429
+ """Extract vertical position information from paragraph element."""
430
+ # First try to directly get the index from w:p element that has an order-related attribute
431
+ if (
432
+ hasattr(paragraph_element, "getparent")
433
+ and paragraph_element.getparent() is not None
434
+ ):
435
+ parent = paragraph_element.getparent()
436
+ # Get all paragraph siblings
437
+ paragraphs = [
438
+ p for p in parent.getchildren() if etree.QName(p).localname == "p"
439
+ ]
440
+ # Find index of current paragraph within its siblings
441
+ try:
442
+ paragraph_index = paragraphs.index(paragraph_element)
443
+ return paragraph_index # Use index as position for consistent ordering
444
+ except ValueError:
445
+ pass
446
+
447
+ # Look for position hints in element attributes and ancestor elements
448
+ for elem in (*[paragraph_element], *paragraph_element.iterancestors()):
449
+ # Check for direct position attributes
450
+ for attr_name in ["y", "top", "positionY", "y-position", "position"]:
451
+ value = elem.get(attr_name)
452
+ if value:
453
+ try:
454
+ # Remove any non-numeric characters (like 'pt', 'px', etc.)
455
+ clean_value = re.sub(r"[^0-9.]", "", value)
456
+ if clean_value:
457
+ return float(clean_value)
458
+ except (ValueError, TypeError):
459
+ pass
460
+
461
+ # Check for position in transform attribute
462
+ transform = elem.get("transform")
463
+ if transform:
464
+ # Extract translation component from transform matrix
465
+ match = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
466
+ if match:
467
+ try:
468
+ return float(match.group(1))
469
+ except ValueError:
470
+ pass
471
+
472
+ # Check for anchors or relative position indicators in Word format
473
+ # 'dist' attributes can indicate relative positioning
474
+ for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
475
+ if elem.get(attr_name) is not None:
476
+ return elem.sourceline # Use the XML source line number as fallback
477
+
478
+ # For VML shapes, look for specific attributes
479
+ for ns_uri in paragraph_element.nsmap.values():
480
+ if "vml" in ns_uri:
481
+ # Try to extract position from style attribute
482
+ style = paragraph_element.get("style")
483
+ if style:
484
+ match = re.search(r"top:([0-9.]+)pt", style)
485
+ if match:
486
+ try:
487
+ return float(match.group(1))
488
+ except ValueError:
489
+ pass
490
+
491
+ # If no better position indicator found, use XML source line number as proxy for order
492
+ return (
493
+ paragraph_element.sourceline
494
+ if hasattr(paragraph_element, "sourceline")
495
+ else None
496
+ )
497
+
498
+ def _collect_textbox_paragraphs(self, textbox_elements):
499
+ """Collect and organize paragraphs from textbox elements."""
500
+ processed_paragraphs = []
501
+ container_paragraphs = {}
502
+
503
+ for element in textbox_elements:
504
+ element_id = id(element)
505
+ # Skip if we've already processed this exact element
506
+ if element_id in processed_paragraphs:
507
+ continue
508
+
509
+ tag_name = etree.QName(element).localname
510
+ processed_paragraphs.append(element_id)
511
+
512
+ # Handle paragraphs directly found (VML textboxes)
513
+ if tag_name == "p":
514
+ # Find the containing textbox or shape element
515
+ container_id = None
516
+ for ancestor in element.iterancestors():
517
+ if any(ns in ancestor.tag for ns in ["textbox", "shape", "txbx"]):
518
+ container_id = id(ancestor)
519
+ break
520
+
521
+ if container_id not in container_paragraphs:
522
+ container_paragraphs[container_id] = []
523
+ container_paragraphs[container_id].append(
524
+ (element, self._get_paragraph_position(element))
525
+ )
526
+
527
+ # Handle txbxContent elements (Word DrawingML textboxes)
528
+ elif tag_name == "txbxContent":
529
+ paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
530
+ container_id = id(element)
531
+ if container_id not in container_paragraphs:
532
+ container_paragraphs[container_id] = []
533
+
534
+ for p in paragraphs:
535
+ p_id = id(p)
536
+ if p_id not in processed_paragraphs:
537
+ processed_paragraphs.append(p_id)
538
+ container_paragraphs[container_id].append(
539
+ (p, self._get_paragraph_position(p))
540
+ )
541
+ else:
542
+ # Try to extract any paragraphs from unknown elements
543
+ paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
544
+ container_id = id(element)
545
+ if container_id not in container_paragraphs:
546
+ container_paragraphs[container_id] = []
547
+
548
+ for p in paragraphs:
549
+ p_id = id(p)
550
+ if p_id not in processed_paragraphs:
551
+ processed_paragraphs.append(p_id)
552
+ container_paragraphs[container_id].append(
553
+ (p, self._get_paragraph_position(p))
554
+ )
555
+
556
+ return container_paragraphs
557
+
558
+ def _handle_textbox_content(
559
+ self,
560
+ textbox_elements: list,
561
+ docx_obj: DocxDocument,
562
+ doc: DoclingDocument,
563
+ ) -> None:
564
+ """Process textbox content and add it to the document structure."""
565
+ level = self._get_level()
566
+ # Create a textbox group to contain all text from the textbox
567
+ textbox_group = doc.add_group(
568
+ label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
569
+ )
570
+
571
+ # Set this as the current parent to ensure textbox content
572
+ # is properly nested in document structure
573
+ original_parent = self.parents[level]
574
+ self.parents[level] = textbox_group
575
+
576
+ # Collect and organize paragraphs
577
+ container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)
578
+
579
+ # Process all paragraphs
580
+ all_paragraphs = []
581
+
582
+ # Sort paragraphs within each container, then process containers
583
+ for container_id, paragraphs in container_paragraphs.items():
584
+ # Sort by vertical position within each container
585
+ sorted_container_paragraphs = sorted(
586
+ paragraphs,
587
+ key=lambda x: (
588
+ x[1] is None,
589
+ x[1] if x[1] is not None else float("inf"),
590
+ ),
591
+ )
592
+
593
+ # Add the sorted paragraphs to our processing list
594
+ all_paragraphs.extend(sorted_container_paragraphs)
595
+
596
+ # Process all the paragraphs
597
+ for p, _ in all_paragraphs:
598
+ self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
599
+
600
+ # Restore original parent
601
+ self.parents[level] = original_parent
602
+ return
603
+
358
604
  def _handle_equations_in_text(self, element, text):
359
605
  only_texts = []
360
606
  only_equations = []
@@ -423,10 +669,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
423
669
  element: BaseOxmlElement,
424
670
  docx_obj: DocxDocument,
425
671
  doc: DoclingDocument,
672
+ is_from_textbox: bool = False,
426
673
  ) -> None:
427
674
  paragraph = Paragraph(element, docx_obj)
428
675
 
676
+ # Skip if from a textbox and this exact paragraph content was already processed
677
+ # Skip if from a textbox and this exact paragraph content was already processed
429
678
  raw_text = paragraph.text
679
+ if is_from_textbox and raw_text:
680
+ # Create a simple hash of content to detect duplicates
681
+ content_hash = f"{len(raw_text)}:{raw_text[:50]}"
682
+ if content_hash in self.processed_paragraph_content:
683
+ _log.debug(f"Skipping duplicate paragraph content: {content_hash}")
684
+ return
685
+ self.processed_paragraph_content.append(content_hash)
686
+
430
687
  text, equations = self._handle_equations_in_text(element=element, text=raw_text)
431
688
 
432
689
  if text is None:
@@ -175,13 +175,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
175
175
  if len(group) == 1:
176
176
  return group[0]
177
177
 
178
- merged_text = "".join(cell.text for cell in group)
179
178
  merged_bbox = BoundingBox(
180
179
  l=min(cell.rect.to_bounding_box().l for cell in group),
181
180
  t=min(cell.rect.to_bounding_box().t for cell in group),
182
181
  r=max(cell.rect.to_bounding_box().r for cell in group),
183
182
  b=max(cell.rect.to_bounding_box().b for cell in group),
184
183
  )
184
+
185
+ assert self._ppage is not None
186
+ self.text_page = self._ppage.get_textpage()
187
+ bbox = merged_bbox.to_bottom_left_origin(page_size.height)
188
+ merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
189
+
185
190
  return TextCell(
186
191
  index=group[0].index,
187
192
  text=merged_text,
@@ -90,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
90
90
  "image/tiff",
91
91
  "image/gif",
92
92
  "image/bmp",
93
+ "image/webp",
93
94
  ],
94
95
  InputFormat.PDF: ["application/pdf"],
95
96
  InputFormat.ASCIIDOC: ["text/asciidoc"],
@@ -302,7 +302,7 @@ class _DocumentConversionInput(BaseModel):
302
302
  if ("." in obj.name and not obj.name.startswith("."))
303
303
  else ""
304
304
  )
305
- mime = _DocumentConversionInput._mime_from_extension(ext)
305
+ mime = _DocumentConversionInput._mime_from_extension(ext.lower())
306
306
  if mime is not None and mime.lower() == "application/zip":
307
307
  objname = obj.name.lower()
308
308
  if objname.endswith(".xlsx"):
@@ -376,6 +376,13 @@ class _DocumentConversionInput(BaseModel):
376
376
  mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
377
377
  elif ext in FormatToExtensions[InputFormat.PDF]:
378
378
  mime = FormatToMimeType[InputFormat.PDF][0]
379
+ elif ext in FormatToExtensions[InputFormat.DOCX]:
380
+ mime = FormatToMimeType[InputFormat.DOCX][0]
381
+ elif ext in FormatToExtensions[InputFormat.PPTX]:
382
+ mime = FormatToMimeType[InputFormat.PPTX][0]
383
+ elif ext in FormatToExtensions[InputFormat.XLSX]:
384
+ mime = FormatToMimeType[InputFormat.XLSX][0]
385
+
379
386
  return mime
380
387
 
381
388
  @staticmethod
@@ -225,6 +225,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
225
225
  headers: Dict[str, str] = {}
226
226
  params: Dict[str, Any] = {}
227
227
  timeout: float = 20
228
+ concurrency: int = 1
228
229
 
229
230
  prompt: str = "Describe this image in a few sentences."
230
231
  provenance: str = ""
@@ -295,6 +296,7 @@ class ApiVlmOptions(BaseVlmOptions):
295
296
  params: Dict[str, Any] = {}
296
297
  scale: float = 2.0
297
298
  timeout: float = 60
299
+ concurrency: int = 1
298
300
  response_format: ResponseFormat
299
301
 
300
302
 
@@ -56,13 +56,15 @@ class DebugSettings(BaseModel):
56
56
 
57
57
 
58
58
  class AppSettings(BaseSettings):
59
- model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
59
+ model_config = SettingsConfigDict(
60
+ env_prefix="DOCLING_", env_nested_delimiter="_", env_nested_max_split=1
61
+ )
60
62
 
61
- perf: BatchConcurrencySettings
62
- debug: DebugSettings
63
+ perf: BatchConcurrencySettings = BatchConcurrencySettings()
64
+ debug: DebugSettings = DebugSettings()
63
65
 
64
66
  cache_dir: Path = Path.home() / ".cache" / "docling"
65
67
  artifacts_path: Optional[Path] = None
66
68
 
67
69
 
68
- settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
70
+ settings = AppSettings()
@@ -1,4 +1,5 @@
1
1
  from collections.abc import Iterable
2
+ from concurrent.futures import ThreadPoolExecutor
2
3
 
3
4
  from docling.datamodel.base_models import Page, VlmPrediction
4
5
  from docling.datamodel.document import ConversionResult
@@ -27,6 +28,7 @@ class ApiVlmModel(BasePageModel):
27
28
  )
28
29
 
29
30
  self.timeout = self.vlm_options.timeout
31
+ self.concurrency = self.vlm_options.concurrency
30
32
  self.prompt_content = (
31
33
  f"This is a page from a document.\n{self.vlm_options.prompt}"
32
34
  )
@@ -38,10 +40,10 @@ class ApiVlmModel(BasePageModel):
38
40
  def __call__(
39
41
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
40
42
  ) -> Iterable[Page]:
41
- for page in page_batch:
43
+ def _vlm_request(page):
42
44
  assert page._backend is not None
43
45
  if not page._backend.is_valid():
44
- yield page
46
+ return page
45
47
  else:
46
48
  with TimeRecorder(conv_res, "vlm"):
47
49
  assert page.size is not None
@@ -63,4 +65,7 @@ class ApiVlmModel(BasePageModel):
63
65
 
64
66
  page.predictions.vlm_response = VlmPrediction(text=page_tags)
65
67
 
66
- yield page
68
+ return page
69
+
70
+ with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
71
+ yield from executor.map(_vlm_request, page_batch)
@@ -1,4 +1,5 @@
1
1
  from collections.abc import Iterable
2
+ from concurrent.futures import ThreadPoolExecutor
2
3
  from pathlib import Path
3
4
  from typing import Optional, Type, Union
4
5
 
@@ -37,6 +38,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
37
38
  accelerator_options=accelerator_options,
38
39
  )
39
40
  self.options: PictureDescriptionApiOptions
41
+ self.concurrency = self.options.concurrency
40
42
 
41
43
  if self.enabled:
42
44
  if not enable_remote_services:
@@ -48,8 +50,8 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
48
50
  def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
49
51
  # Note: technically we could make a batch request here,
50
52
  # but not all APIs will allow for it. For example, vllm won't allow more than 1.
51
- for image in images:
52
- yield api_image_request(
53
+ def _api_request(image):
54
+ return api_image_request(
53
55
  image=image,
54
56
  prompt=self.options.prompt,
55
57
  url=self.options.url,
@@ -57,3 +59,6 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
57
59
  headers=self.options.headers,
58
60
  **self.options.params,
59
61
  )
62
+
63
+ with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
64
+ yield from executor.map(_api_request, images)
@@ -249,7 +249,7 @@ class TesseractOcrCliModel(BaseOcrModel):
249
249
  cell = TextCell(
250
250
  index=ix,
251
251
  text=str(text),
252
- orig=text,
252
+ orig=str(text),
253
253
  from_ocr=True,
254
254
  confidence=conf / 100.0,
255
255
  rect=BoundingRectangle.from_bounding_box(
@@ -3,7 +3,7 @@ from io import BytesIO
3
3
  from pathlib import Path
4
4
  from typing import List, Optional, Union, cast
5
5
 
6
- # from docling_core.types import DoclingDocument
6
+ from docling_core.types import DoclingDocument
7
7
  from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
8
8
  from docling_core.types.doc.document import DocTagsDocument
9
9
  from PIL import Image as PILImage
@@ -133,28 +133,26 @@ class VlmPipeline(PaginatedPipeline):
133
133
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
134
134
  doctags_list_c, image_list_c
135
135
  )
136
- conv_res.document.load_from_doctags(doctags_doc)
136
+ conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
137
137
 
138
138
  # If forced backend text, replace model predicted text with backend one
139
- if page.size:
140
- if self.force_backend_text:
141
- scale = self.pipeline_options.images_scale
142
- for element, _level in conv_res.document.iterate_items():
143
- if (
144
- not isinstance(element, TextItem)
145
- or len(element.prov) == 0
146
- ):
147
- continue
148
- crop_bbox = (
149
- element.prov[0]
150
- .bbox.scaled(scale=scale)
151
- .to_top_left_origin(
152
- page_height=page.size.height * scale
153
- )
154
- )
155
- txt = self.extract_text_from_backend(page, crop_bbox)
156
- element.text = txt
157
- element.orig = txt
139
+ if self.force_backend_text:
140
+ scale = self.pipeline_options.images_scale
141
+ for element, _level in conv_res.document.iterate_items():
142
+ if not isinstance(element, TextItem) or len(element.prov) == 0:
143
+ continue
144
+ page_ix = element.prov[0].page_no - 1
145
+ page = conv_res.pages[page_ix]
146
+ if not page.size:
147
+ continue
148
+ crop_bbox = (
149
+ element.prov[0]
150
+ .bbox.scaled(scale=scale)
151
+ .to_top_left_origin(page_height=page.size.height * scale)
152
+ )
153
+ txt = self.extract_text_from_backend(page, crop_bbox)
154
+ element.text = txt
155
+ element.orig = txt
158
156
  elif (
159
157
  self.pipeline_options.vlm_options.response_format
160
158
  == ResponseFormat.MARKDOWN
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.31.2" # DO NOT EDIT, updated automatically
3
+ version = "2.33.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = [
6
6
  "Christoph Auer <cau@zurich.ibm.com>",
@@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
46
46
  ######################
47
47
  python = "^3.9"
48
48
  pydantic = "^2.0.0"
49
- docling-core = {version = "^2.26.0", extras = ["chunking"]}
49
+ docling-core = {version = "^2.29.0", extras = ["chunking"]}
50
50
  docling-ibm-models = "^3.4.0"
51
51
  docling-parse = "^4.0.0"
52
52
  filetype = "^1.2.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes