docling 2.15.0__tar.gz → 2.15.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {docling-2.15.0 → docling-2.15.1}/PKG-INFO +3 -4
  2. {docling-2.15.0 → docling-2.15.1}/README.md +1 -2
  3. {docling-2.15.0 → docling-2.15.1}/docling/backend/docling_parse_backend.py +1 -1
  4. {docling-2.15.0 → docling-2.15.1}/docling/backend/docling_parse_v2_backend.py +1 -1
  5. {docling-2.15.0 → docling-2.15.1}/docling/backend/pypdfium2_backend.py +1 -1
  6. {docling-2.15.0 → docling-2.15.1}/docling/datamodel/pipeline_options.py +1 -1
  7. {docling-2.15.0 → docling-2.15.1}/docling/models/base_ocr_model.py +15 -12
  8. {docling-2.15.0 → docling-2.15.1}/pyproject.toml +2 -2
  9. {docling-2.15.0 → docling-2.15.1}/LICENSE +0 -0
  10. {docling-2.15.0 → docling-2.15.1}/docling/__init__.py +0 -0
  11. {docling-2.15.0 → docling-2.15.1}/docling/backend/__init__.py +0 -0
  12. {docling-2.15.0 → docling-2.15.1}/docling/backend/abstract_backend.py +0 -0
  13. {docling-2.15.0 → docling-2.15.1}/docling/backend/asciidoc_backend.py +0 -0
  14. {docling-2.15.0 → docling-2.15.1}/docling/backend/html_backend.py +0 -0
  15. {docling-2.15.0 → docling-2.15.1}/docling/backend/md_backend.py +0 -0
  16. {docling-2.15.0 → docling-2.15.1}/docling/backend/msexcel_backend.py +0 -0
  17. {docling-2.15.0 → docling-2.15.1}/docling/backend/mspowerpoint_backend.py +0 -0
  18. {docling-2.15.0 → docling-2.15.1}/docling/backend/msword_backend.py +0 -0
  19. {docling-2.15.0 → docling-2.15.1}/docling/backend/pdf_backend.py +0 -0
  20. {docling-2.15.0 → docling-2.15.1}/docling/backend/xml/__init__.py +0 -0
  21. {docling-2.15.0 → docling-2.15.1}/docling/backend/xml/pubmed_backend.py +0 -0
  22. {docling-2.15.0 → docling-2.15.1}/docling/backend/xml/uspto_backend.py +0 -0
  23. {docling-2.15.0 → docling-2.15.1}/docling/chunking/__init__.py +0 -0
  24. {docling-2.15.0 → docling-2.15.1}/docling/cli/__init__.py +0 -0
  25. {docling-2.15.0 → docling-2.15.1}/docling/cli/main.py +0 -0
  26. {docling-2.15.0 → docling-2.15.1}/docling/datamodel/__init__.py +0 -0
  27. {docling-2.15.0 → docling-2.15.1}/docling/datamodel/base_models.py +0 -0
  28. {docling-2.15.0 → docling-2.15.1}/docling/datamodel/document.py +0 -0
  29. {docling-2.15.0 → docling-2.15.1}/docling/datamodel/settings.py +0 -0
  30. {docling-2.15.0 → docling-2.15.1}/docling/document_converter.py +0 -0
  31. {docling-2.15.0 → docling-2.15.1}/docling/exceptions.py +0 -0
  32. {docling-2.15.0 → docling-2.15.1}/docling/models/__init__.py +0 -0
  33. {docling-2.15.0 → docling-2.15.1}/docling/models/base_model.py +0 -0
  34. {docling-2.15.0 → docling-2.15.1}/docling/models/ds_glm_model.py +0 -0
  35. {docling-2.15.0 → docling-2.15.1}/docling/models/easyocr_model.py +0 -0
  36. {docling-2.15.0 → docling-2.15.1}/docling/models/layout_model.py +0 -0
  37. {docling-2.15.0 → docling-2.15.1}/docling/models/ocr_mac_model.py +0 -0
  38. {docling-2.15.0 → docling-2.15.1}/docling/models/page_assemble_model.py +0 -0
  39. {docling-2.15.0 → docling-2.15.1}/docling/models/page_preprocessing_model.py +0 -0
  40. {docling-2.15.0 → docling-2.15.1}/docling/models/rapid_ocr_model.py +0 -0
  41. {docling-2.15.0 → docling-2.15.1}/docling/models/table_structure_model.py +0 -0
  42. {docling-2.15.0 → docling-2.15.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  43. {docling-2.15.0 → docling-2.15.1}/docling/models/tesseract_ocr_model.py +0 -0
  44. {docling-2.15.0 → docling-2.15.1}/docling/pipeline/__init__.py +0 -0
  45. {docling-2.15.0 → docling-2.15.1}/docling/pipeline/base_pipeline.py +0 -0
  46. {docling-2.15.0 → docling-2.15.1}/docling/pipeline/simple_pipeline.py +0 -0
  47. {docling-2.15.0 → docling-2.15.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  48. {docling-2.15.0 → docling-2.15.1}/docling/py.typed +0 -0
  49. {docling-2.15.0 → docling-2.15.1}/docling/utils/__init__.py +0 -0
  50. {docling-2.15.0 → docling-2.15.1}/docling/utils/accelerator_utils.py +0 -0
  51. {docling-2.15.0 → docling-2.15.1}/docling/utils/export.py +0 -0
  52. {docling-2.15.0 → docling-2.15.1}/docling/utils/glm_utils.py +0 -0
  53. {docling-2.15.0 → docling-2.15.1}/docling/utils/layout_postprocessor.py +0 -0
  54. {docling-2.15.0 → docling-2.15.1}/docling/utils/profiling.py +0 -0
  55. {docling-2.15.0 → docling-2.15.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.15.0
3
+ Version: 2.15.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -45,7 +45,7 @@ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
45
45
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
46
46
  Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
47
47
  Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
48
- Requires-Dist: requests (>=2.32.3,<3.0.0)
48
+ Requires-Dist: requests (>=2.32.2,<3.0.0)
49
49
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
50
50
  Requires-Dist: scipy (>=1.6.0,<2.0.0)
51
51
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
@@ -84,7 +84,7 @@ Docling parses documents and exports them to the desired format with ease and sp
84
84
  * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
85
85
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
86
86
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
87
- * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
87
+ * 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
88
88
  * 🔍 OCR support for scanned PDFs
89
89
  * 💻 Simple and convenient CLI
90
90
 
@@ -94,7 +94,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
94
94
 
95
95
  * ♾️ Equation & code extraction
96
96
  * 📝 Metadata extraction, including title, authors, references & language
97
- * 🦜🔗 Native LangChain extension
98
97
 
99
98
  ## Installation
100
99
 
@@ -29,7 +29,7 @@ Docling parses documents and exports them to the desired format with ease and sp
29
29
  * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
30
30
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
31
31
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
32
- * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
32
+ * 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
33
33
  * 🔍 OCR support for scanned PDFs
34
34
  * 💻 Simple and convenient CLI
35
35
 
@@ -39,7 +39,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
39
39
 
40
40
  * ♾️ Equation & code extraction
41
41
  * 📝 Metadata extraction, including title, authors, references & language
42
- * 🦜🔗 Native LangChain extension
43
42
 
44
43
  ## Installation
45
44
 
@@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
132
132
  return cells
133
133
 
134
134
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
135
- AREA_THRESHOLD = 32 * 32
135
+ AREA_THRESHOLD = 0 # 32 * 32
136
136
 
137
137
  for i in range(len(self._dpage["images"])):
138
138
  bitmap = self._dpage["images"][i]
@@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
140
140
  return cells
141
141
 
142
142
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
143
- AREA_THRESHOLD = 32 * 32
143
+ AREA_THRESHOLD = 0 # 32 * 32
144
144
 
145
145
  images = self._dpage["sanitized"]["images"]["data"]
146
146
  images_header = self._dpage["sanitized"]["images"]["header"]
@@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
39
39
  return self.valid
40
40
 
41
41
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
42
- AREA_THRESHOLD = 32 * 32
42
+ AREA_THRESHOLD = 0 # 32 * 32
43
43
  for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
44
44
  pos = obj.get_pos()
45
45
  cropbox = BoundingBox.from_tuple(
@@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions):
139
139
 
140
140
  use_gpu: Optional[bool] = None
141
141
 
142
- confidence_threshold: float = 0.65
142
+ confidence_threshold: float = 0.5
143
143
 
144
144
  model_storage_directory: Optional[str] = None
145
145
  recog_network: Optional[str] = "standard"
@@ -8,7 +8,7 @@ import numpy as np
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
9
  from PIL import Image, ImageDraw
10
10
  from rtree import index
11
- from scipy.ndimage import find_objects, label
11
+ from scipy.ndimage import binary_dilation, find_objects, label
12
12
 
13
13
  from docling.datamodel.base_models import Cell, OcrCell, Page
14
14
  from docling.datamodel.document import ConversionResult
@@ -43,6 +43,12 @@ class BaseOcrModel(BasePageModel):
43
43
 
44
44
  np_image = np.array(image)
45
45
 
46
+ # Dilate the image by 10 pixels to merge nearby bitmap rectangles
47
+ structure = np.ones(
48
+ (20, 20)
49
+ ) # Create a 20x20 structure element (10 pixels in all directions)
50
+ np_image = binary_dilation(np_image > 0, structure=structure)
51
+
46
52
  # Find the connected components
47
53
  labeled_image, num_features = label(
48
54
  np_image > 0
@@ -72,7 +78,7 @@ class BaseOcrModel(BasePageModel):
72
78
  bitmap_rects = []
73
79
  coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
74
80
 
75
- # return full-page rectangle if sufficiently covered with bitmaps
81
+ # return full-page rectangle if page is dominantly covered with bitmaps
76
82
  if self.options.force_full_page_ocr or coverage > max(
77
83
  BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
78
84
  ):
@@ -85,17 +91,11 @@ class BaseOcrModel(BasePageModel):
85
91
  coord_origin=CoordOrigin.TOPLEFT,
86
92
  )
87
93
  ]
88
- # return individual rectangles if the bitmap coverage is smaller
89
- else: # coverage <= BITMAP_COVERAGE_TRESHOLD:
90
-
91
- # skip OCR if the bitmap area on the page is smaller than the options threshold
92
- ocr_rects = [
93
- rect
94
- for rect in ocr_rects
95
- if rect.area() / (page.size.width * page.size.height)
96
- > self.options.bitmap_area_threshold
97
- ]
94
+ # return individual rectangles if the bitmap coverage is above the threshold
95
+ elif coverage > self.options.bitmap_area_threshold:
98
96
  return ocr_rects
97
+ else: # overall coverage of bitmaps is too low, drop all bitmap rectangles.
98
+ return []
99
99
 
100
100
  # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
101
101
  def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
@@ -162,6 +162,9 @@ class BaseOcrModel(BasePageModel):
162
162
  x0 *= scale_x
163
163
  x1 *= scale_x
164
164
 
165
+ if y1 <= y0:
166
+ y1, y0 = y0, y1
167
+
165
168
  color = "gray"
166
169
  if isinstance(tc, OcrCell):
167
170
  color = "magenta"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.15.0" # DO NOT EDIT, updated automatically
3
+ version = "2.15.1" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -34,7 +34,7 @@ filetype = "^1.2.0"
34
34
  pypdfium2 = "^4.30.0"
35
35
  pydantic-settings = "^2.3.0"
36
36
  huggingface_hub = ">=0.23,<1"
37
- requests = "^2.32.3"
37
+ requests = "^2.32.2"
38
38
  easyocr = "^1.7"
39
39
  tesserocr = { version = "^2.7.1", optional = true }
40
40
  certifi = ">=2024.7.4"
File without changes
File without changes
File without changes
File without changes
File without changes