docling 2.4.1__tar.gz → 2.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {docling-2.4.1 → docling-2.5.0}/PKG-INFO +1 -1
  2. {docling-2.4.1 → docling-2.5.0}/docling/backend/html_backend.py +12 -0
  3. {docling-2.4.1 → docling-2.5.0}/docling/backend/mspowerpoint_backend.py +17 -22
  4. {docling-2.4.1 → docling-2.5.0}/docling/cli/main.py +10 -3
  5. {docling-2.4.1 → docling-2.5.0}/docling/datamodel/pipeline_options.py +1 -0
  6. {docling-2.4.1 → docling-2.5.0}/docling/datamodel/settings.py +3 -1
  7. {docling-2.4.1 → docling-2.5.0}/docling/models/base_ocr_model.py +22 -3
  8. {docling-2.4.1 → docling-2.5.0}/docling/models/easyocr_model.py +5 -7
  9. {docling-2.4.1 → docling-2.5.0}/docling/models/tesseract_ocr_cli_model.py +3 -7
  10. {docling-2.4.1 → docling-2.5.0}/docling/models/tesseract_ocr_model.py +3 -7
  11. {docling-2.4.1 → docling-2.5.0}/pyproject.toml +1 -1
  12. {docling-2.4.1 → docling-2.5.0}/LICENSE +0 -0
  13. {docling-2.4.1 → docling-2.5.0}/README.md +0 -0
  14. {docling-2.4.1 → docling-2.5.0}/docling/__init__.py +0 -0
  15. {docling-2.4.1 → docling-2.5.0}/docling/backend/__init__.py +0 -0
  16. {docling-2.4.1 → docling-2.5.0}/docling/backend/abstract_backend.py +0 -0
  17. {docling-2.4.1 → docling-2.5.0}/docling/backend/asciidoc_backend.py +0 -0
  18. {docling-2.4.1 → docling-2.5.0}/docling/backend/docling_parse_backend.py +0 -0
  19. {docling-2.4.1 → docling-2.5.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  20. {docling-2.4.1 → docling-2.5.0}/docling/backend/md_backend.py +0 -0
  21. {docling-2.4.1 → docling-2.5.0}/docling/backend/msword_backend.py +0 -0
  22. {docling-2.4.1 → docling-2.5.0}/docling/backend/pdf_backend.py +0 -0
  23. {docling-2.4.1 → docling-2.5.0}/docling/backend/pypdfium2_backend.py +0 -0
  24. {docling-2.4.1 → docling-2.5.0}/docling/cli/__init__.py +0 -0
  25. {docling-2.4.1 → docling-2.5.0}/docling/datamodel/__init__.py +0 -0
  26. {docling-2.4.1 → docling-2.5.0}/docling/datamodel/base_models.py +0 -0
  27. {docling-2.4.1 → docling-2.5.0}/docling/datamodel/document.py +0 -0
  28. {docling-2.4.1 → docling-2.5.0}/docling/document_converter.py +0 -0
  29. {docling-2.4.1 → docling-2.5.0}/docling/models/__init__.py +0 -0
  30. {docling-2.4.1 → docling-2.5.0}/docling/models/base_model.py +0 -0
  31. {docling-2.4.1 → docling-2.5.0}/docling/models/ds_glm_model.py +0 -0
  32. {docling-2.4.1 → docling-2.5.0}/docling/models/layout_model.py +0 -0
  33. {docling-2.4.1 → docling-2.5.0}/docling/models/page_assemble_model.py +0 -0
  34. {docling-2.4.1 → docling-2.5.0}/docling/models/page_preprocessing_model.py +0 -0
  35. {docling-2.4.1 → docling-2.5.0}/docling/models/table_structure_model.py +0 -0
  36. {docling-2.4.1 → docling-2.5.0}/docling/pipeline/__init__.py +0 -0
  37. {docling-2.4.1 → docling-2.5.0}/docling/pipeline/base_pipeline.py +0 -0
  38. {docling-2.4.1 → docling-2.5.0}/docling/pipeline/simple_pipeline.py +0 -0
  39. {docling-2.4.1 → docling-2.5.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  40. {docling-2.4.1 → docling-2.5.0}/docling/utils/__init__.py +0 -0
  41. {docling-2.4.1 → docling-2.5.0}/docling/utils/export.py +0 -0
  42. {docling-2.4.1 → docling-2.5.0}/docling/utils/layout_utils.py +0 -0
  43. {docling-2.4.1 → docling-2.5.0}/docling/utils/profiling.py +0 -0
  44. {docling-2.4.1 → docling-2.5.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.4.1
3
+ Version: 2.5.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -120,6 +120,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
120
120
  self.handle_header(element, idx, doc)
121
121
  elif element.name in ["p"]:
122
122
  self.handle_paragraph(element, idx, doc)
123
+ elif element.name in ["pre"]:
124
+ self.handle_code(element, idx, doc)
123
125
  elif element.name in ["ul", "ol"]:
124
126
  self.handle_list(element, idx, doc)
125
127
  elif element.name in ["li"]:
@@ -205,6 +207,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
205
207
  level=hlevel,
206
208
  )
207
209
 
210
+ def handle_code(self, element, idx, doc):
211
+ """Handles monospace code snippets (pre)."""
212
+ if element.text is None:
213
+ return
214
+ text = element.text.strip()
215
+ label = DocItemLabel.CODE
216
+ if len(text) == 0:
217
+ return
218
+ doc.add_text(parent=self.parents[self.level], label=label, text=text)
219
+
208
220
  def handle_paragraph(self, element, idx, doc):
209
221
  """Handles paragraph tags (p)."""
210
222
  if element.text is None:
@@ -358,41 +358,36 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
358
358
 
359
359
  size = Size(width=slide_width, height=slide_height)
360
360
  parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
361
- # parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
362
-
363
- # Loop through each shape in the slide
364
- for shape in slide.shapes:
365
361
 
362
+ def handle_shapes(shape, parent_slide, slide_ind, doc):
363
+ handle_groups(shape, parent_slide, slide_ind, doc)
366
364
  if shape.has_table:
367
365
  # Handle Tables
368
366
  self.handle_tables(shape, parent_slide, slide_ind, doc)
369
-
370
367
  if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
371
- # Handle Tables
368
+ # Handle Pictures
372
369
  self.handle_pictures(shape, parent_slide, slide_ind, doc)
373
-
374
370
  # If shape doesn't have any text, move on to the next shape
375
371
  if not hasattr(shape, "text"):
376
- continue
372
+ return
377
373
  if shape.text is None:
378
- continue
374
+ return
379
375
  if len(shape.text.strip()) == 0:
380
- continue
376
+ return
381
377
  if not shape.has_text_frame:
382
- _log.warn("Warning: shape has text but not text_frame")
383
- continue
384
-
385
- # if shape.is_placeholder:
386
- # Handle Titles (Headers) and Subtitles
387
- # Check if the shape is a placeholder (titles are placeholders)
388
- # self.handle_title(shape, parent_slide, slide_ind, doc)
389
- # self.handle_text_elements(shape, parent_slide, slide_ind, doc)
390
- # else:
391
-
378
+ _log.warning("Warning: shape has text but not text_frame")
379
+ return
392
380
  # Handle other text elements, including lists (bullet lists, numbered lists)
393
381
  self.handle_text_elements(shape, parent_slide, slide_ind, doc)
382
+ return
383
+
384
+ def handle_groups(shape, parent_slide, slide_ind, doc):
385
+ if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
386
+ for groupedshape in shape.shapes:
387
+ handle_shapes(groupedshape, parent_slide, slide_ind, doc)
394
388
 
395
- # figures...
396
- # doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
389
+ # Loop through each shape in the slide
390
+ for shape in slide.shapes:
391
+ handle_shapes(shape, parent_slide, slide_ind, doc)
397
392
 
398
393
  return doc
@@ -153,6 +153,13 @@ def convert(
153
153
  ..., help="If enabled, the bitmap content will be processed using OCR."
154
154
  ),
155
155
  ] = True,
156
+ force_ocr: Annotated[
157
+ bool,
158
+ typer.Option(
159
+ ...,
160
+ help="Replace any existing text with OCR generated text over the full content.",
161
+ ),
162
+ ] = False,
156
163
  ocr_engine: Annotated[
157
164
  OcrEngine, typer.Option(..., help="The OCR engine to use.")
158
165
  ] = OcrEngine.EASYOCR,
@@ -219,11 +226,11 @@ def convert(
219
226
 
220
227
  match ocr_engine:
221
228
  case OcrEngine.EASYOCR:
222
- ocr_options: OcrOptions = EasyOcrOptions()
229
+ ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
223
230
  case OcrEngine.TESSERACT_CLI:
224
- ocr_options = TesseractCliOcrOptions()
231
+ ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
225
232
  case OcrEngine.TESSERACT:
226
- ocr_options = TesseractOcrOptions()
233
+ ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
227
234
  case _:
228
235
  raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
229
236
 
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
22
22
 
23
23
  class OcrOptions(BaseModel):
24
24
  kind: str
25
+ force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
25
26
  bitmap_area_threshold: float = (
26
27
  0.05 # percentage of the area for a bitmap to be processed with OCR
27
28
  )
@@ -2,7 +2,7 @@ import sys
2
2
  from pathlib import Path
3
3
 
4
4
  from pydantic import BaseModel
5
- from pydantic_settings import BaseSettings
5
+ from pydantic_settings import BaseSettings, SettingsConfigDict
6
6
 
7
7
 
8
8
  class DocumentLimits(BaseModel):
@@ -40,6 +40,8 @@ class DebugSettings(BaseModel):
40
40
 
41
41
 
42
42
  class AppSettings(BaseSettings):
43
+ model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
44
+
43
45
  perf: BatchConcurrencySettings
44
46
  debug: DebugSettings
45
47
 
@@ -10,7 +10,7 @@ from PIL import Image, ImageDraw
10
10
  from rtree import index
11
11
  from scipy.ndimage import find_objects, label
12
12
 
13
- from docling.datamodel.base_models import OcrCell, Page
13
+ from docling.datamodel.base_models import Cell, OcrCell, Page
14
14
  from docling.datamodel.document import ConversionResult
15
15
  from docling.datamodel.pipeline_options import OcrOptions
16
16
  from docling.datamodel.settings import settings
@@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel):
73
73
  coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
74
74
 
75
75
  # return full-page rectangle if sufficiently covered with bitmaps
76
- if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
76
+ if self.options.force_full_page_ocr or coverage > max(
77
+ BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
78
+ ):
77
79
  return [
78
80
  BoundingBox(
79
81
  l=0,
@@ -96,7 +98,7 @@ class BaseOcrModel(BasePageModel):
96
98
  return ocr_rects
97
99
 
98
100
  # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
99
- def filter_ocr_cells(self, ocr_cells, programmatic_cells):
101
+ def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
100
102
  # Create R-tree index for programmatic cells
101
103
  p = index.Property()
102
104
  p.dimension = 2
@@ -117,6 +119,23 @@ class BaseOcrModel(BasePageModel):
117
119
  ]
118
120
  return filtered_ocr_cells
119
121
 
122
+ def post_process_cells(self, ocr_cells, programmatic_cells):
123
+ r"""
124
+ Post-process the ocr and programmatic cells and return the final list of cells
125
+ """
126
+ if self.options.force_full_page_ocr:
127
+ # If a full page OCR is forced, use only the OCR cells
128
+ cells = [
129
+ Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
130
+ for c_ocr in ocr_cells
131
+ ]
132
+ return cells
133
+
134
+ ## Remove OCR cells which overlap with programmatic cells.
135
+ filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
136
+ programmatic_cells.extend(filtered_ocr_cells)
137
+ return programmatic_cells
138
+
120
139
  def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
121
140
  image = copy.deepcopy(page.image)
122
141
  draw = ImageDraw.Draw(image, "RGBA")
@@ -2,9 +2,10 @@ import logging
2
2
  from typing import Iterable
3
3
 
4
4
  import numpy
5
+ import torch
5
6
  from docling_core.types.doc import BoundingBox, CoordOrigin
6
7
 
7
- from docling.datamodel.base_models import OcrCell, Page
8
+ from docling.datamodel.base_models import Cell, OcrCell, Page
8
9
  from docling.datamodel.document import ConversionResult
9
10
  from docling.datamodel.pipeline_options import EasyOcrOptions
10
11
  from docling.datamodel.settings import settings
@@ -32,6 +33,7 @@ class EasyOcrModel(BaseOcrModel):
32
33
 
33
34
  self.reader = easyocr.Reader(
34
35
  lang_list=self.options.lang,
36
+ gpu=self.options.use_gpu,
35
37
  model_storage_directory=self.options.model_storage_directory,
36
38
  download_enabled=self.options.download_enabled,
37
39
  )
@@ -86,12 +88,8 @@ class EasyOcrModel(BaseOcrModel):
86
88
  ]
87
89
  all_ocr_cells.extend(cells)
88
90
 
89
- ## Remove OCR cells which overlap with programmatic cells.
90
- filtered_ocr_cells = self.filter_ocr_cells(
91
- all_ocr_cells, page.cells
92
- )
93
-
94
- page.cells.extend(filtered_ocr_cells)
91
+ # Post-process the cells
92
+ page.cells = self.post_process_cells(all_ocr_cells, page.cells)
95
93
 
96
94
  # DEBUG code:
97
95
  if settings.debug.visualize_ocr:
@@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple
7
7
  import pandas as pd
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
9
 
10
- from docling.datamodel.base_models import OcrCell, Page
10
+ from docling.datamodel.base_models import Cell, OcrCell, Page
11
11
  from docling.datamodel.document import ConversionResult
12
12
  from docling.datamodel.pipeline_options import TesseractCliOcrOptions
13
13
  from docling.datamodel.settings import settings
@@ -170,12 +170,8 @@ class TesseractOcrCliModel(BaseOcrModel):
170
170
  )
171
171
  all_ocr_cells.append(cell)
172
172
 
173
- ## Remove OCR cells which overlap with programmatic cells.
174
- filtered_ocr_cells = self.filter_ocr_cells(
175
- all_ocr_cells, page.cells
176
- )
177
-
178
- page.cells.extend(filtered_ocr_cells)
173
+ # Post-process the cells
174
+ page.cells = self.post_process_cells(all_ocr_cells, page.cells)
179
175
 
180
176
  # DEBUG code:
181
177
  if settings.debug.visualize_ocr:
@@ -3,7 +3,7 @@ from typing import Iterable
3
3
 
4
4
  from docling_core.types.doc import BoundingBox, CoordOrigin
5
5
 
6
- from docling.datamodel.base_models import OcrCell, Page
6
+ from docling.datamodel.base_models import Cell, OcrCell, Page
7
7
  from docling.datamodel.document import ConversionResult
8
8
  from docling.datamodel.pipeline_options import TesseractOcrOptions
9
9
  from docling.datamodel.settings import settings
@@ -140,12 +140,8 @@ class TesseractOcrModel(BaseOcrModel):
140
140
  # del high_res_image
141
141
  all_ocr_cells.extend(cells)
142
142
 
143
- ## Remove OCR cells which overlap with programmatic cells.
144
- filtered_ocr_cells = self.filter_ocr_cells(
145
- all_ocr_cells, page.cells
146
- )
147
-
148
- page.cells.extend(filtered_ocr_cells)
143
+ # Post-process the cells
144
+ page.cells = self.post_process_cells(all_ocr_cells, page.cells)
149
145
 
150
146
  # DEBUG code:
151
147
  if settings.debug.visualize_ocr:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.4.1" # DO NOT EDIT, updated automatically
3
+ version = "2.5.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes