docling 2.7.1__py3-none-any.whl → 2.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -507,18 +507,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
507
507
 
508
508
  image_data = get_docx_image(element, drawing_blip)
509
509
  image_bytes = BytesIO(image_data)
510
+ level = self.get_level()
510
511
  # Open the BytesIO object with PIL to create an Image
511
512
  try:
512
513
  pil_image = Image.open(image_bytes)
513
514
  doc.add_picture(
514
- parent=self.parents[self.level],
515
+ parent=self.parents[level - 1],
515
516
  image=ImageRef.from_pil(image=pil_image, dpi=72),
516
517
  caption=None,
517
518
  )
518
519
  except (UnidentifiedImageError, OSError) as e:
519
520
  _log.warning("Warning: image cannot be loaded by Pillow")
520
521
  doc.add_picture(
521
- parent=self.parents[self.level],
522
+ parent=self.parents[level - 1],
522
523
  caption=None,
523
524
  )
524
525
  return
docling/cli/main.py CHANGED
@@ -27,6 +27,7 @@ from docling.datamodel.pipeline_options import (
27
27
  OcrMacOptions,
28
28
  OcrOptions,
29
29
  PdfPipelineOptions,
30
+ RapidOcrOptions,
30
31
  TableFormerMode,
31
32
  TesseractCliOcrOptions,
32
33
  TesseractOcrOptions,
@@ -76,6 +77,7 @@ class OcrEngine(str, Enum):
76
77
  TESSERACT_CLI = "tesseract_cli"
77
78
  TESSERACT = "tesseract"
78
79
  OCRMAC = "ocrmac"
80
+ RAPIDOCR = "rapidocr"
79
81
 
80
82
 
81
83
  def export_documents(
@@ -262,6 +264,8 @@ def convert(
262
264
  ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
263
265
  elif ocr_engine == OcrEngine.OCRMAC:
264
266
  ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
267
+ elif ocr_engine == OcrEngine.RAPIDOCR:
268
+ ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
265
269
  else:
266
270
  raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
267
271
 
@@ -29,6 +29,42 @@ class OcrOptions(BaseModel):
29
29
  )
30
30
 
31
31
 
32
+ class RapidOcrOptions(OcrOptions):
33
+ kind: Literal["rapidocr"] = "rapidocr"
34
+
35
+ # English and Chinese are the most commonly used models and have been tested with RapidOCR.
36
+ lang: List[str] = [
37
+ "english",
38
+ "chinese",
39
+ ] # However, language as a parameter is not supported by rapidocr yet, and hence changing this option has no effect.
40
+ # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
41
+
42
+ # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
43
+ text_score: float = 0.5 # same default as rapidocr
44
+
45
+ use_det: Optional[bool] = None # same default as rapidocr
46
+ use_cls: Optional[bool] = None # same default as rapidocr
47
+ use_rec: Optional[bool] = None # same default as rapidocr
48
+
49
+ # class Device(Enum):
50
+ # CPU = "CPU"
51
+ # CUDA = "CUDA"
52
+ # DIRECTML = "DIRECTML"
53
+ # AUTO = "AUTO"
54
+
55
+ # device: Device = Device.AUTO # Default value is AUTO
56
+
57
+ print_verbose: bool = False # same default as rapidocr
58
+
59
+ det_model_path: Optional[str] = None # same default as rapidocr
60
+ cls_model_path: Optional[str] = None # same default as rapidocr
61
+ rec_model_path: Optional[str] = None # same default as rapidocr
62
+
63
+ model_config = ConfigDict(
64
+ extra="forbid",
65
+ )
66
+
67
+
32
68
  class EasyOcrOptions(OcrOptions):
33
69
  kind: Literal["easyocr"] = "easyocr"
34
70
  lang: List[str] = ["fr", "de", "es", "en"]
@@ -0,0 +1,147 @@
1
+ import logging
2
+ from typing import Iterable
3
+
4
+ import numpy
5
+ from docling_core.types.doc import BoundingBox, CoordOrigin
6
+
7
+ from docling.datamodel.base_models import OcrCell, Page
8
+ from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.pipeline_options import RapidOcrOptions
10
+ from docling.datamodel.settings import settings
11
+ from docling.models.base_ocr_model import BaseOcrModel
12
+ from docling.utils.profiling import TimeRecorder
13
+
14
+ _log = logging.getLogger(__name__)
15
+
16
+
17
+ class RapidOcrModel(BaseOcrModel):
18
+ def __init__(self, enabled: bool, options: RapidOcrOptions):
19
+ super().__init__(enabled=enabled, options=options)
20
+ self.options: RapidOcrOptions
21
+
22
+ self.scale = 3 # multiplier for 72 dpi == 216 dpi.
23
+
24
+ if self.enabled:
25
+ try:
26
+ from rapidocr_onnxruntime import RapidOCR # type: ignore
27
+ except ImportError:
28
+ raise ImportError(
29
+ "RapidOCR is not installed. Please install it via `pip install rapidocr_onnxruntime` to use this OCR engine. "
30
+ "Alternatively, Docling has support for other OCR engines. See the documentation."
31
+ )
32
+
33
+ # This configuration option will be revamped while introducing device settings for all models.
34
+ # For the moment we will default to auto and let onnx-runtime pick the best.
35
+ cls_use_cuda = True
36
+ rec_use_cuda = True
37
+ det_use_cuda = True
38
+ det_use_dml = True
39
+ cls_use_dml = True
40
+ rec_use_dml = True
41
+
42
+ # # Same as Defaults in RapidOCR
43
+ # cls_use_cuda = False
44
+ # rec_use_cuda = False
45
+ # det_use_cuda = False
46
+ # det_use_dml = False
47
+ # cls_use_dml = False
48
+ # rec_use_dml = False
49
+
50
+ # # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
51
+ # if self.options.device == self.options.Device.AUTO:
52
+ # cls_use_cuda = True
53
+ # rec_use_cuda = True
54
+ # det_use_cuda = True
55
+ # det_use_dml = True
56
+ # cls_use_dml = True
57
+ # rec_use_dml = True
58
+
59
+ # # If we set use_cuda to true, onnx would use the CUDA device available at runtime; if no CUDA device is available, it would run on CPU.
60
+ # elif self.options.device == self.options.Device.CUDA:
61
+ # cls_use_cuda = True
62
+ # rec_use_cuda = True
63
+ # det_use_cuda = True
64
+
65
+ # # If we set use_dml to true, onnx would use the DirectML device available at runtime; if no DirectML device is available, it would run on CPU.
66
+ # elif self.options.device == self.options.Device.DIRECTML:
67
+ # det_use_dml = True
68
+ # cls_use_dml = True
69
+ # rec_use_dml = True
70
+
71
+ self.reader = RapidOCR(
72
+ text_score=self.options.text_score,
73
+ cls_use_cuda=cls_use_cuda,
74
+ rec_use_cuda=rec_use_cuda,
75
+ det_use_cuda=det_use_cuda,
76
+ det_use_dml=det_use_dml,
77
+ cls_use_dml=cls_use_dml,
78
+ rec_use_dml=rec_use_dml,
79
+ print_verbose=self.options.print_verbose,
80
+ det_model_path=self.options.det_model_path,
81
+ cls_model_path=self.options.cls_model_path,
82
+ rec_model_path=self.options.rec_model_path,
83
+ )
84
+
85
+ def __call__(
86
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
87
+ ) -> Iterable[Page]:
88
+
89
+ if not self.enabled:
90
+ yield from page_batch
91
+ return
92
+
93
+ for page in page_batch:
94
+
95
+ assert page._backend is not None
96
+ if not page._backend.is_valid():
97
+ yield page
98
+ else:
99
+ with TimeRecorder(conv_res, "ocr"):
100
+ ocr_rects = self.get_ocr_rects(page)
101
+
102
+ all_ocr_cells = []
103
+ for ocr_rect in ocr_rects:
104
+ # Skip zero area boxes
105
+ if ocr_rect.area() == 0:
106
+ continue
107
+ high_res_image = page._backend.get_page_image(
108
+ scale=self.scale, cropbox=ocr_rect
109
+ )
110
+ im = numpy.array(high_res_image)
111
+ result, _ = self.reader(
112
+ im,
113
+ use_det=self.options.use_det,
114
+ use_cls=self.options.use_cls,
115
+ use_rec=self.options.use_rec,
116
+ )
117
+
118
+ del high_res_image
119
+ del im
120
+
121
+ cells = [
122
+ OcrCell(
123
+ id=ix,
124
+ text=line[1],
125
+ confidence=line[2],
126
+ bbox=BoundingBox.from_tuple(
127
+ coord=(
128
+ (line[0][0][0] / self.scale) + ocr_rect.l,
129
+ (line[0][0][1] / self.scale) + ocr_rect.t,
130
+ (line[0][2][0] / self.scale) + ocr_rect.l,
131
+ (line[0][2][1] / self.scale) + ocr_rect.t,
132
+ ),
133
+ origin=CoordOrigin.TOPLEFT,
134
+ ),
135
+ )
136
+ for ix, line in enumerate(result)
137
+ ]
138
+ all_ocr_cells.extend(cells)
139
+
140
+ # Post-process the cells
141
+ page.cells = self.post_process_cells(all_ocr_cells, page.cells)
142
+
143
+ # DEBUG code:
144
+ if settings.debug.visualize_ocr:
145
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
146
+
147
+ yield page
@@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
13
13
  EasyOcrOptions,
14
14
  OcrMacOptions,
15
15
  PdfPipelineOptions,
16
+ RapidOcrOptions,
16
17
  TesseractCliOcrOptions,
17
18
  TesseractOcrOptions,
18
19
  )
@@ -26,6 +27,7 @@ from docling.models.page_preprocessing_model import (
26
27
  PagePreprocessingModel,
27
28
  PagePreprocessingOptions,
28
29
  )
30
+ from docling.models.rapid_ocr_model import RapidOcrModel
29
31
  from docling.models.table_structure_model import TableStructureModel
30
32
  from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
31
33
  from docling.models.tesseract_ocr_model import TesseractOcrModel
@@ -121,6 +123,11 @@ class StandardPdfPipeline(PaginatedPipeline):
121
123
  enabled=self.pipeline_options.do_ocr,
122
124
  options=self.pipeline_options.ocr_options,
123
125
  )
126
+ elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions):
127
+ return RapidOcrModel(
128
+ enabled=self.pipeline_options.do_ocr,
129
+ options=self.pipeline_options.ocr_options,
130
+ )
124
131
  elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
125
132
  if "darwin" != sys.platform:
126
133
  raise RuntimeError(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.7.1
3
+ Version: 2.8.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -21,11 +21,12 @@ Classifier: Programming Language :: Python :: 3.11
21
21
  Classifier: Programming Language :: Python :: 3.12
22
22
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Provides-Extra: ocrmac
24
+ Provides-Extra: rapidocr
24
25
  Provides-Extra: tesserocr
25
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
26
27
  Requires-Dist: certifi (>=2024.7.4)
27
28
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
28
- Requires-Dist: docling-core (>=2.4.0,<3.0.0)
29
+ Requires-Dist: docling-core (>=2.5.1,<3.0.0)
29
30
  Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
30
31
  Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
31
32
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -34,6 +35,8 @@ Requires-Dist: huggingface_hub (>=0.23,<1)
34
35
  Requires-Dist: lxml (>=4.0.0,<6.0.0)
35
36
  Requires-Dist: marko (>=2.1.2,<3.0.0)
36
37
  Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
38
+ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (extra == "rapidocr")
39
+ Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
37
40
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
38
41
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
39
42
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
@@ -42,6 +45,7 @@ Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
42
45
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
43
46
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
44
47
  Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
48
+ Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
45
49
  Requires-Dist: requests (>=2.32.3,<3.0.0)
46
50
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
47
51
  Requires-Dist: scipy (>=1.6.0,<2.0.0)
@@ -8,15 +8,15 @@ docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaod
8
8
  docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
9
  docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
10
10
  docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
11
- docling/backend/msword_backend.py,sha256=sMumfB9Xa2Md1a8WO-fGPPAKf1s3mCvErMyZ-xnBC2E,18495
11
+ docling/backend/msword_backend.py,sha256=VFHPr-gCak7w3NJToc5Cs-JaTb4Vm3a1JnnRIfJO3TI,18526
12
12
  docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
13
13
  docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
14
14
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- docling/cli/main.py,sha256=MpjbAXhOlbGnAnl5_OaKCdub61YPQBy1NOqroXQtNYE,10722
15
+ docling/cli/main.py,sha256=KxukTq155IFVkfc_aUpSL6laGG1KjnXE4oAau7B5xBA,10881
16
16
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  docling/datamodel/base_models.py,sha256=6qlwPamDZ3XUsE2kTAyGKG6O2IJClVjCqaE7DZ74KHU,5533
18
18
  docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
19
- docling/datamodel/pipeline_options.py,sha256=aC_CmtEhNLIbn9n3JuYhL_aA8UA0vFgw7HcGMUuOI4o,3117
19
+ docling/datamodel/pipeline_options.py,sha256=J-6kWugUrxahymKzgaEgiqPuyle1fbInPXV2wNos6Vc,4550
20
20
  docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
21
21
  docling/document_converter.py,sha256=L0A3g7IQBaKIK7dWpUFC72ZqKywIPYkyh71Qd6DiNPE,10940
22
22
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -28,20 +28,21 @@ docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0
28
28
  docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
29
29
  docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
30
30
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
31
+ docling/models/rapid_ocr_model.py,sha256=VQ0jaFmOzB9f-1JaqZ6d0o_El55Lr-nsFHfTNubMAuc,6005
31
32
  docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
32
33
  docling/models/tesseract_ocr_cli_model.py,sha256=OfopQnt2FGwtLJTMtW9jbJZ9EN2G2QFkA_aACjuUuDs,6372
33
34
  docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
34
35
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
36
  docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
36
37
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
37
- docling/pipeline/standard_pdf_pipeline.py,sha256=btm_y1ZsjUrtWvMbF6RA8BVM0ENrK4z_rqF0jjdeZmU,8473
38
+ docling/pipeline/standard_pdf_pipeline.py,sha256=7sbkh9EwXlhSfJSgf-WyjB5jdJ1El7Pn4siSssTJpq8,8789
38
39
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
40
  docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
40
41
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
41
42
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
42
43
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
43
- docling-2.7.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
44
- docling-2.7.1.dist-info/METADATA,sha256=TvD3BGlbO1ci54NzwmLxqSITXIdMefyj71YjdZkD7Vs,6906
45
- docling-2.7.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
46
- docling-2.7.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
47
- docling-2.7.1.dist-info/RECORD,,
44
+ docling-2.8.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
45
+ docling-2.8.0.dist-info/METADATA,sha256=4XSleijcmMxpwEFyjiNIh71ScIZUTApiKIfKDdM660A,7236
46
+ docling-2.8.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
47
+ docling-2.8.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
48
+ docling-2.8.0.dist-info/RECORD,,