docling 2.7.0__tar.gz → 2.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.7.0 → docling-2.8.0}/PKG-INFO +10 -4
- {docling-2.7.0 → docling-2.8.0}/README.md +2 -1
- {docling-2.7.0 → docling-2.8.0}/docling/backend/msword_backend.py +23 -9
- {docling-2.7.0 → docling-2.8.0}/docling/cli/main.py +4 -0
- {docling-2.7.0 → docling-2.8.0}/docling/datamodel/pipeline_options.py +36 -0
- docling-2.8.0/docling/models/rapid_ocr_model.py +147 -0
- {docling-2.7.0 → docling-2.8.0}/docling/pipeline/standard_pdf_pipeline.py +7 -0
- {docling-2.7.0 → docling-2.8.0}/pyproject.toml +12 -3
- {docling-2.7.0 → docling-2.8.0}/LICENSE +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/__init__.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/__init__.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/html_backend.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/md_backend.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/cli/__init__.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/datamodel/document.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/datamodel/settings.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/document_converter.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/__init__.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/base_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/ds_glm_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/layout_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/utils/__init__.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/utils/export.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/utils/layout_utils.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/utils/profiling.py +0 -0
- {docling-2.7.0 → docling-2.8.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.7.0
|
3
|
+
Version: 2.8.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -21,26 +21,31 @@ Classifier: Programming Language :: Python :: 3.11
|
|
21
21
|
Classifier: Programming Language :: Python :: 3.12
|
22
22
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
23
23
|
Provides-Extra: ocrmac
|
24
|
+
Provides-Extra: rapidocr
|
24
25
|
Provides-Extra: tesserocr
|
25
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
26
27
|
Requires-Dist: certifi (>=2024.7.4)
|
27
28
|
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
28
|
-
Requires-Dist: docling-core (>=2.
|
29
|
+
Requires-Dist: docling-core (>=2.5.1,<3.0.0)
|
29
30
|
Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
|
30
31
|
Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
|
31
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
32
33
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
33
34
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
35
|
+
Requires-Dist: lxml (>=4.0.0,<6.0.0)
|
34
36
|
Requires-Dist: marko (>=2.1.2,<3.0.0)
|
35
37
|
Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
|
38
|
+
Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (extra == "rapidocr")
|
39
|
+
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
36
40
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
37
41
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
38
42
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
39
|
-
Requires-Dist: pydantic (>=2.0.0,<
|
43
|
+
Requires-Dist: pydantic (>=2.0.0,<2.10)
|
40
44
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
41
45
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
42
46
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
43
47
|
Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
48
|
+
Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
|
44
49
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
45
50
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
46
51
|
Requires-Dist: scipy (>=1.6.0,<2.0.0)
|
@@ -71,12 +76,13 @@ Description-Content-Type: text/markdown
|
|
71
76
|
[](https://pydantic.dev)
|
72
77
|
[](https://github.com/pre-commit/pre-commit)
|
73
78
|
[](https://opensource.org/licenses/MIT)
|
79
|
+
[](https://pepy.tech/projects/docling)
|
74
80
|
|
75
81
|
Docling parses documents and exports them to the desired format with ease and speed.
|
76
82
|
|
77
83
|
## Features
|
78
84
|
|
79
|
-
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc
|
85
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
80
86
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
81
87
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
82
88
|
* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
|
@@ -20,12 +20,13 @@
|
|
20
20
|
[](https://pydantic.dev)
|
21
21
|
[](https://github.com/pre-commit/pre-commit)
|
22
22
|
[](https://opensource.org/licenses/MIT)
|
23
|
+
[](https://pepy.tech/projects/docling)
|
23
24
|
|
24
25
|
Docling parses documents and exports them to the desired format with ease and speed.
|
25
26
|
|
26
27
|
## Features
|
27
28
|
|
28
|
-
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc
|
29
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
29
30
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
30
31
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
31
32
|
* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
|
@@ -14,7 +14,8 @@ from docling_core.types.doc import (
|
|
14
14
|
TableData,
|
15
15
|
)
|
16
16
|
from lxml import etree
|
17
|
-
from
|
17
|
+
from lxml.etree import XPath
|
18
|
+
from PIL import Image, UnidentifiedImageError
|
18
19
|
|
19
20
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
20
21
|
from docling.datamodel.base_models import InputFormat
|
@@ -132,8 +133,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
132
133
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
133
134
|
for element in body:
|
134
135
|
tag_name = etree.QName(element).localname
|
136
|
+
|
135
137
|
# Check for Inline Images (blip elements)
|
136
|
-
|
138
|
+
namespaces = {
|
139
|
+
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
140
|
+
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
141
|
+
}
|
142
|
+
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
|
143
|
+
drawing_blip = xpath_expr(element)
|
137
144
|
|
138
145
|
# Check for Tables
|
139
146
|
if element.tag.endswith("tbl"):
|
@@ -210,7 +217,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
210
217
|
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
211
218
|
|
212
219
|
if paragraph.text is None:
|
213
|
-
# _log.warn(f"paragraph has text==None")
|
214
220
|
return
|
215
221
|
text = paragraph.text.strip()
|
216
222
|
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
|
@@ -501,11 +507,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
501
507
|
|
502
508
|
image_data = get_docx_image(element, drawing_blip)
|
503
509
|
image_bytes = BytesIO(image_data)
|
510
|
+
level = self.get_level()
|
504
511
|
# Open the BytesIO object with PIL to create an Image
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
512
|
+
try:
|
513
|
+
pil_image = Image.open(image_bytes)
|
514
|
+
doc.add_picture(
|
515
|
+
parent=self.parents[level - 1],
|
516
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
517
|
+
caption=None,
|
518
|
+
)
|
519
|
+
except (UnidentifiedImageError, OSError) as e:
|
520
|
+
_log.warning("Warning: image cannot be loaded by Pillow")
|
521
|
+
doc.add_picture(
|
522
|
+
parent=self.parents[level - 1],
|
523
|
+
caption=None,
|
524
|
+
)
|
511
525
|
return
|
@@ -27,6 +27,7 @@ from docling.datamodel.pipeline_options import (
|
|
27
27
|
OcrMacOptions,
|
28
28
|
OcrOptions,
|
29
29
|
PdfPipelineOptions,
|
30
|
+
RapidOcrOptions,
|
30
31
|
TableFormerMode,
|
31
32
|
TesseractCliOcrOptions,
|
32
33
|
TesseractOcrOptions,
|
@@ -76,6 +77,7 @@ class OcrEngine(str, Enum):
|
|
76
77
|
TESSERACT_CLI = "tesseract_cli"
|
77
78
|
TESSERACT = "tesseract"
|
78
79
|
OCRMAC = "ocrmac"
|
80
|
+
RAPIDOCR = "rapidocr"
|
79
81
|
|
80
82
|
|
81
83
|
def export_documents(
|
@@ -262,6 +264,8 @@ def convert(
|
|
262
264
|
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
263
265
|
elif ocr_engine == OcrEngine.OCRMAC:
|
264
266
|
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
267
|
+
elif ocr_engine == OcrEngine.RAPIDOCR:
|
268
|
+
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
265
269
|
else:
|
266
270
|
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
267
271
|
|
@@ -29,6 +29,42 @@ class OcrOptions(BaseModel):
|
|
29
29
|
)
|
30
30
|
|
31
31
|
|
32
|
+
class RapidOcrOptions(OcrOptions):
|
33
|
+
kind: Literal["rapidocr"] = "rapidocr"
|
34
|
+
|
35
|
+
# English and Chinese are the most commonly used models and have been tested with RapidOCR.
|
36
|
+
lang: List[str] = [
|
37
|
+
"english",
|
38
|
+
"chinese",
|
39
|
+
] # However, language as a parameter is not supported by rapidocr yet and hence changing this option doesn't affect anything.
|
40
|
+
# For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
|
41
|
+
|
42
|
+
# For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
|
43
|
+
text_score: float = 0.5 # same default as rapidocr
|
44
|
+
|
45
|
+
use_det: Optional[bool] = None # same default as rapidocr
|
46
|
+
use_cls: Optional[bool] = None # same default as rapidocr
|
47
|
+
use_rec: Optional[bool] = None # same default as rapidocr
|
48
|
+
|
49
|
+
# class Device(Enum):
|
50
|
+
# CPU = "CPU"
|
51
|
+
# CUDA = "CUDA"
|
52
|
+
# DIRECTML = "DIRECTML"
|
53
|
+
# AUTO = "AUTO"
|
54
|
+
|
55
|
+
# device: Device = Device.AUTO # Default value is AUTO
|
56
|
+
|
57
|
+
print_verbose: bool = False # same default as rapidocr
|
58
|
+
|
59
|
+
det_model_path: Optional[str] = None # same default as rapidocr
|
60
|
+
cls_model_path: Optional[str] = None # same default as rapidocr
|
61
|
+
rec_model_path: Optional[str] = None # same default as rapidocr
|
62
|
+
|
63
|
+
model_config = ConfigDict(
|
64
|
+
extra="forbid",
|
65
|
+
)
|
66
|
+
|
67
|
+
|
32
68
|
class EasyOcrOptions(OcrOptions):
|
33
69
|
kind: Literal["easyocr"] = "easyocr"
|
34
70
|
lang: List[str] = ["fr", "de", "es", "en"]
|
@@ -0,0 +1,147 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Iterable
|
3
|
+
|
4
|
+
import numpy
|
5
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
6
|
+
|
7
|
+
from docling.datamodel.base_models import OcrCell, Page
|
8
|
+
from docling.datamodel.document import ConversionResult
|
9
|
+
from docling.datamodel.pipeline_options import RapidOcrOptions
|
10
|
+
from docling.datamodel.settings import settings
|
11
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
12
|
+
from docling.utils.profiling import TimeRecorder
|
13
|
+
|
14
|
+
_log = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
class RapidOcrModel(BaseOcrModel):
|
18
|
+
def __init__(self, enabled: bool, options: RapidOcrOptions):
|
19
|
+
super().__init__(enabled=enabled, options=options)
|
20
|
+
self.options: RapidOcrOptions
|
21
|
+
|
22
|
+
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
23
|
+
|
24
|
+
if self.enabled:
|
25
|
+
try:
|
26
|
+
from rapidocr_onnxruntime import RapidOCR # type: ignore
|
27
|
+
except ImportError:
|
28
|
+
raise ImportError(
|
29
|
+
"RapidOCR is not installed. Please install it via `pip install rapidocr_onnxruntime` to use this OCR engine. "
|
30
|
+
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
31
|
+
)
|
32
|
+
|
33
|
+
# This configuration option will be revamped while introducing device settings for all models.
|
34
|
+
# For the moment we will default to auto and let onnx-runtime pick the best.
|
35
|
+
cls_use_cuda = True
|
36
|
+
rec_use_cuda = True
|
37
|
+
det_use_cuda = True
|
38
|
+
det_use_dml = True
|
39
|
+
cls_use_dml = True
|
40
|
+
rec_use_dml = True
|
41
|
+
|
42
|
+
# # Same as Defaults in RapidOCR
|
43
|
+
# cls_use_cuda = False
|
44
|
+
# rec_use_cuda = False
|
45
|
+
# det_use_cuda = False
|
46
|
+
# det_use_dml = False
|
47
|
+
# cls_use_dml = False
|
48
|
+
# rec_use_dml = False
|
49
|
+
|
50
|
+
# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
|
51
|
+
# if self.options.device == self.options.Device.AUTO:
|
52
|
+
# cls_use_cuda = True
|
53
|
+
# rec_use_cuda = True
|
54
|
+
# det_use_cuda = True
|
55
|
+
# det_use_dml = True
|
56
|
+
# cls_use_dml = True
|
57
|
+
# rec_use_dml = True
|
58
|
+
|
59
|
+
# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
|
60
|
+
# elif self.options.device == self.options.Device.CUDA:
|
61
|
+
# cls_use_cuda = True
|
62
|
+
# rec_use_cuda = True
|
63
|
+
# det_use_cuda = True
|
64
|
+
|
65
|
+
# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
|
66
|
+
# elif self.options.device == self.options.Device.DIRECTML:
|
67
|
+
# det_use_dml = True
|
68
|
+
# cls_use_dml = True
|
69
|
+
# rec_use_dml = True
|
70
|
+
|
71
|
+
self.reader = RapidOCR(
|
72
|
+
text_score=self.options.text_score,
|
73
|
+
cls_use_cuda=cls_use_cuda,
|
74
|
+
rec_use_cuda=rec_use_cuda,
|
75
|
+
det_use_cuda=det_use_cuda,
|
76
|
+
det_use_dml=det_use_dml,
|
77
|
+
cls_use_dml=cls_use_dml,
|
78
|
+
rec_use_dml=rec_use_dml,
|
79
|
+
print_verbose=self.options.print_verbose,
|
80
|
+
det_model_path=self.options.det_model_path,
|
81
|
+
cls_model_path=self.options.cls_model_path,
|
82
|
+
rec_model_path=self.options.rec_model_path,
|
83
|
+
)
|
84
|
+
|
85
|
+
def __call__(
|
86
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
87
|
+
) -> Iterable[Page]:
|
88
|
+
|
89
|
+
if not self.enabled:
|
90
|
+
yield from page_batch
|
91
|
+
return
|
92
|
+
|
93
|
+
for page in page_batch:
|
94
|
+
|
95
|
+
assert page._backend is not None
|
96
|
+
if not page._backend.is_valid():
|
97
|
+
yield page
|
98
|
+
else:
|
99
|
+
with TimeRecorder(conv_res, "ocr"):
|
100
|
+
ocr_rects = self.get_ocr_rects(page)
|
101
|
+
|
102
|
+
all_ocr_cells = []
|
103
|
+
for ocr_rect in ocr_rects:
|
104
|
+
# Skip zero area boxes
|
105
|
+
if ocr_rect.area() == 0:
|
106
|
+
continue
|
107
|
+
high_res_image = page._backend.get_page_image(
|
108
|
+
scale=self.scale, cropbox=ocr_rect
|
109
|
+
)
|
110
|
+
im = numpy.array(high_res_image)
|
111
|
+
result, _ = self.reader(
|
112
|
+
im,
|
113
|
+
use_det=self.options.use_det,
|
114
|
+
use_cls=self.options.use_cls,
|
115
|
+
use_rec=self.options.use_rec,
|
116
|
+
)
|
117
|
+
|
118
|
+
del high_res_image
|
119
|
+
del im
|
120
|
+
|
121
|
+
cells = [
|
122
|
+
OcrCell(
|
123
|
+
id=ix,
|
124
|
+
text=line[1],
|
125
|
+
confidence=line[2],
|
126
|
+
bbox=BoundingBox.from_tuple(
|
127
|
+
coord=(
|
128
|
+
(line[0][0][0] / self.scale) + ocr_rect.l,
|
129
|
+
(line[0][0][1] / self.scale) + ocr_rect.t,
|
130
|
+
(line[0][2][0] / self.scale) + ocr_rect.l,
|
131
|
+
(line[0][2][1] / self.scale) + ocr_rect.t,
|
132
|
+
),
|
133
|
+
origin=CoordOrigin.TOPLEFT,
|
134
|
+
),
|
135
|
+
)
|
136
|
+
for ix, line in enumerate(result)
|
137
|
+
]
|
138
|
+
all_ocr_cells.extend(cells)
|
139
|
+
|
140
|
+
# Post-process the cells
|
141
|
+
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
142
|
+
|
143
|
+
# DEBUG code:
|
144
|
+
if settings.debug.visualize_ocr:
|
145
|
+
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
146
|
+
|
147
|
+
yield page
|
@@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
|
|
13
13
|
EasyOcrOptions,
|
14
14
|
OcrMacOptions,
|
15
15
|
PdfPipelineOptions,
|
16
|
+
RapidOcrOptions,
|
16
17
|
TesseractCliOcrOptions,
|
17
18
|
TesseractOcrOptions,
|
18
19
|
)
|
@@ -26,6 +27,7 @@ from docling.models.page_preprocessing_model import (
|
|
26
27
|
PagePreprocessingModel,
|
27
28
|
PagePreprocessingOptions,
|
28
29
|
)
|
30
|
+
from docling.models.rapid_ocr_model import RapidOcrModel
|
29
31
|
from docling.models.table_structure_model import TableStructureModel
|
30
32
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
31
33
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
@@ -121,6 +123,11 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
121
123
|
enabled=self.pipeline_options.do_ocr,
|
122
124
|
options=self.pipeline_options.ocr_options,
|
123
125
|
)
|
126
|
+
elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions):
|
127
|
+
return RapidOcrModel(
|
128
|
+
enabled=self.pipeline_options.do_ocr,
|
129
|
+
options=self.pipeline_options.ocr_options,
|
130
|
+
)
|
124
131
|
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
125
132
|
if "darwin" != sys.platform:
|
126
133
|
raise RuntimeError(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.
|
3
|
+
version = "2.8.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -25,8 +25,8 @@ packages = [{include = "docling"}]
|
|
25
25
|
# actual dependencies:
|
26
26
|
######################
|
27
27
|
python = "^3.9"
|
28
|
-
pydantic = "
|
29
|
-
docling-core = "^2.
|
28
|
+
pydantic = ">=2.0.0,<2.10"
|
29
|
+
docling-core = "^2.5.1"
|
30
30
|
docling-ibm-models = "^2.0.6"
|
31
31
|
deepsearch-glm = "^0.26.1"
|
32
32
|
filetype = "^1.2.0"
|
@@ -48,7 +48,15 @@ beautifulsoup4 = "^4.12.3"
|
|
48
48
|
pandas = "^2.1.4"
|
49
49
|
marko = "^2.1.2"
|
50
50
|
openpyxl = "^3.1.5"
|
51
|
+
lxml = ">=4.0.0,<6.0.0"
|
51
52
|
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
|
53
|
+
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
54
|
+
onnxruntime = [
|
55
|
+
# 1.19.2 is the last version with python3.9 support,
|
56
|
+
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
|
57
|
+
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
|
58
|
+
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
|
59
|
+
]
|
52
60
|
|
53
61
|
[tool.poetry.group.dev.dependencies]
|
54
62
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
@@ -103,6 +111,7 @@ torchvision = [
|
|
103
111
|
[tool.poetry.extras]
|
104
112
|
tesserocr = ["tesserocr"]
|
105
113
|
ocrmac = ["ocrmac"]
|
114
|
+
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
106
115
|
|
107
116
|
[tool.poetry.scripts]
|
108
117
|
docling = "docling.cli.main:app"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|