docling 2.6.0__tar.gz → 2.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {docling-2.6.0 → docling-2.7.1}/PKG-INFO +13 -8
  2. {docling-2.6.0 → docling-2.7.1}/README.md +3 -2
  3. {docling-2.6.0 → docling-2.7.1}/docling/backend/msword_backend.py +22 -9
  4. {docling-2.6.0 → docling-2.7.1}/docling/cli/main.py +20 -18
  5. {docling-2.6.0 → docling-2.7.1}/docling/datamodel/pipeline_options.py +14 -3
  6. {docling-2.6.0 → docling-2.7.1}/docling/document_converter.py +4 -4
  7. docling-2.7.1/docling/models/ocr_mac_model.py +118 -0
  8. {docling-2.6.0 → docling-2.7.1}/docling/pipeline/standard_pdf_pipeline.py +12 -0
  9. {docling-2.6.0 → docling-2.7.1}/pyproject.toml +18 -8
  10. {docling-2.6.0 → docling-2.7.1}/LICENSE +0 -0
  11. {docling-2.6.0 → docling-2.7.1}/docling/__init__.py +0 -0
  12. {docling-2.6.0 → docling-2.7.1}/docling/backend/__init__.py +0 -0
  13. {docling-2.6.0 → docling-2.7.1}/docling/backend/abstract_backend.py +0 -0
  14. {docling-2.6.0 → docling-2.7.1}/docling/backend/asciidoc_backend.py +0 -0
  15. {docling-2.6.0 → docling-2.7.1}/docling/backend/docling_parse_backend.py +0 -0
  16. {docling-2.6.0 → docling-2.7.1}/docling/backend/docling_parse_v2_backend.py +0 -0
  17. {docling-2.6.0 → docling-2.7.1}/docling/backend/html_backend.py +0 -0
  18. {docling-2.6.0 → docling-2.7.1}/docling/backend/md_backend.py +0 -0
  19. {docling-2.6.0 → docling-2.7.1}/docling/backend/msexcel_backend.py +0 -0
  20. {docling-2.6.0 → docling-2.7.1}/docling/backend/mspowerpoint_backend.py +0 -0
  21. {docling-2.6.0 → docling-2.7.1}/docling/backend/pdf_backend.py +0 -0
  22. {docling-2.6.0 → docling-2.7.1}/docling/backend/pypdfium2_backend.py +0 -0
  23. {docling-2.6.0 → docling-2.7.1}/docling/cli/__init__.py +0 -0
  24. {docling-2.6.0 → docling-2.7.1}/docling/datamodel/__init__.py +0 -0
  25. {docling-2.6.0 → docling-2.7.1}/docling/datamodel/base_models.py +0 -0
  26. {docling-2.6.0 → docling-2.7.1}/docling/datamodel/document.py +0 -0
  27. {docling-2.6.0 → docling-2.7.1}/docling/datamodel/settings.py +0 -0
  28. {docling-2.6.0 → docling-2.7.1}/docling/models/__init__.py +0 -0
  29. {docling-2.6.0 → docling-2.7.1}/docling/models/base_model.py +0 -0
  30. {docling-2.6.0 → docling-2.7.1}/docling/models/base_ocr_model.py +0 -0
  31. {docling-2.6.0 → docling-2.7.1}/docling/models/ds_glm_model.py +0 -0
  32. {docling-2.6.0 → docling-2.7.1}/docling/models/easyocr_model.py +0 -0
  33. {docling-2.6.0 → docling-2.7.1}/docling/models/layout_model.py +0 -0
  34. {docling-2.6.0 → docling-2.7.1}/docling/models/page_assemble_model.py +0 -0
  35. {docling-2.6.0 → docling-2.7.1}/docling/models/page_preprocessing_model.py +0 -0
  36. {docling-2.6.0 → docling-2.7.1}/docling/models/table_structure_model.py +0 -0
  37. {docling-2.6.0 → docling-2.7.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  38. {docling-2.6.0 → docling-2.7.1}/docling/models/tesseract_ocr_model.py +0 -0
  39. {docling-2.6.0 → docling-2.7.1}/docling/pipeline/__init__.py +0 -0
  40. {docling-2.6.0 → docling-2.7.1}/docling/pipeline/base_pipeline.py +0 -0
  41. {docling-2.6.0 → docling-2.7.1}/docling/pipeline/simple_pipeline.py +0 -0
  42. {docling-2.6.0 → docling-2.7.1}/docling/utils/__init__.py +0 -0
  43. {docling-2.6.0 → docling-2.7.1}/docling/utils/export.py +0 -0
  44. {docling-2.6.0 → docling-2.7.1}/docling/utils/layout_utils.py +0 -0
  45. {docling-2.6.0 → docling-2.7.1}/docling/utils/profiling.py +0 -0
  46. {docling-2.6.0 → docling-2.7.1}/docling/utils/utils.py +0 -0
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.6.0
3
+ Version: 2.7.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
7
7
  Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
8
8
  Author: Christoph Auer
9
9
  Author-email: cau@zurich.ibm.com
10
- Requires-Python: >=3.10,<4.0
10
+ Requires-Python: >=3.9,<4.0
11
11
  Classifier: Development Status :: 5 - Production/Stable
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: Intended Audience :: Science/Research
@@ -15,32 +15,36 @@ Classifier: License :: OSI Approved :: MIT License
15
15
  Classifier: Operating System :: MacOS :: MacOS X
16
16
  Classifier: Operating System :: POSIX :: Linux
17
17
  Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
18
19
  Classifier: Programming Language :: Python :: 3.10
19
20
  Classifier: Programming Language :: Python :: 3.11
20
21
  Classifier: Programming Language :: Python :: 3.12
21
22
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Provides-Extra: ocrmac
22
24
  Provides-Extra: tesserocr
23
25
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
26
  Requires-Dist: certifi (>=2024.7.4)
25
27
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
28
  Requires-Dist: docling-core (>=2.4.0,<3.0.0)
27
- Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
28
- Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
29
+ Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
30
+ Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
29
31
  Requires-Dist: easyocr (>=1.7,<2.0)
30
32
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
33
  Requires-Dist: huggingface_hub (>=0.23,<1)
34
+ Requires-Dist: lxml (>=4.0.0,<6.0.0)
32
35
  Requires-Dist: marko (>=2.1.2,<3.0.0)
36
+ Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
33
37
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
34
38
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
35
39
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
36
- Requires-Dist: pydantic (>=2.0.0,<3.0.0)
40
+ Requires-Dist: pydantic (>=2.0.0,<2.10)
37
41
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
38
42
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
39
43
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
40
44
  Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
41
45
  Requires-Dist: requests (>=2.32.3,<3.0.0)
42
46
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
43
- Requires-Dist: scipy (>=1.14.1,<2.0.0)
47
+ Requires-Dist: scipy (>=1.6.0,<2.0.0)
44
48
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
45
49
  Requires-Dist: typer (>=0.12.5,<0.13.0)
46
50
  Project-URL: Repository, https://github.com/DS4SD/docling
@@ -61,19 +65,20 @@ Description-Content-Type: text/markdown
61
65
  [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
62
66
  [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
63
67
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
64
- ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
68
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
65
69
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
66
70
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
67
71
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
68
72
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
69
73
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
70
74
  [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
75
+ [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
71
76
 
72
77
  Docling parses documents and exports them to the desired format with ease and speed.
73
78
 
74
79
  ## Features
75
80
 
76
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
81
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
77
82
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
78
83
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
79
84
  * 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
@@ -13,19 +13,20 @@
13
13
  [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
14
14
  [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
15
15
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
16
- ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
16
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
17
17
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
18
18
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
19
19
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
20
20
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
21
21
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
22
22
  [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
23
+ [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
23
24
 
24
25
  Docling parses documents and exports them to the desired format with ease and speed.
25
26
 
26
27
  ## Features
27
28
 
28
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
29
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
29
30
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
30
31
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
31
32
  * 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
@@ -14,7 +14,8 @@ from docling_core.types.doc import (
14
14
  TableData,
15
15
  )
16
16
  from lxml import etree
17
- from PIL import Image
17
+ from lxml.etree import XPath
18
+ from PIL import Image, UnidentifiedImageError
18
19
 
19
20
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
20
21
  from docling.datamodel.base_models import InputFormat
@@ -132,8 +133,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
132
133
  def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
133
134
  for element in body:
134
135
  tag_name = etree.QName(element).localname
136
+
135
137
  # Check for Inline Images (blip elements)
136
- drawing_blip = element.xpath(".//a:blip")
138
+ namespaces = {
139
+ "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
140
+ "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
141
+ }
142
+ xpath_expr = XPath(".//a:blip", namespaces=namespaces)
143
+ drawing_blip = xpath_expr(element)
137
144
 
138
145
  # Check for Tables
139
146
  if element.tag.endswith("tbl"):
@@ -210,7 +217,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
210
217
  paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
211
218
 
212
219
  if paragraph.text is None:
213
- # _log.warn(f"paragraph has text==None")
214
220
  return
215
221
  text = paragraph.text.strip()
216
222
  # if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
@@ -502,10 +508,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
502
508
  image_data = get_docx_image(element, drawing_blip)
503
509
  image_bytes = BytesIO(image_data)
504
510
  # Open the BytesIO object with PIL to create an Image
505
- pil_image = Image.open(image_bytes)
506
- doc.add_picture(
507
- parent=self.parents[self.level],
508
- image=ImageRef.from_pil(image=pil_image, dpi=72),
509
- caption=None,
510
- )
511
+ try:
512
+ pil_image = Image.open(image_bytes)
513
+ doc.add_picture(
514
+ parent=self.parents[self.level],
515
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
516
+ caption=None,
517
+ )
518
+ except (UnidentifiedImageError, OSError) as e:
519
+ _log.warning("Warning: image cannot be loaded by Pillow")
520
+ doc.add_picture(
521
+ parent=self.parents[self.level],
522
+ caption=None,
523
+ )
511
524
  return
@@ -24,6 +24,7 @@ from docling.datamodel.base_models import (
24
24
  from docling.datamodel.document import ConversionResult
25
25
  from docling.datamodel.pipeline_options import (
26
26
  EasyOcrOptions,
27
+ OcrMacOptions,
27
28
  OcrOptions,
28
29
  PdfPipelineOptions,
29
30
  TableFormerMode,
@@ -74,6 +75,7 @@ class OcrEngine(str, Enum):
74
75
  EASYOCR = "easyocr"
75
76
  TESSERACT_CLI = "tesseract_cli"
76
77
  TESSERACT = "tesseract"
78
+ OCRMAC = "ocrmac"
77
79
 
78
80
 
79
81
  def export_documents(
@@ -252,15 +254,16 @@ def convert(
252
254
  export_txt = OutputFormat.TEXT in to_formats
253
255
  export_doctags = OutputFormat.DOCTAGS in to_formats
254
256
 
255
- match ocr_engine:
256
- case OcrEngine.EASYOCR:
257
- ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
258
- case OcrEngine.TESSERACT_CLI:
259
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
260
- case OcrEngine.TESSERACT:
261
- ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
262
- case _:
263
- raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
257
+ if ocr_engine == OcrEngine.EASYOCR:
258
+ ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
259
+ elif ocr_engine == OcrEngine.TESSERACT_CLI:
260
+ ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
261
+ elif ocr_engine == OcrEngine.TESSERACT:
262
+ ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
263
+ elif ocr_engine == OcrEngine.OCRMAC:
264
+ ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
265
+ else:
266
+ raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
264
267
 
265
268
  ocr_lang_list = _split_list(ocr_lang)
266
269
  if ocr_lang_list is not None:
@@ -277,15 +280,14 @@ def convert(
277
280
  if artifacts_path is not None:
278
281
  pipeline_options.artifacts_path = artifacts_path
279
282
 
280
- match pdf_backend:
281
- case PdfBackend.DLPARSE_V1:
282
- backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
283
- case PdfBackend.DLPARSE_V2:
284
- backend = DoclingParseV2DocumentBackend
285
- case PdfBackend.PYPDFIUM2:
286
- backend = PyPdfiumDocumentBackend
287
- case _:
288
- raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
283
+ if pdf_backend == PdfBackend.DLPARSE_V1:
284
+ backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
285
+ elif pdf_backend == PdfBackend.DLPARSE_V2:
286
+ backend = DoclingParseV2DocumentBackend
287
+ elif pdf_backend == PdfBackend.PYPDFIUM2:
288
+ backend = PyPdfiumDocumentBackend
289
+ else:
290
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
289
291
 
290
292
  format_options: Dict[InputFormat, FormatOption] = {
291
293
  InputFormat.PDF: PdfFormatOption(
@@ -63,6 +63,17 @@ class TesseractOcrOptions(OcrOptions):
63
63
  )
64
64
 
65
65
 
66
+ class OcrMacOptions(OcrOptions):
67
+ kind: Literal["ocrmac"] = "ocrmac"
68
+ lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
69
+ recognition: str = "accurate"
70
+ framework: str = "vision"
71
+
72
+ model_config = ConfigDict(
73
+ extra="forbid",
74
+ )
75
+
76
+
66
77
  class PipelineOptions(BaseModel):
67
78
  create_legacy_output: bool = (
68
79
  True # This default will be set to False in a future version of docling
@@ -75,9 +86,9 @@ class PdfPipelineOptions(PipelineOptions):
75
86
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
76
87
 
77
88
  table_structure_options: TableStructureOptions = TableStructureOptions()
78
- ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
79
- Field(EasyOcrOptions(), discriminator="kind")
80
- )
89
+ ocr_options: Union[
90
+ EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
91
+ ] = Field(EasyOcrOptions(), discriminator="kind")
81
92
 
82
93
  images_scale: float = 1.0
83
94
  generate_page_images: bool = False
@@ -3,7 +3,7 @@ import sys
3
3
  import time
4
4
  from functools import partial
5
5
  from pathlib import Path
6
- from typing import Dict, Iterable, Iterator, List, Optional, Type
6
+ from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
7
7
 
8
8
  from pydantic import BaseModel, ConfigDict, model_validator, validate_call
9
9
 
@@ -155,7 +155,7 @@ class DocumentConverter:
155
155
  @validate_call(config=ConfigDict(strict=True))
156
156
  def convert(
157
157
  self,
158
- source: Path | str | DocumentStream, # TODO review naming
158
+ source: Union[Path, str, DocumentStream], # TODO review naming
159
159
  raises_on_error: bool = True,
160
160
  max_num_pages: int = sys.maxsize,
161
161
  max_file_size: int = sys.maxsize,
@@ -172,7 +172,7 @@ class DocumentConverter:
172
172
  @validate_call(config=ConfigDict(strict=True))
173
173
  def convert_all(
174
174
  self,
175
- source: Iterable[Path | str | DocumentStream], # TODO review naming
175
+ source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
176
176
  raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
177
177
  max_num_pages: int = sys.maxsize,
178
178
  max_file_size: int = sys.maxsize,
@@ -183,7 +183,7 @@ class DocumentConverter:
183
183
  )
184
184
  conv_input = _DocumentConversionInput(
185
185
  path_or_stream_iterator=source,
186
- limit=limits,
186
+ limits=limits,
187
187
  )
188
188
  conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
189
189
  for conv_res in conv_res_iter:
@@ -0,0 +1,118 @@
1
+ import logging
2
+ import tempfile
3
+ from typing import Iterable, Optional, Tuple
4
+
5
+ from docling_core.types.doc import BoundingBox, CoordOrigin
6
+
7
+ from docling.datamodel.base_models import OcrCell, Page
8
+ from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.pipeline_options import OcrMacOptions
10
+ from docling.datamodel.settings import settings
11
+ from docling.models.base_ocr_model import BaseOcrModel
12
+ from docling.utils.profiling import TimeRecorder
13
+
14
+ _log = logging.getLogger(__name__)
15
+
16
+
17
+ class OcrMacModel(BaseOcrModel):
18
+ def __init__(self, enabled: bool, options: OcrMacOptions):
19
+ super().__init__(enabled=enabled, options=options)
20
+ self.options: OcrMacOptions
21
+
22
+ self.scale = 3 # multiplier for 72 dpi == 216 dpi.
23
+
24
+ if self.enabled:
25
+ install_errmsg = (
26
+ "ocrmac is not correctly installed. "
27
+ "Please install it via `pip install ocrmac` to use this OCR engine. "
28
+ "Alternatively, Docling has support for other OCR engines. See the documentation: "
29
+ "https://ds4sd.github.io/docling/installation/"
30
+ )
31
+ try:
32
+ from ocrmac import ocrmac
33
+ except ImportError:
34
+ raise ImportError(install_errmsg)
35
+
36
+ self.reader_RIL = ocrmac.OCR
37
+
38
+ def __call__(
39
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
40
+ ) -> Iterable[Page]:
41
+
42
+ if not self.enabled:
43
+ yield from page_batch
44
+ return
45
+
46
+ for page in page_batch:
47
+ assert page._backend is not None
48
+ if not page._backend.is_valid():
49
+ yield page
50
+ else:
51
+ with TimeRecorder(conv_res, "ocr"):
52
+
53
+ ocr_rects = self.get_ocr_rects(page)
54
+
55
+ all_ocr_cells = []
56
+ for ocr_rect in ocr_rects:
57
+ # Skip zero area boxes
58
+ if ocr_rect.area() == 0:
59
+ continue
60
+ high_res_image = page._backend.get_page_image(
61
+ scale=self.scale, cropbox=ocr_rect
62
+ )
63
+
64
+ with tempfile.NamedTemporaryFile(
65
+ suffix=".png", mode="w"
66
+ ) as image_file:
67
+ fname = image_file.name
68
+ high_res_image.save(fname)
69
+
70
+ boxes = self.reader_RIL(
71
+ fname,
72
+ recognition_level=self.options.recognition,
73
+ framework=self.options.framework,
74
+ language_preference=self.options.lang,
75
+ ).recognize()
76
+
77
+ im_width, im_height = high_res_image.size
78
+ cells = []
79
+ for ix, (text, confidence, box) in enumerate(boxes):
80
+ x = float(box[0])
81
+ y = float(box[1])
82
+ w = float(box[2])
83
+ h = float(box[3])
84
+
85
+ x1 = x * im_width
86
+ y2 = (1 - y) * im_height
87
+
88
+ x2 = x1 + w * im_width
89
+ y1 = y2 - h * im_height
90
+
91
+ left = x1 / self.scale
92
+ top = y1 / self.scale
93
+ right = x2 / self.scale
94
+ bottom = y2 / self.scale
95
+
96
+ cells.append(
97
+ OcrCell(
98
+ id=ix,
99
+ text=text,
100
+ confidence=confidence,
101
+ bbox=BoundingBox.from_tuple(
102
+ coord=(left, top, right, bottom),
103
+ origin=CoordOrigin.TOPLEFT,
104
+ ),
105
+ )
106
+ )
107
+
108
+ # del high_res_image
109
+ all_ocr_cells.extend(cells)
110
+
111
+ # Post-process the cells
112
+ page.cells = self.post_process_cells(all_ocr_cells, page.cells)
113
+
114
+ # DEBUG code:
115
+ if settings.debug.visualize_ocr:
116
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
117
+
118
+ yield page
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import sys
2
3
  from pathlib import Path
3
4
  from typing import Optional
4
5
 
@@ -10,6 +11,7 @@ from docling.datamodel.base_models import AssembledUnit, Page
10
11
  from docling.datamodel.document import ConversionResult
11
12
  from docling.datamodel.pipeline_options import (
12
13
  EasyOcrOptions,
14
+ OcrMacOptions,
13
15
  PdfPipelineOptions,
14
16
  TesseractCliOcrOptions,
15
17
  TesseractOcrOptions,
@@ -18,6 +20,7 @@ from docling.models.base_ocr_model import BaseOcrModel
18
20
  from docling.models.ds_glm_model import GlmModel, GlmOptions
19
21
  from docling.models.easyocr_model import EasyOcrModel
20
22
  from docling.models.layout_model import LayoutModel
23
+ from docling.models.ocr_mac_model import OcrMacModel
21
24
  from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
22
25
  from docling.models.page_preprocessing_model import (
23
26
  PagePreprocessingModel,
@@ -118,6 +121,15 @@ class StandardPdfPipeline(PaginatedPipeline):
118
121
  enabled=self.pipeline_options.do_ocr,
119
122
  options=self.pipeline_options.ocr_options,
120
123
  )
124
+ elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
125
+ if "darwin" != sys.platform:
126
+ raise RuntimeError(
127
+ f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
128
+ )
129
+ return OcrMacModel(
130
+ enabled=self.pipeline_options.do_ocr,
131
+ options=self.pipeline_options.ocr_options,
132
+ )
121
133
  return None
122
134
 
123
135
  def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.6.0" # DO NOT EDIT, updated automatically
3
+ version = "2.7.1" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -24,10 +24,10 @@ packages = [{include = "docling"}]
24
24
  ######################
25
25
  # actual dependencies:
26
26
  ######################
27
- python = "^3.10"
28
- pydantic = "^2.0.0"
27
+ python = "^3.9"
28
+ pydantic = ">=2.0.0,<2.10"
29
29
  docling-core = "^2.4.0"
30
- docling-ibm-models = "^2.0.3"
30
+ docling-ibm-models = "^2.0.6"
31
31
  deepsearch-glm = "^0.26.1"
32
32
  filetype = "^1.2.0"
33
33
  pypdfium2 = "^4.30.0"
@@ -36,10 +36,10 @@ huggingface_hub = ">=0.23,<1"
36
36
  requests = "^2.32.3"
37
37
  easyocr = "^1.7"
38
38
  tesserocr = { version = "^2.7.1", optional = true }
39
- docling-parse = "^2.0.2"
39
+ docling-parse = "^2.0.5"
40
40
  certifi = ">=2024.7.4"
41
41
  rtree = "^1.3.0"
42
- scipy = "^1.14.1"
42
+ scipy = "^1.6.0"
43
43
  pyarrow = "^16.1.0"
44
44
  typer = "^0.12.5"
45
45
  python-docx = "^1.1.2"
@@ -48,6 +48,8 @@ beautifulsoup4 = "^4.12.3"
48
48
  pandas = "^2.1.4"
49
49
  marko = "^2.1.2"
50
50
  openpyxl = "^3.1.5"
51
+ lxml = ">=4.0.0,<6.0.0"
52
+ ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
51
53
 
52
54
  [tool.poetry.group.dev.dependencies]
53
55
  black = {extras = ["jupyter"], version = "^24.4.2"}
@@ -80,6 +82,12 @@ langchain-huggingface = "^0.0.3"
80
82
  langchain-milvus = "^0.1.4"
81
83
  langchain-text-splitters = "^0.2.4"
82
84
 
85
+ [tool.poetry.group.constraints.dependencies]
86
+ numpy = [
87
+ { version = "^2.1.0", markers = 'python_version >= "3.13"' },
88
+ { version = "^1.24.4", markers = 'python_version < "3.13"' },
89
+ ]
90
+
83
91
  [tool.poetry.group.mac_intel]
84
92
  optional = true
85
93
 
@@ -95,6 +103,7 @@ torchvision = [
95
103
 
96
104
  [tool.poetry.extras]
97
105
  tesserocr = ["tesserocr"]
106
+ ocrmac = ["ocrmac"]
98
107
 
99
108
  [tool.poetry.scripts]
100
109
  docling = "docling.cli.main:app"
@@ -105,13 +114,13 @@ build-backend = "poetry.core.masonry.api"
105
114
 
106
115
  [tool.black]
107
116
  line-length = 88
108
- target-version = ["py310"]
117
+ target-version = ["py39"]
109
118
  include = '\.pyi?$'
110
119
 
111
120
  [tool.isort]
112
121
  profile = "black"
113
122
  line_length = 88
114
- py_version=311
123
+ py_version=39
115
124
 
116
125
  [tool.mypy]
117
126
  pretty = true
@@ -130,6 +139,7 @@ module = [
130
139
  "tesserocr.*",
131
140
  "docling_ibm_models.*",
132
141
  "easyocr.*",
142
+ "ocrmac.*",
133
143
  "deepsearch_glm.*",
134
144
  "lxml.*",
135
145
  "bs4.*",
File without changes
File without changes
File without changes
File without changes
File without changes