docling 2.15.0__tar.gz → 2.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.15.0 → docling-2.16.0}/PKG-INFO +7 -7
- {docling-2.15.0 → docling-2.16.0}/README.md +1 -2
- {docling-2.15.0 → docling-2.16.0}/docling/backend/abstract_backend.py +0 -1
- {docling-2.15.0 → docling-2.16.0}/docling/backend/asciidoc_backend.py +0 -1
- {docling-2.15.0 → docling-2.16.0}/docling/backend/docling_parse_backend.py +2 -2
- {docling-2.15.0 → docling-2.16.0}/docling/backend/docling_parse_v2_backend.py +2 -2
- {docling-2.15.0 → docling-2.16.0}/docling/backend/html_backend.py +1 -1
- docling-2.16.0/docling/backend/json/docling_json_backend.py +58 -0
- {docling-2.15.0 → docling-2.16.0}/docling/backend/md_backend.py +44 -27
- {docling-2.15.0 → docling-2.16.0}/docling/backend/msexcel_backend.py +50 -38
- {docling-2.15.0 → docling-2.16.0}/docling/backend/msword_backend.py +0 -1
- {docling-2.15.0 → docling-2.16.0}/docling/backend/pdf_backend.py +0 -2
- {docling-2.15.0 → docling-2.16.0}/docling/backend/pypdfium2_backend.py +2 -2
- {docling-2.15.0 → docling-2.16.0}/docling/datamodel/base_models.py +30 -3
- {docling-2.15.0 → docling-2.16.0}/docling/datamodel/document.py +2 -0
- {docling-2.15.0 → docling-2.16.0}/docling/datamodel/pipeline_options.py +7 -10
- {docling-2.15.0 → docling-2.16.0}/docling/document_converter.py +4 -0
- docling-2.16.0/docling/models/base_model.py +84 -0
- {docling-2.15.0 → docling-2.16.0}/docling/models/base_ocr_model.py +15 -12
- docling-2.16.0/docling/models/code_formula_model.py +245 -0
- docling-2.16.0/docling/models/document_picture_classifier.py +187 -0
- {docling-2.15.0 → docling-2.16.0}/docling/models/layout_model.py +10 -86
- {docling-2.15.0 → docling-2.16.0}/docling/models/page_assemble_model.py +1 -33
- {docling-2.15.0 → docling-2.16.0}/docling/models/tesseract_ocr_cli_model.py +0 -1
- {docling-2.15.0 → docling-2.16.0}/docling/models/tesseract_ocr_model.py +63 -15
- {docling-2.15.0 → docling-2.16.0}/docling/pipeline/base_pipeline.py +40 -17
- {docling-2.15.0 → docling-2.16.0}/docling/pipeline/standard_pdf_pipeline.py +31 -2
- docling-2.16.0/docling/utils/__init__.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/utils/glm_utils.py +4 -1
- docling-2.16.0/docling/utils/visualization.py +80 -0
- {docling-2.15.0 → docling-2.16.0}/pyproject.toml +6 -5
- docling-2.15.0/docling/models/base_model.py +0 -28
- {docling-2.15.0 → docling-2.16.0}/LICENSE +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/__init__.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/backend/__init__.py +0 -0
- {docling-2.15.0/docling/backend/xml → docling-2.16.0/docling/backend/json}/__init__.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.15.0/docling/cli → docling-2.16.0/docling/backend/xml}/__init__.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/backend/xml/pubmed_backend.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/chunking/__init__.py +0 -0
- {docling-2.15.0/docling/datamodel → docling-2.16.0/docling/cli}/__init__.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/cli/main.py +0 -0
- {docling-2.15.0/docling/models → docling-2.16.0/docling/datamodel}/__init__.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/datamodel/settings.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/exceptions.py +0 -0
- {docling-2.15.0/docling/pipeline → docling-2.16.0/docling/models}/__init__.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/models/ds_glm_model.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.15.0/docling/utils → docling-2.16.0/docling/pipeline}/__init__.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/py.typed +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/utils/export.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/utils/profiling.py +0 -0
- {docling-2.15.0 → docling-2.16.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.15.0
|
3
|
+
Version: 2.16.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -26,9 +26,9 @@ Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
29
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
30
|
-
Requires-Dist: docling-ibm-models (>=3.
|
31
|
-
Requires-Dist: docling-parse (>=3.
|
29
|
+
Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
|
30
|
+
Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
|
31
|
+
Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
33
33
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
34
34
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -39,13 +39,14 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
|
|
39
39
|
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
40
40
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
41
41
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
42
|
+
Requires-Dist: pillow (>=10.0.0,<11.0.0)
|
42
43
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
43
44
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
44
45
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
45
46
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
46
47
|
Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
47
48
|
Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
|
48
|
-
Requires-Dist: requests (>=2.32.
|
49
|
+
Requires-Dist: requests (>=2.32.2,<3.0.0)
|
49
50
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
50
51
|
Requires-Dist: scipy (>=1.6.0,<2.0.0)
|
51
52
|
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
@@ -84,7 +85,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
84
85
|
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
85
86
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
86
87
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
87
|
-
* 🤖
|
88
|
+
* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
88
89
|
* 🔍 OCR support for scanned PDFs
|
89
90
|
* 💻 Simple and convenient CLI
|
90
91
|
|
@@ -94,7 +95,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
|
|
94
95
|
|
95
96
|
* ♾️ Equation & code extraction
|
96
97
|
* 📝 Metadata extraction, including title, authors, references & language
|
97
|
-
* 🦜🔗 Native LangChain extension
|
98
98
|
|
99
99
|
## Installation
|
100
100
|
|
@@ -29,7 +29,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
29
29
|
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
30
30
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
31
31
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
32
|
-
* 🤖
|
32
|
+
* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
33
33
|
* 🔍 OCR support for scanned PDFs
|
34
34
|
* 💻 Simple and convenient CLI
|
35
35
|
|
@@ -39,7 +39,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
|
|
39
39
|
|
40
40
|
* ♾️ Equation & code extraction
|
41
41
|
* 📝 Metadata extraction, including title, authors, references & language
|
42
|
-
* 🦜🔗 Native LangChain extension
|
43
42
|
|
44
43
|
## Installation
|
45
44
|
|
@@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
132
132
|
return cells
|
133
133
|
|
134
134
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
135
|
-
AREA_THRESHOLD = 32 * 32
|
135
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
136
136
|
|
137
137
|
for i in range(len(self._dpage["images"])):
|
138
138
|
bitmap = self._dpage["images"][i]
|
@@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
163
163
|
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
164
164
|
)
|
165
165
|
else:
|
166
|
-
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
166
|
+
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
167
167
|
padbox.r = page_size.width - padbox.r
|
168
168
|
padbox.t = page_size.height - padbox.t
|
169
169
|
|
@@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
140
140
|
return cells
|
141
141
|
|
142
142
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
143
|
-
AREA_THRESHOLD = 32 * 32
|
143
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
144
144
|
|
145
145
|
images = self._dpage["sanitized"]["images"]["data"]
|
146
146
|
images_header = self._dpage["sanitized"]["images"]["header"]
|
@@ -178,7 +178,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
178
178
|
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
179
179
|
)
|
180
180
|
else:
|
181
|
-
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
181
|
+
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
182
182
|
padbox.r = page_size.width - padbox.r
|
183
183
|
padbox.t = page_size.height - padbox.t
|
184
184
|
|
@@ -215,7 +215,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
215
215
|
label = DocItemLabel.CODE
|
216
216
|
if len(text) == 0:
|
217
217
|
return
|
218
|
-
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
218
|
+
doc.add_code(parent=self.parents[self.level], label=label, text=text)
|
219
219
|
|
220
220
|
def handle_paragraph(self, element, idx, doc):
|
221
221
|
"""Handles paragraph tags (p)."""
|
@@ -0,0 +1,58 @@
|
|
1
|
+
from io import BytesIO
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Union
|
4
|
+
|
5
|
+
from docling_core.types.doc import DoclingDocument
|
6
|
+
from typing_extensions import override
|
7
|
+
|
8
|
+
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
9
|
+
from docling.datamodel.base_models import InputFormat
|
10
|
+
from docling.datamodel.document import InputDocument
|
11
|
+
|
12
|
+
|
13
|
+
class DoclingJSONBackend(DeclarativeDocumentBackend):
|
14
|
+
@override
|
15
|
+
def __init__(
|
16
|
+
self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
|
17
|
+
) -> None:
|
18
|
+
super().__init__(in_doc, path_or_stream)
|
19
|
+
|
20
|
+
# given we need to store any actual conversion exception for raising it from
|
21
|
+
# convert(), this captures the successful result or the actual error in a
|
22
|
+
# mutually exclusive way:
|
23
|
+
self._doc_or_err = self._get_doc_or_err()
|
24
|
+
|
25
|
+
@override
|
26
|
+
def is_valid(self) -> bool:
|
27
|
+
return isinstance(self._doc_or_err, DoclingDocument)
|
28
|
+
|
29
|
+
@classmethod
|
30
|
+
@override
|
31
|
+
def supports_pagination(cls) -> bool:
|
32
|
+
return False
|
33
|
+
|
34
|
+
@classmethod
|
35
|
+
@override
|
36
|
+
def supported_formats(cls) -> set[InputFormat]:
|
37
|
+
return {InputFormat.JSON_DOCLING}
|
38
|
+
|
39
|
+
def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
|
40
|
+
try:
|
41
|
+
json_data: Union[str, bytes]
|
42
|
+
if isinstance(self.path_or_stream, Path):
|
43
|
+
with open(self.path_or_stream, encoding="utf-8") as f:
|
44
|
+
json_data = f.read()
|
45
|
+
elif isinstance(self.path_or_stream, BytesIO):
|
46
|
+
json_data = self.path_or_stream.getvalue()
|
47
|
+
else:
|
48
|
+
raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
|
49
|
+
return DoclingDocument.model_validate_json(json_data=json_data)
|
50
|
+
except Exception as e:
|
51
|
+
return e
|
52
|
+
|
53
|
+
@override
|
54
|
+
def convert(self) -> DoclingDocument:
|
55
|
+
if isinstance(self._doc_or_err, DoclingDocument):
|
56
|
+
return self._doc_or_err
|
57
|
+
else:
|
58
|
+
raise self._doc_or_err
|
@@ -3,19 +3,22 @@ import re
|
|
3
3
|
import warnings
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import Set, Union
|
6
|
+
from typing import List, Optional, Set, Union
|
7
7
|
|
8
8
|
import marko
|
9
9
|
import marko.ext
|
10
10
|
import marko.ext.gfm
|
11
11
|
import marko.inline
|
12
12
|
from docling_core.types.doc import (
|
13
|
+
DocItem,
|
13
14
|
DocItemLabel,
|
14
15
|
DoclingDocument,
|
15
16
|
DocumentOrigin,
|
16
17
|
GroupLabel,
|
18
|
+
NodeItem,
|
17
19
|
TableCell,
|
18
20
|
TableData,
|
21
|
+
TextItem,
|
19
22
|
)
|
20
23
|
from marko import Markdown
|
21
24
|
|
@@ -27,8 +30,7 @@ _log = logging.getLogger(__name__)
|
|
27
30
|
|
28
31
|
|
29
32
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
30
|
-
|
31
|
-
def shorten_underscore_sequences(self, markdown_text, max_length=10):
|
33
|
+
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
32
34
|
# This regex will match any sequence of underscores
|
33
35
|
pattern = r"_+"
|
34
36
|
|
@@ -90,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
90
92
|
) from e
|
91
93
|
return
|
92
94
|
|
93
|
-
def close_table(self, doc):
|
95
|
+
def close_table(self, doc: DoclingDocument):
|
94
96
|
if self.in_table:
|
95
97
|
_log.debug("=== TABLE START ===")
|
96
98
|
for md_table_row in self.md_table_buffer:
|
97
99
|
_log.debug(md_table_row)
|
98
100
|
_log.debug("=== TABLE END ===")
|
99
|
-
tcells = []
|
101
|
+
tcells: List[TableCell] = []
|
100
102
|
result_table = []
|
101
103
|
for n, md_table_row in enumerate(self.md_table_buffer):
|
102
104
|
data = []
|
@@ -137,15 +139,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
137
139
|
self.in_table = False
|
138
140
|
self.md_table_buffer = [] # clean table markdown buffer
|
139
141
|
# Initialize Docling TableData
|
140
|
-
|
142
|
+
table_data = TableData(
|
143
|
+
num_rows=num_rows, num_cols=num_cols, table_cells=tcells
|
144
|
+
)
|
141
145
|
# Populate
|
142
146
|
for tcell in tcells:
|
143
|
-
|
147
|
+
table_data.table_cells.append(tcell)
|
144
148
|
if len(tcells) > 0:
|
145
|
-
doc.add_table(data=
|
149
|
+
doc.add_table(data=table_data)
|
146
150
|
return
|
147
151
|
|
148
|
-
def process_inline_text(
|
152
|
+
def process_inline_text(
|
153
|
+
self, parent_element: Optional[NodeItem], doc: DoclingDocument
|
154
|
+
):
|
149
155
|
# self.inline_text_buffer += str(text_in)
|
150
156
|
txt = self.inline_text_buffer.strip()
|
151
157
|
if len(txt) > 0:
|
@@ -156,14 +162,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
156
162
|
)
|
157
163
|
self.inline_text_buffer = ""
|
158
164
|
|
159
|
-
def iterate_elements(
|
165
|
+
def iterate_elements(
|
166
|
+
self,
|
167
|
+
element: marko.block.Element,
|
168
|
+
depth: int,
|
169
|
+
doc: DoclingDocument,
|
170
|
+
parent_element: Optional[NodeItem] = None,
|
171
|
+
):
|
160
172
|
# Iterates over all elements in the AST
|
161
173
|
# Check for different element types and process relevant details
|
162
174
|
if isinstance(element, marko.block.Heading):
|
163
175
|
self.close_table(doc)
|
164
176
|
self.process_inline_text(parent_element, doc)
|
165
177
|
_log.debug(
|
166
|
-
f" - Heading level {element.level}, content: {element.children[0].children}"
|
178
|
+
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
167
179
|
)
|
168
180
|
if element.level == 1:
|
169
181
|
doc_label = DocItemLabel.TITLE
|
@@ -172,10 +184,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
172
184
|
|
173
185
|
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
174
186
|
# hence we need to traverse the tree to get full text of a header
|
175
|
-
strings = []
|
187
|
+
strings: List[str] = []
|
176
188
|
|
177
189
|
# Define a recursive function to traverse the tree
|
178
|
-
def traverse(node):
|
190
|
+
def traverse(node: marko.block.BlockElement):
|
179
191
|
# Check if the node has a "children" attribute
|
180
192
|
if hasattr(node, "children"):
|
181
193
|
# If "children" is a list, continue traversal
|
@@ -209,9 +221,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
209
221
|
self.process_inline_text(parent_element, doc)
|
210
222
|
_log.debug(" - List item")
|
211
223
|
|
212
|
-
snippet_text = str(element.children[0].children[0].children)
|
224
|
+
snippet_text = str(element.children[0].children[0].children) # type: ignore
|
213
225
|
is_numbered = False
|
214
|
-
if
|
226
|
+
if (
|
227
|
+
parent_element is not None
|
228
|
+
and isinstance(parent_element, DocItem)
|
229
|
+
and parent_element.label == GroupLabel.ORDERED_LIST
|
230
|
+
):
|
215
231
|
is_numbered = True
|
216
232
|
doc.add_list_item(
|
217
233
|
enumerated=is_numbered, parent=parent_element, text=snippet_text
|
@@ -221,7 +237,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
221
237
|
self.close_table(doc)
|
222
238
|
self.process_inline_text(parent_element, doc)
|
223
239
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
224
|
-
|
240
|
+
|
241
|
+
fig_caption: Optional[TextItem] = None
|
242
|
+
if element.title is not None and element.title != "":
|
243
|
+
fig_caption = doc.add_text(
|
244
|
+
label=DocItemLabel.CAPTION, text=element.title
|
245
|
+
)
|
246
|
+
|
247
|
+
doc.add_picture(parent=parent_element, caption=fig_caption)
|
225
248
|
|
226
249
|
elif isinstance(element, marko.block.Paragraph):
|
227
250
|
self.process_inline_text(parent_element, doc)
|
@@ -252,27 +275,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
252
275
|
self.process_inline_text(parent_element, doc)
|
253
276
|
_log.debug(f" - Code Span: {element.children}")
|
254
277
|
snippet_text = str(element.children).strip()
|
255
|
-
doc.add_text(
|
256
|
-
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
257
|
-
)
|
278
|
+
doc.add_code(parent=parent_element, text=snippet_text)
|
258
279
|
|
259
280
|
elif isinstance(element, marko.block.CodeBlock):
|
260
281
|
self.close_table(doc)
|
261
282
|
self.process_inline_text(parent_element, doc)
|
262
283
|
_log.debug(f" - Code Block: {element.children}")
|
263
|
-
snippet_text = str(element.children[0].children).strip()
|
264
|
-
doc.add_text(
|
265
|
-
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
266
|
-
)
|
284
|
+
snippet_text = str(element.children[0].children).strip() # type: ignore
|
285
|
+
doc.add_code(parent=parent_element, text=snippet_text)
|
267
286
|
|
268
287
|
elif isinstance(element, marko.block.FencedCode):
|
269
288
|
self.close_table(doc)
|
270
289
|
self.process_inline_text(parent_element, doc)
|
271
290
|
_log.debug(f" - Code Block: {element.children}")
|
272
|
-
snippet_text = str(element.children[0].children).strip()
|
273
|
-
doc.add_text(
|
274
|
-
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
275
|
-
)
|
291
|
+
snippet_text = str(element.children[0].children).strip() # type: ignore
|
292
|
+
doc.add_code(parent=parent_element, text=snippet_text)
|
276
293
|
|
277
294
|
elif isinstance(element, marko.inline.LineBreak):
|
278
295
|
self.process_inline_text(parent_element, doc)
|
@@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
|
|
26
26
|
|
27
27
|
from typing import Any, List
|
28
28
|
|
29
|
+
from PIL import Image as PILImage
|
29
30
|
from pydantic import BaseModel
|
30
31
|
|
31
32
|
|
@@ -44,7 +45,6 @@ class ExcelTable(BaseModel):
|
|
44
45
|
|
45
46
|
|
46
47
|
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
47
|
-
|
48
48
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
49
49
|
super().__init__(in_doc, path_or_stream)
|
50
50
|
|
@@ -326,49 +326,61 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
326
326
|
self, doc: DoclingDocument, sheet: Worksheet
|
327
327
|
) -> DoclingDocument:
|
328
328
|
|
329
|
-
#
|
330
|
-
|
331
|
-
# Iterate over images in the sheet
|
332
|
-
for idx, image in enumerate(sheet._images): # Access embedded images
|
329
|
+
# Iterate over byte images in the sheet
|
330
|
+
for idx, image in enumerate(sheet._images): # type: ignore
|
333
331
|
|
334
|
-
|
335
|
-
|
332
|
+
try:
|
333
|
+
pil_image = PILImage.open(image.ref)
|
336
334
|
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
335
|
+
doc.add_picture(
|
336
|
+
parent=self.parents[0],
|
337
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
338
|
+
caption=None,
|
339
|
+
)
|
340
|
+
except:
|
341
|
+
_log.error("could not extract the image from excel sheets")
|
343
342
|
|
344
|
-
# FIXME: mypy does not agree with _charts ...
|
345
343
|
"""
|
346
|
-
for idx, chart in enumerate(sheet._charts): #
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
344
|
+
for idx, chart in enumerate(sheet._charts): # type: ignore
|
345
|
+
try:
|
346
|
+
chart_path = f"chart_{idx + 1}.png"
|
347
|
+
_log.info(
|
348
|
+
f"Chart found, but dynamic rendering is required for: {chart_path}"
|
349
|
+
)
|
351
350
|
|
352
|
-
|
353
|
-
|
354
|
-
# Chart type
|
355
|
-
_log.info(f"Type: {type(chart).__name__}")
|
356
|
-
|
357
|
-
# Title
|
358
|
-
if chart.title:
|
359
|
-
_log.info(f"Title: {chart.title}")
|
360
|
-
else:
|
361
|
-
_log.info("No title")
|
362
|
-
|
363
|
-
# Data series
|
364
|
-
for series in chart.series:
|
365
|
-
_log.info(" => series ...")
|
366
|
-
_log.info(f"Data Series: {series.title}")
|
367
|
-
_log.info(f"Values: {series.values}")
|
368
|
-
_log.info(f"Categories: {series.categories}")
|
351
|
+
_log.info(f"Chart {idx + 1}:")
|
369
352
|
|
370
|
-
|
371
|
-
|
353
|
+
# Chart type
|
354
|
+
# _log.info(f"Type: {type(chart).__name__}")
|
355
|
+
print(f"Type: {type(chart).__name__}")
|
356
|
+
|
357
|
+
# Extract series data
|
358
|
+
for series_idx, series in enumerate(chart.series):
|
359
|
+
#_log.info(f"Series {series_idx + 1}:")
|
360
|
+
print(f"Series {series_idx + 1} type: {type(series).__name__}")
|
361
|
+
#print(f"x-values: {series.xVal}")
|
362
|
+
#print(f"y-values: {series.yVal}")
|
363
|
+
|
364
|
+
print(f"xval type: {type(series.xVal).__name__}")
|
365
|
+
|
366
|
+
xvals = []
|
367
|
+
for _ in series.xVal.numLit.pt:
|
368
|
+
print(f"xval type: {type(_).__name__}")
|
369
|
+
if hasattr(_, 'v'):
|
370
|
+
xvals.append(_.v)
|
371
|
+
|
372
|
+
print(f"x-values: {xvals}")
|
373
|
+
|
374
|
+
yvals = []
|
375
|
+
for _ in series.yVal:
|
376
|
+
if hasattr(_, 'v'):
|
377
|
+
yvals.append(_.v)
|
378
|
+
|
379
|
+
print(f"y-values: {yvals}")
|
380
|
+
|
381
|
+
except Exception as exc:
|
382
|
+
print(exc)
|
383
|
+
continue
|
372
384
|
"""
|
373
385
|
|
374
386
|
return doc
|
@@ -12,7 +12,6 @@ from docling.datamodel.document import InputDocument
|
|
12
12
|
|
13
13
|
|
14
14
|
class PdfPageBackend(ABC):
|
15
|
-
|
16
15
|
@abstractmethod
|
17
16
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
18
17
|
pass
|
@@ -45,7 +44,6 @@ class PdfPageBackend(ABC):
|
|
45
44
|
|
46
45
|
|
47
46
|
class PdfDocumentBackend(PaginatedDocumentBackend):
|
48
|
-
|
49
47
|
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
50
48
|
super().__init__(in_doc, path_or_stream)
|
51
49
|
|
@@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
39
39
|
return self.valid
|
40
40
|
|
41
41
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
42
|
-
AREA_THRESHOLD = 32 * 32
|
42
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
43
43
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
44
44
|
pos = obj.get_pos()
|
45
45
|
cropbox = BoundingBox.from_tuple(
|
@@ -210,7 +210,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
210
210
|
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
211
211
|
)
|
212
212
|
else:
|
213
|
-
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
213
|
+
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
214
214
|
padbox.r = page_size.width - padbox.r
|
215
215
|
padbox.t = page_size.height - padbox.t
|
216
216
|
|
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
|
4
4
|
from docling_core.types.doc import (
|
5
5
|
BoundingBox,
|
6
6
|
DocItemLabel,
|
7
|
+
NodeItem,
|
7
8
|
PictureDataType,
|
8
9
|
Size,
|
9
10
|
TableCell,
|
@@ -40,6 +41,7 @@ class InputFormat(str, Enum):
|
|
40
41
|
MD = "md"
|
41
42
|
XLSX = "xlsx"
|
42
43
|
XML_USPTO = "xml_uspto"
|
44
|
+
JSON_DOCLING = "json_docling"
|
43
45
|
|
44
46
|
|
45
47
|
class OutputFormat(str, Enum):
|
@@ -61,6 +63,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
61
63
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
62
64
|
InputFormat.XLSX: ["xlsx"],
|
63
65
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
66
|
+
InputFormat.JSON_DOCLING: ["json"],
|
64
67
|
}
|
65
68
|
|
66
69
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
@@ -89,6 +92,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
89
92
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
90
93
|
],
|
91
94
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
95
|
+
InputFormat.JSON_DOCLING: ["application/json"],
|
92
96
|
}
|
93
97
|
|
94
98
|
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
@@ -201,6 +205,13 @@ class AssembledUnit(BaseModel):
|
|
201
205
|
headers: List[PageElement] = []
|
202
206
|
|
203
207
|
|
208
|
+
class ItemAndImageEnrichmentElement(BaseModel):
|
209
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
210
|
+
|
211
|
+
item: NodeItem
|
212
|
+
image: Image
|
213
|
+
|
214
|
+
|
204
215
|
class Page(BaseModel):
|
205
216
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
206
217
|
|
@@ -219,12 +230,28 @@ class Page(BaseModel):
|
|
219
230
|
{}
|
220
231
|
) # Cache of images in different scales. By default it is cleared during assembling.
|
221
232
|
|
222
|
-
def get_image(
|
233
|
+
def get_image(
|
234
|
+
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
|
235
|
+
) -> Optional[Image]:
|
223
236
|
if self._backend is None:
|
224
237
|
return self._image_cache.get(scale, None)
|
238
|
+
|
225
239
|
if not scale in self._image_cache:
|
226
|
-
|
227
|
-
|
240
|
+
if cropbox is None:
|
241
|
+
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
242
|
+
else:
|
243
|
+
return self._backend.get_page_image(scale=scale, cropbox=cropbox)
|
244
|
+
|
245
|
+
if cropbox is None:
|
246
|
+
return self._image_cache[scale]
|
247
|
+
else:
|
248
|
+
page_im = self._image_cache[scale]
|
249
|
+
assert self.size is not None
|
250
|
+
return page_im.crop(
|
251
|
+
cropbox.to_top_left_origin(page_height=self.size.height)
|
252
|
+
.scaled(scale=scale)
|
253
|
+
.as_tuple()
|
254
|
+
)
|
228
255
|
|
229
256
|
@property
|
230
257
|
def image(self) -> Optional[Image]:
|
@@ -350,6 +350,8 @@ class _DocumentConversionInput(BaseModel):
|
|
350
350
|
mime = FormatToMimeType[InputFormat.HTML][0]
|
351
351
|
elif ext in FormatToExtensions[InputFormat.MD]:
|
352
352
|
mime = FormatToMimeType[InputFormat.MD][0]
|
353
|
+
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
|
354
|
+
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
|
353
355
|
return mime
|
354
356
|
|
355
357
|
@staticmethod
|
@@ -1,17 +1,11 @@
|
|
1
1
|
import logging
|
2
2
|
import os
|
3
|
-
import warnings
|
4
3
|
from enum import Enum
|
5
4
|
from pathlib import Path
|
6
|
-
from typing import
|
5
|
+
from typing import Any, List, Literal, Optional, Union
|
7
6
|
|
8
|
-
from pydantic import BaseModel, ConfigDict, Field,
|
9
|
-
from pydantic_settings import
|
10
|
-
BaseSettings,
|
11
|
-
PydanticBaseSettingsSource,
|
12
|
-
SettingsConfigDict,
|
13
|
-
)
|
14
|
-
from typing_extensions import deprecated
|
7
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
8
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
15
9
|
|
16
10
|
_log = logging.getLogger(__name__)
|
17
11
|
|
@@ -139,7 +133,7 @@ class EasyOcrOptions(OcrOptions):
|
|
139
133
|
|
140
134
|
use_gpu: Optional[bool] = None
|
141
135
|
|
142
|
-
confidence_threshold: float = 0.
|
136
|
+
confidence_threshold: float = 0.5
|
143
137
|
|
144
138
|
model_storage_directory: Optional[str] = None
|
145
139
|
recog_network: Optional[str] = "standard"
|
@@ -225,6 +219,9 @@ class PdfPipelineOptions(PipelineOptions):
|
|
225
219
|
artifacts_path: Optional[Union[Path, str]] = None
|
226
220
|
do_table_structure: bool = True # True: perform table structure extraction
|
227
221
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
222
|
+
do_code_enrichment: bool = False # True: perform code OCR
|
223
|
+
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
|
224
|
+
do_picture_classification: bool = False # True: classify pictures in documents
|
228
225
|
|
229
226
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
230
227
|
ocr_options: Union[
|