docling 2.14.0__tar.gz → 2.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.14.0 → docling-2.15.0}/PKG-INFO +2 -2
- {docling-2.14.0 → docling-2.15.0}/docling/backend/html_backend.py +2 -2
- {docling-2.14.0 → docling-2.15.0}/docling/backend/mspowerpoint_backend.py +15 -11
- {docling-2.14.0 → docling-2.15.0}/docling/cli/main.py +14 -2
- {docling-2.14.0 → docling-2.15.0}/docling/datamodel/document.py +6 -1
- {docling-2.14.0 → docling-2.15.0}/docling/document_converter.py +4 -2
- {docling-2.14.0 → docling-2.15.0}/docling/models/base_ocr_model.py +14 -1
- {docling-2.14.0 → docling-2.15.0}/docling/models/layout_model.py +18 -25
- {docling-2.14.0 → docling-2.15.0}/docling/models/table_structure_model.py +20 -0
- {docling-2.14.0 → docling-2.15.0}/pyproject.toml +2 -2
- {docling-2.14.0 → docling-2.15.0}/LICENSE +0 -0
- {docling-2.14.0 → docling-2.15.0}/README.md +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/__init__.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/__init__.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/md_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/xml/pubmed_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/chunking/__init__.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/cli/__init__.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/datamodel/settings.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/exceptions.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/models/__init__.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/models/base_model.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/models/ds_glm_model.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/py.typed +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/utils/__init__.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/utils/export.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/utils/profiling.py +0 -0
- {docling-2.14.0 → docling-2.15.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.14.0
|
3
|
+
Version: 2.15.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
29
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
29
|
+
Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
|
30
30
|
Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
|
31
31
|
Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -37,10 +37,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
37
37
|
|
38
38
|
try:
|
39
39
|
if isinstance(self.path_or_stream, BytesIO):
|
40
|
-
text_stream = self.path_or_stream.getvalue()
|
40
|
+
text_stream = self.path_or_stream.getvalue()
|
41
41
|
self.soup = BeautifulSoup(text_stream, "html.parser")
|
42
42
|
if isinstance(self.path_or_stream, Path):
|
43
|
-
with open(self.path_or_stream, "
|
43
|
+
with open(self.path_or_stream, "rb") as f:
|
44
44
|
html_content = f.read()
|
45
45
|
self.soup = BeautifulSoup(html_content, "html.parser")
|
46
46
|
except Exception as e:
|
@@ -16,7 +16,7 @@ from docling_core.types.doc import (
|
|
16
16
|
TableCell,
|
17
17
|
TableData,
|
18
18
|
)
|
19
|
-
from PIL import Image
|
19
|
+
from PIL import Image, UnidentifiedImageError
|
20
20
|
from pptx import Presentation
|
21
21
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
22
22
|
|
@@ -120,6 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
120
120
|
bullet_type = "None"
|
121
121
|
list_text = ""
|
122
122
|
list_label = GroupLabel.LIST
|
123
|
+
doc_label = DocItemLabel.LIST_ITEM
|
123
124
|
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
124
125
|
|
125
126
|
# Identify if shape contains lists
|
@@ -276,16 +277,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
276
277
|
im_dpi, _ = image.dpi
|
277
278
|
|
278
279
|
# Open it with PIL
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
280
|
+
try:
|
281
|
+
pil_image = Image.open(BytesIO(image_bytes))
|
282
|
+
|
283
|
+
# shape has picture
|
284
|
+
prov = self.generate_prov(shape, slide_ind, "")
|
285
|
+
doc.add_picture(
|
286
|
+
parent=parent_slide,
|
287
|
+
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
|
288
|
+
caption=None,
|
289
|
+
prov=prov,
|
290
|
+
)
|
291
|
+
except (UnidentifiedImageError, OSError) as e:
|
292
|
+
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
|
289
293
|
return
|
290
294
|
|
291
295
|
def handle_tables(self, shape, parent_slide, slide_ind, doc):
|
@@ -164,6 +164,11 @@ def convert(
|
|
164
164
|
to_formats: List[OutputFormat] = typer.Option(
|
165
165
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
166
166
|
),
|
167
|
+
headers: str = typer.Option(
|
168
|
+
None,
|
169
|
+
"--headers",
|
170
|
+
help="Specify http request headers used when fetching url input sources in the form of a JSON string",
|
171
|
+
),
|
167
172
|
image_export_mode: Annotated[
|
168
173
|
ImageRefMode,
|
169
174
|
typer.Option(
|
@@ -279,12 +284,19 @@ def convert(
|
|
279
284
|
if from_formats is None:
|
280
285
|
from_formats = [e for e in InputFormat]
|
281
286
|
|
287
|
+
parsed_headers: Optional[Dict[str, str]] = None
|
288
|
+
if headers is not None:
|
289
|
+
headers_t = TypeAdapter(Dict[str, str])
|
290
|
+
parsed_headers = headers_t.validate_json(headers)
|
291
|
+
|
282
292
|
with tempfile.TemporaryDirectory() as tempdir:
|
283
293
|
input_doc_paths: List[Path] = []
|
284
294
|
for src in input_sources:
|
285
295
|
try:
|
286
296
|
# check if we can fetch some remote url
|
287
|
-
source = resolve_source_to_path(
|
297
|
+
source = resolve_source_to_path(
|
298
|
+
source=src, headers=parsed_headers, workdir=Path(tempdir)
|
299
|
+
)
|
288
300
|
input_doc_paths.append(source)
|
289
301
|
except FileNotFoundError:
|
290
302
|
err_console.print(
|
@@ -390,7 +402,7 @@ def convert(
|
|
390
402
|
start_time = time.time()
|
391
403
|
|
392
404
|
conv_results = doc_converter.convert_all(
|
393
|
-
input_doc_paths, raises_on_error=abort_on_error
|
405
|
+
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
|
394
406
|
)
|
395
407
|
|
396
408
|
output.mkdir(parents=True, exist_ok=True)
|
@@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
|
|
227
227
|
class _DocumentConversionInput(BaseModel):
|
228
228
|
|
229
229
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
230
|
+
headers: Optional[Dict[str, str]] = None
|
230
231
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
231
232
|
|
232
233
|
def docs(
|
233
234
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
234
235
|
) -> Iterable[InputDocument]:
|
235
236
|
for item in self.path_or_stream_iterator:
|
236
|
-
obj =
|
237
|
+
obj = (
|
238
|
+
resolve_source_to_stream(item, self.headers)
|
239
|
+
if isinstance(item, str)
|
240
|
+
else item
|
241
|
+
)
|
237
242
|
format = self._guess_format(obj)
|
238
243
|
backend: Type[AbstractDocumentBackend]
|
239
244
|
if format not in format_options.keys():
|
@@ -176,6 +176,7 @@ class DocumentConverter:
|
|
176
176
|
def convert(
|
177
177
|
self,
|
178
178
|
source: Union[Path, str, DocumentStream], # TODO review naming
|
179
|
+
headers: Optional[Dict[str, str]] = None,
|
179
180
|
raises_on_error: bool = True,
|
180
181
|
max_num_pages: int = sys.maxsize,
|
181
182
|
max_file_size: int = sys.maxsize,
|
@@ -185,6 +186,7 @@ class DocumentConverter:
|
|
185
186
|
raises_on_error=raises_on_error,
|
186
187
|
max_num_pages=max_num_pages,
|
187
188
|
max_file_size=max_file_size,
|
189
|
+
headers=headers,
|
188
190
|
)
|
189
191
|
return next(all_res)
|
190
192
|
|
@@ -192,6 +194,7 @@ class DocumentConverter:
|
|
192
194
|
def convert_all(
|
193
195
|
self,
|
194
196
|
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
197
|
+
headers: Optional[Dict[str, str]] = None,
|
195
198
|
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
196
199
|
max_num_pages: int = sys.maxsize,
|
197
200
|
max_file_size: int = sys.maxsize,
|
@@ -201,8 +204,7 @@ class DocumentConverter:
|
|
201
204
|
max_file_size=max_file_size,
|
202
205
|
)
|
203
206
|
conv_input = _DocumentConversionInput(
|
204
|
-
path_or_stream_iterator=source,
|
205
|
-
limits=limits,
|
207
|
+
path_or_stream_iterator=source, limits=limits, headers=headers
|
206
208
|
)
|
207
209
|
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
208
210
|
|
@@ -138,18 +138,31 @@ class BaseOcrModel(BasePageModel):
|
|
138
138
|
|
139
139
|
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
140
140
|
image = copy.deepcopy(page.image)
|
141
|
+
scale_x = image.width / page.size.width
|
142
|
+
scale_y = image.height / page.size.height
|
143
|
+
|
141
144
|
draw = ImageDraw.Draw(image, "RGBA")
|
142
145
|
|
143
146
|
# Draw OCR rectangles as yellow filled rect
|
144
147
|
for rect in ocr_rects:
|
145
148
|
x0, y0, x1, y1 = rect.as_tuple()
|
149
|
+
y0 *= scale_x
|
150
|
+
y1 *= scale_y
|
151
|
+
x0 *= scale_x
|
152
|
+
x1 *= scale_x
|
153
|
+
|
146
154
|
shade_color = (255, 255, 0, 40) # transparent yellow
|
147
155
|
draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
|
148
156
|
|
149
157
|
# Draw OCR and programmatic cells
|
150
158
|
for tc in page.cells:
|
151
159
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
152
|
-
|
160
|
+
y0 *= scale_x
|
161
|
+
y1 *= scale_y
|
162
|
+
x0 *= scale_x
|
163
|
+
x1 *= scale_x
|
164
|
+
|
165
|
+
color = "gray"
|
153
166
|
if isinstance(tc, OcrCell):
|
154
167
|
color = "magenta"
|
155
168
|
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
@@ -67,29 +67,9 @@ class LayoutModel(BasePageModel):
|
|
67
67
|
- Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
|
68
68
|
Includes label names and confidence scores for each cluster.
|
69
69
|
"""
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple
|
74
|
-
DocItemLabel.FORMULA: (192, 192, 192), # Gray
|
75
|
-
DocItemLabel.TABLE: (255, 204, 204), # Light Pink
|
76
|
-
DocItemLabel.PICTURE: (255, 204, 164), # Light Beige
|
77
|
-
DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red
|
78
|
-
DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green
|
79
|
-
DocItemLabel.PAGE_FOOTER: (
|
80
|
-
204,
|
81
|
-
255,
|
82
|
-
204,
|
83
|
-
), # Light Green (same as Page-Header)
|
84
|
-
DocItemLabel.TITLE: (255, 153, 153), # Light Red (same as Section-Header)
|
85
|
-
DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
|
86
|
-
DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
|
87
|
-
DocItemLabel.CODE: (125, 125, 125), # Gray
|
88
|
-
DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193), # Pale Green
|
89
|
-
DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193), # Light Pink
|
90
|
-
DocItemLabel.FORM: (200, 255, 255), # Light Cyan
|
91
|
-
DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange
|
92
|
-
}
|
70
|
+
scale_x = page.image.width / page.size.width
|
71
|
+
scale_y = page.image.height / page.size.height
|
72
|
+
|
93
73
|
# Filter clusters for left and right images
|
94
74
|
exclude_labels = {
|
95
75
|
DocItemLabel.FORM,
|
@@ -118,6 +98,11 @@ class LayoutModel(BasePageModel):
|
|
118
98
|
cell_color = (0, 0, 0, 40) # Transparent black for cells
|
119
99
|
for tc in c.cells:
|
120
100
|
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
|
101
|
+
cx0 *= scale_x
|
102
|
+
cx1 *= scale_x
|
103
|
+
cy0 *= scale_x
|
104
|
+
cy1 *= scale_y
|
105
|
+
|
121
106
|
draw.rectangle(
|
122
107
|
[(cx0, cy0), (cx1, cy1)],
|
123
108
|
outline=None,
|
@@ -125,8 +110,16 @@ class LayoutModel(BasePageModel):
|
|
125
110
|
)
|
126
111
|
# Draw cluster rectangle
|
127
112
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
128
|
-
|
129
|
-
|
113
|
+
x0 *= scale_x
|
114
|
+
x1 *= scale_x
|
115
|
+
y0 *= scale_x
|
116
|
+
y1 *= scale_y
|
117
|
+
|
118
|
+
cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
|
119
|
+
cluster_outline_color = (
|
120
|
+
*list(DocItemLabel.get_color(c.label)),
|
121
|
+
255,
|
122
|
+
)
|
130
123
|
draw.rectangle(
|
131
124
|
[(x0, y0), (x1, y1)],
|
132
125
|
outline=cluster_outline_color,
|
@@ -66,23 +66,43 @@ class TableStructureModel(BasePageModel):
|
|
66
66
|
show: bool = False,
|
67
67
|
):
|
68
68
|
assert page._backend is not None
|
69
|
+
assert page.size is not None
|
69
70
|
|
70
71
|
image = (
|
71
72
|
page._backend.get_page_image()
|
72
73
|
) # make new image to avoid drawing on the saved ones
|
74
|
+
|
75
|
+
scale_x = image.width / page.size.width
|
76
|
+
scale_y = image.height / page.size.height
|
77
|
+
|
73
78
|
draw = ImageDraw.Draw(image)
|
74
79
|
|
75
80
|
for table_element in tbl_list:
|
76
81
|
x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
|
82
|
+
y0 *= scale_x
|
83
|
+
y1 *= scale_y
|
84
|
+
x0 *= scale_x
|
85
|
+
x1 *= scale_x
|
86
|
+
|
77
87
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
78
88
|
|
79
89
|
for cell in table_element.cluster.cells:
|
80
90
|
x0, y0, x1, y1 = cell.bbox.as_tuple()
|
91
|
+
x0 *= scale_x
|
92
|
+
x1 *= scale_x
|
93
|
+
y0 *= scale_x
|
94
|
+
y1 *= scale_y
|
95
|
+
|
81
96
|
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
82
97
|
|
83
98
|
for tc in table_element.table_cells:
|
84
99
|
if tc.bbox is not None:
|
85
100
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
101
|
+
x0 *= scale_x
|
102
|
+
x1 *= scale_x
|
103
|
+
y0 *= scale_x
|
104
|
+
y1 *= scale_y
|
105
|
+
|
86
106
|
if tc.column_header:
|
87
107
|
width = 3
|
88
108
|
else:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.14.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.15.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -25,7 +25,7 @@ packages = [{include = "docling"}]
|
|
25
25
|
# actual dependencies:
|
26
26
|
######################
|
27
27
|
python = "^3.9"
|
28
|
-
docling-core = { version = "^2.
|
28
|
+
docling-core = { version = "^2.13.1", extras = ["chunking"] }
|
29
29
|
pydantic = "^2.0.0"
|
30
30
|
docling-ibm-models = "^3.1.0"
|
31
31
|
deepsearch-glm = "^1.0.0"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|