docling 2.8.3__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/backend/docling_parse_backend.py +1 -1
- docling/backend/docling_parse_v2_backend.py +7 -5
- docling/backend/msword_backend.py +43 -27
- docling/chunking/__init__.py +12 -0
- docling/cli/main.py +83 -28
- docling/datamodel/base_models.py +1 -0
- docling/datamodel/document.py +2 -253
- docling/datamodel/pipeline_options.py +5 -1
- docling/document_converter.py +5 -5
- docling/models/ds_glm_model.py +2 -2
- docling/pipeline/standard_pdf_pipeline.py +2 -0
- docling/py.typed +1 -0
- docling/utils/glm_utils.py +336 -0
- {docling-2.8.3.dist-info → docling-2.10.0.dist-info}/METADATA +7 -7
- {docling-2.8.3.dist-info → docling-2.10.0.dist-info}/RECORD +18 -15
- {docling-2.8.3.dist-info → docling-2.10.0.dist-info}/LICENSE +0 -0
- {docling-2.8.3.dist-info → docling-2.10.0.dist-info}/WHEEL +0 -0
- {docling-2.8.3.dist-info → docling-2.10.0.dist-info}/entry_points.txt +0 -0
|
@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
|
|
|
6
6
|
|
|
7
7
|
import pypdfium2 as pdfium
|
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
|
9
|
-
from docling_parse.
|
|
9
|
+
from docling_parse.pdf_parsers import pdf_parser_v1
|
|
10
10
|
from PIL import Image, ImageDraw
|
|
11
11
|
from pypdfium2 import PdfPage
|
|
12
12
|
|
|
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
|
|
6
6
|
|
|
7
7
|
import pypdfium2 as pdfium
|
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
9
|
-
from docling_parse.
|
|
9
|
+
from docling_parse.pdf_parsers import pdf_parser_v2
|
|
10
10
|
from PIL import Image, ImageDraw
|
|
11
11
|
from pypdfium2 import PdfPage
|
|
12
12
|
|
|
@@ -210,12 +210,14 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
|
|
|
210
210
|
self.parser = pdf_parser_v2("fatal")
|
|
211
211
|
|
|
212
212
|
success = False
|
|
213
|
-
if isinstance(path_or_stream, BytesIO):
|
|
213
|
+
if isinstance(self.path_or_stream, BytesIO):
|
|
214
214
|
success = self.parser.load_document_from_bytesio(
|
|
215
|
-
self.document_hash, path_or_stream
|
|
215
|
+
self.document_hash, self.path_or_stream
|
|
216
|
+
)
|
|
217
|
+
elif isinstance(self.path_or_stream, Path):
|
|
218
|
+
success = self.parser.load_document(
|
|
219
|
+
self.document_hash, str(self.path_or_stream)
|
|
216
220
|
)
|
|
217
|
-
elif isinstance(path_or_stream, Path):
|
|
218
|
-
success = self.parser.load_document(self.document_hash, str(path_or_stream))
|
|
219
221
|
|
|
220
222
|
if not success:
|
|
221
223
|
raise RuntimeError(
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import re
|
|
2
3
|
from io import BytesIO
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from typing import Set, Union
|
|
@@ -133,7 +134,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
133
134
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
|
134
135
|
for element in body:
|
|
135
136
|
tag_name = etree.QName(element).localname
|
|
136
|
-
|
|
137
137
|
# Check for Inline Images (blip elements)
|
|
138
138
|
namespaces = {
|
|
139
139
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
|
@@ -153,6 +153,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
153
153
|
self.handle_pictures(element, docx_obj, drawing_blip, doc)
|
|
154
154
|
# Check for Text
|
|
155
155
|
elif tag_name in ["p"]:
|
|
156
|
+
# "tcPr", "sectPr"
|
|
156
157
|
self.handle_text_elements(element, docx_obj, doc)
|
|
157
158
|
else:
|
|
158
159
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
|
@@ -166,6 +167,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
166
167
|
except ValueError:
|
|
167
168
|
return default
|
|
168
169
|
|
|
170
|
+
def split_text_and_number(self, input_string):
|
|
171
|
+
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
|
172
|
+
if match:
|
|
173
|
+
parts = list(filter(None, match.groups()))
|
|
174
|
+
return parts
|
|
175
|
+
else:
|
|
176
|
+
return [input_string]
|
|
177
|
+
|
|
169
178
|
def get_numId_and_ilvl(self, paragraph):
|
|
170
179
|
# Access the XML element of the paragraph
|
|
171
180
|
numPr = paragraph._element.find(
|
|
@@ -188,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
188
197
|
def get_label_and_level(self, paragraph):
|
|
189
198
|
if paragraph.style is None:
|
|
190
199
|
return "Normal", None
|
|
191
|
-
label = paragraph.style.
|
|
200
|
+
label = paragraph.style.style_id
|
|
192
201
|
if label is None:
|
|
193
202
|
return "Normal", None
|
|
194
203
|
if ":" in label:
|
|
@@ -197,7 +206,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
197
206
|
if len(parts) == 2:
|
|
198
207
|
return parts[0], int(parts[1])
|
|
199
208
|
|
|
200
|
-
parts =
|
|
209
|
+
parts = self.split_text_and_number(label)
|
|
201
210
|
|
|
202
211
|
if "Heading" in label and len(parts) == 2:
|
|
203
212
|
parts.sort()
|
|
@@ -219,14 +228,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
219
228
|
if paragraph.text is None:
|
|
220
229
|
return
|
|
221
230
|
text = paragraph.text.strip()
|
|
222
|
-
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
|
223
231
|
|
|
224
232
|
# Common styles for bullet and numbered lists.
|
|
225
233
|
# "List Bullet", "List Number", "List Paragraph"
|
|
226
234
|
# Identify wether list is a numbered list or not
|
|
227
235
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
|
228
236
|
is_numbered = False
|
|
229
|
-
|
|
237
|
+
p_style_id, p_level = self.get_label_and_level(paragraph)
|
|
230
238
|
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
|
231
239
|
|
|
232
240
|
if numid == 0:
|
|
@@ -238,14 +246,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
238
246
|
element,
|
|
239
247
|
docx_obj,
|
|
240
248
|
doc,
|
|
241
|
-
|
|
249
|
+
p_style_id,
|
|
242
250
|
p_level,
|
|
243
251
|
numid,
|
|
244
252
|
ilevel,
|
|
245
253
|
text,
|
|
246
254
|
is_numbered,
|
|
247
255
|
)
|
|
248
|
-
self.update_history(
|
|
256
|
+
self.update_history(p_style_id, p_level, numid, ilevel)
|
|
249
257
|
return
|
|
250
258
|
elif numid is None and self.prev_numid() is not None: # Close list
|
|
251
259
|
for key, val in self.parents.items():
|
|
@@ -253,23 +261,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
253
261
|
self.parents[key] = None
|
|
254
262
|
self.level = self.level_at_new_list - 1
|
|
255
263
|
self.level_at_new_list = None
|
|
256
|
-
if
|
|
264
|
+
if p_style_id in ["Title"]:
|
|
257
265
|
for key, val in self.parents.items():
|
|
258
266
|
self.parents[key] = None
|
|
259
267
|
self.parents[0] = doc.add_text(
|
|
260
268
|
parent=None, label=DocItemLabel.TITLE, text=text
|
|
261
269
|
)
|
|
262
|
-
elif "Heading" in
|
|
263
|
-
self.add_header(element, docx_obj, doc,
|
|
270
|
+
elif "Heading" in p_style_id:
|
|
271
|
+
self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
|
|
264
272
|
|
|
265
|
-
elif
|
|
273
|
+
elif p_style_id in [
|
|
266
274
|
"Paragraph",
|
|
267
275
|
"Normal",
|
|
268
276
|
"Subtitle",
|
|
269
277
|
"Author",
|
|
270
|
-
"
|
|
271
|
-
"
|
|
272
|
-
"
|
|
278
|
+
"DefaultText",
|
|
279
|
+
"ListParagraph",
|
|
280
|
+
"ListBullet",
|
|
273
281
|
"Quote",
|
|
274
282
|
]:
|
|
275
283
|
level = self.get_level()
|
|
@@ -285,15 +293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
285
293
|
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
|
286
294
|
)
|
|
287
295
|
|
|
288
|
-
self.update_history(
|
|
296
|
+
self.update_history(p_style_id, p_level, numid, ilevel)
|
|
289
297
|
return
|
|
290
298
|
|
|
291
299
|
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
|
|
292
300
|
level = self.get_level()
|
|
293
301
|
if isinstance(curr_level, int):
|
|
294
|
-
|
|
295
302
|
if curr_level > level:
|
|
296
|
-
|
|
297
303
|
# add invisible group
|
|
298
304
|
for i in range(level, curr_level):
|
|
299
305
|
self.parents[i] = doc.add_group(
|
|
@@ -301,9 +307,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
301
307
|
label=GroupLabel.SECTION,
|
|
302
308
|
name=f"header-{i}",
|
|
303
309
|
)
|
|
304
|
-
|
|
305
310
|
elif curr_level < level:
|
|
306
|
-
|
|
307
311
|
# remove the tail
|
|
308
312
|
for key, val in self.parents.items():
|
|
309
313
|
if key >= curr_level:
|
|
@@ -314,7 +318,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
314
318
|
text=text,
|
|
315
319
|
level=curr_level,
|
|
316
320
|
)
|
|
317
|
-
|
|
318
321
|
else:
|
|
319
322
|
self.parents[self.level] = doc.add_heading(
|
|
320
323
|
parent=self.parents[self.level - 1],
|
|
@@ -328,7 +331,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
328
331
|
element,
|
|
329
332
|
docx_obj,
|
|
330
333
|
doc,
|
|
331
|
-
|
|
334
|
+
p_style_id,
|
|
332
335
|
p_level,
|
|
333
336
|
numid,
|
|
334
337
|
ilevel,
|
|
@@ -346,7 +349,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
346
349
|
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
|
347
350
|
)
|
|
348
351
|
|
|
349
|
-
#
|
|
352
|
+
# Set marker and enumerated arguments if this is an enumeration element.
|
|
350
353
|
self.listIter += 1
|
|
351
354
|
if is_numbered:
|
|
352
355
|
enum_marker = str(self.listIter) + "."
|
|
@@ -365,8 +368,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
365
368
|
self.level_at_new_list + self.prev_indent() + 1,
|
|
366
369
|
self.level_at_new_list + ilevel + 1,
|
|
367
370
|
):
|
|
368
|
-
#
|
|
369
|
-
#
|
|
371
|
+
# Determine if this is an unordered list or an ordered list.
|
|
372
|
+
# Set GroupLabel.ORDERED_LIST when it fits.
|
|
370
373
|
self.listIter = 0
|
|
371
374
|
if is_numbered:
|
|
372
375
|
self.parents[i] = doc.add_group(
|
|
@@ -467,6 +470,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
467
470
|
row_span = get_rowspan(cell)
|
|
468
471
|
col_span = get_colspan(cell)
|
|
469
472
|
|
|
473
|
+
cell_text = cell.text
|
|
474
|
+
# In case cell doesn't return text via docx library:
|
|
475
|
+
if len(cell_text) == 0:
|
|
476
|
+
cell_xml = cell._element
|
|
477
|
+
|
|
478
|
+
texts = [""]
|
|
479
|
+
for elem in cell_xml.iter():
|
|
480
|
+
if elem.tag.endswith("t"): # <w:t> tags that contain text
|
|
481
|
+
if elem.text:
|
|
482
|
+
texts.append(elem.text)
|
|
483
|
+
# Join the collected text
|
|
484
|
+
cell_text = " ".join(texts).strip()
|
|
485
|
+
|
|
470
486
|
# Find the next available column in the grid
|
|
471
487
|
while table_grid[row_idx][col_idx] is not None:
|
|
472
488
|
col_idx += 1
|
|
@@ -477,15 +493,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
477
493
|
table_grid[row_idx + i][col_idx + j] = ""
|
|
478
494
|
|
|
479
495
|
cell = TableCell(
|
|
480
|
-
text=
|
|
496
|
+
text=cell_text,
|
|
481
497
|
row_span=row_span,
|
|
482
498
|
col_span=col_span,
|
|
483
499
|
start_row_offset_idx=row_idx,
|
|
484
500
|
end_row_offset_idx=row_idx + row_span,
|
|
485
501
|
start_col_offset_idx=col_idx,
|
|
486
502
|
end_col_offset_idx=col_idx + col_span,
|
|
487
|
-
col_header=False,
|
|
488
|
-
row_header=False,
|
|
503
|
+
col_header=False,
|
|
504
|
+
row_header=False,
|
|
489
505
|
)
|
|
490
506
|
|
|
491
507
|
data.table_cells.append(cell)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
|
|
7
|
+
from docling_core.transforms.chunker.hierarchical_chunker import (
|
|
8
|
+
DocChunk,
|
|
9
|
+
DocMeta,
|
|
10
|
+
HierarchicalChunker,
|
|
11
|
+
)
|
|
12
|
+
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
|
docling/cli/main.py
CHANGED
|
@@ -10,7 +10,9 @@ from pathlib import Path
|
|
|
10
10
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
|
11
11
|
|
|
12
12
|
import typer
|
|
13
|
+
from docling_core.types.doc import ImageRefMode
|
|
13
14
|
from docling_core.utils.file import resolve_source_to_path
|
|
15
|
+
from pydantic import TypeAdapter, ValidationError
|
|
14
16
|
|
|
15
17
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
16
18
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
|
@@ -86,9 +88,11 @@ def export_documents(
|
|
|
86
88
|
conv_results: Iterable[ConversionResult],
|
|
87
89
|
output_dir: Path,
|
|
88
90
|
export_json: bool,
|
|
91
|
+
export_html: bool,
|
|
89
92
|
export_md: bool,
|
|
90
93
|
export_txt: bool,
|
|
91
94
|
export_doctags: bool,
|
|
95
|
+
image_export_mode: ImageRefMode,
|
|
92
96
|
):
|
|
93
97
|
|
|
94
98
|
success_count = 0
|
|
@@ -99,33 +103,45 @@ def export_documents(
|
|
|
99
103
|
success_count += 1
|
|
100
104
|
doc_filename = conv_res.input.file.stem
|
|
101
105
|
|
|
102
|
-
# Export
|
|
106
|
+
# Export JSON format:
|
|
103
107
|
if export_json:
|
|
104
108
|
fname = output_dir / f"{doc_filename}.json"
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
109
|
+
_log.info(f"writing JSON output to {fname}")
|
|
110
|
+
conv_res.document.save_as_json(
|
|
111
|
+
filename=fname, image_mode=image_export_mode
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# Export HTML format:
|
|
115
|
+
if export_html:
|
|
116
|
+
fname = output_dir / f"{doc_filename}.html"
|
|
117
|
+
_log.info(f"writing HTML output to {fname}")
|
|
118
|
+
conv_res.document.save_as_html(
|
|
119
|
+
filename=fname, image_mode=image_export_mode
|
|
120
|
+
)
|
|
108
121
|
|
|
109
122
|
# Export Text format:
|
|
110
123
|
if export_txt:
|
|
111
124
|
fname = output_dir / f"{doc_filename}.txt"
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
125
|
+
_log.info(f"writing TXT output to {fname}")
|
|
126
|
+
conv_res.document.save_as_markdown(
|
|
127
|
+
filename=fname,
|
|
128
|
+
strict_text=True,
|
|
129
|
+
image_mode=ImageRefMode.PLACEHOLDER,
|
|
130
|
+
)
|
|
115
131
|
|
|
116
132
|
# Export Markdown format:
|
|
117
133
|
if export_md:
|
|
118
134
|
fname = output_dir / f"{doc_filename}.md"
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
135
|
+
_log.info(f"writing Markdown output to {fname}")
|
|
136
|
+
conv_res.document.save_as_markdown(
|
|
137
|
+
filename=fname, image_mode=image_export_mode
|
|
138
|
+
)
|
|
122
139
|
|
|
123
140
|
# Export Document Tags format:
|
|
124
141
|
if export_doctags:
|
|
125
142
|
fname = output_dir / f"{doc_filename}.doctags"
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
fp.write(conv_res.document.export_to_document_tokens())
|
|
143
|
+
_log.info(f"writing Doc Tags output to {fname}")
|
|
144
|
+
conv_res.document.save_as_document_tokens(filename=fname)
|
|
129
145
|
|
|
130
146
|
else:
|
|
131
147
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
|
@@ -160,6 +176,13 @@ def convert(
|
|
|
160
176
|
to_formats: List[OutputFormat] = typer.Option(
|
|
161
177
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
|
162
178
|
),
|
|
179
|
+
image_export_mode: Annotated[
|
|
180
|
+
ImageRefMode,
|
|
181
|
+
typer.Option(
|
|
182
|
+
...,
|
|
183
|
+
help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
|
|
184
|
+
),
|
|
185
|
+
] = ImageRefMode.EMBEDDED,
|
|
163
186
|
ocr: Annotated[
|
|
164
187
|
bool,
|
|
165
188
|
typer.Option(
|
|
@@ -185,7 +208,7 @@ def convert(
|
|
|
185
208
|
] = None,
|
|
186
209
|
pdf_backend: Annotated[
|
|
187
210
|
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
|
188
|
-
] = PdfBackend.
|
|
211
|
+
] = PdfBackend.DLPARSE_V2,
|
|
189
212
|
table_mode: Annotated[
|
|
190
213
|
TableFormerMode,
|
|
191
214
|
typer.Option(..., help="The mode to use in the table structure model."),
|
|
@@ -260,24 +283,45 @@ def convert(
|
|
|
260
283
|
with tempfile.TemporaryDirectory() as tempdir:
|
|
261
284
|
input_doc_paths: List[Path] = []
|
|
262
285
|
for src in input_sources:
|
|
263
|
-
|
|
264
|
-
|
|
286
|
+
try:
|
|
287
|
+
# check if we can fetch some remote url
|
|
288
|
+
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
|
289
|
+
input_doc_paths.append(source)
|
|
290
|
+
except FileNotFoundError:
|
|
265
291
|
err_console.print(
|
|
266
|
-
f"[red]Error: The input file {
|
|
292
|
+
f"[red]Error: The input file {src} does not exist.[/red]"
|
|
267
293
|
)
|
|
268
294
|
raise typer.Abort()
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
295
|
+
except IsADirectoryError:
|
|
296
|
+
# if the input matches to a file or a folder
|
|
297
|
+
try:
|
|
298
|
+
local_path = TypeAdapter(Path).validate_python(src)
|
|
299
|
+
if local_path.exists() and local_path.is_dir():
|
|
300
|
+
for fmt in from_formats:
|
|
301
|
+
for ext in FormatToExtensions[fmt]:
|
|
302
|
+
input_doc_paths.extend(
|
|
303
|
+
list(local_path.glob(f"**/*.{ext}"))
|
|
304
|
+
)
|
|
305
|
+
input_doc_paths.extend(
|
|
306
|
+
list(local_path.glob(f"**/*.{ext.upper()}"))
|
|
307
|
+
)
|
|
308
|
+
elif local_path.exists():
|
|
309
|
+
input_doc_paths.append(local_path)
|
|
310
|
+
else:
|
|
311
|
+
err_console.print(
|
|
312
|
+
f"[red]Error: The input file {src} does not exist.[/red]"
|
|
313
|
+
)
|
|
314
|
+
raise typer.Abort()
|
|
315
|
+
except Exception as err:
|
|
316
|
+
err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
|
|
317
|
+
_log.info(err) # will print more details if verbose is activated
|
|
318
|
+
raise typer.Abort()
|
|
276
319
|
|
|
277
320
|
if to_formats is None:
|
|
278
321
|
to_formats = [OutputFormat.MARKDOWN]
|
|
279
322
|
|
|
280
323
|
export_json = OutputFormat.JSON in to_formats
|
|
324
|
+
export_html = OutputFormat.HTML in to_formats
|
|
281
325
|
export_md = OutputFormat.MARKDOWN in to_formats
|
|
282
326
|
export_txt = OutputFormat.TEXT in to_formats
|
|
283
327
|
export_doctags = OutputFormat.DOCTAGS in to_formats
|
|
@@ -309,6 +353,13 @@ def convert(
|
|
|
309
353
|
)
|
|
310
354
|
pipeline_options.table_structure_options.mode = table_mode
|
|
311
355
|
|
|
356
|
+
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
|
357
|
+
pipeline_options.generate_page_images = True
|
|
358
|
+
pipeline_options.generate_picture_images = (
|
|
359
|
+
True # FIXME: to be deprecated in verson 3
|
|
360
|
+
)
|
|
361
|
+
pipeline_options.images_scale = 2
|
|
362
|
+
|
|
312
363
|
if artifacts_path is not None:
|
|
313
364
|
pipeline_options.artifacts_path = artifacts_path
|
|
314
365
|
|
|
@@ -321,11 +372,13 @@ def convert(
|
|
|
321
372
|
else:
|
|
322
373
|
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
|
323
374
|
|
|
375
|
+
pdf_format_option = PdfFormatOption(
|
|
376
|
+
pipeline_options=pipeline_options,
|
|
377
|
+
backend=backend, # pdf_backend
|
|
378
|
+
)
|
|
324
379
|
format_options: Dict[InputFormat, FormatOption] = {
|
|
325
|
-
InputFormat.PDF:
|
|
326
|
-
|
|
327
|
-
backend=backend, # pdf_backend
|
|
328
|
-
)
|
|
380
|
+
InputFormat.PDF: pdf_format_option,
|
|
381
|
+
InputFormat.IMAGE: pdf_format_option,
|
|
329
382
|
}
|
|
330
383
|
doc_converter = DocumentConverter(
|
|
331
384
|
allowed_formats=from_formats,
|
|
@@ -343,9 +396,11 @@ def convert(
|
|
|
343
396
|
conv_results,
|
|
344
397
|
output_dir=output,
|
|
345
398
|
export_json=export_json,
|
|
399
|
+
export_html=export_html,
|
|
346
400
|
export_md=export_md,
|
|
347
401
|
export_txt=export_txt,
|
|
348
402
|
export_doctags=export_doctags,
|
|
403
|
+
image_export_mode=image_export_mode,
|
|
349
404
|
)
|
|
350
405
|
|
|
351
406
|
end_time = time.time() - start_time
|
docling/datamodel/base_models.py
CHANGED
docling/datamodel/document.py
CHANGED
|
@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
|
|
|
33
33
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
|
34
34
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
|
35
35
|
from docling_core.utils.file import resolve_source_to_stream
|
|
36
|
+
from docling_core.utils.legacy import docling_document_to_legacy
|
|
36
37
|
from pydantic import BaseModel
|
|
37
38
|
from typing_extensions import deprecated
|
|
38
39
|
|
|
@@ -189,259 +190,7 @@ class ConversionResult(BaseModel):
|
|
|
189
190
|
@property
|
|
190
191
|
@deprecated("Use document instead.")
|
|
191
192
|
def legacy_document(self):
|
|
192
|
-
|
|
193
|
-
DocItemLabel.CAPTION.value: "Caption",
|
|
194
|
-
DocItemLabel.FOOTNOTE.value: "Footnote",
|
|
195
|
-
DocItemLabel.FORMULA.value: "Formula",
|
|
196
|
-
DocItemLabel.LIST_ITEM.value: "List-item",
|
|
197
|
-
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
|
|
198
|
-
DocItemLabel.PAGE_HEADER.value: "Page-header",
|
|
199
|
-
DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
|
|
200
|
-
DocItemLabel.SECTION_HEADER.value: "Section-header",
|
|
201
|
-
DocItemLabel.TABLE.value: "Table",
|
|
202
|
-
DocItemLabel.TEXT.value: "Text",
|
|
203
|
-
DocItemLabel.TITLE.value: "Title",
|
|
204
|
-
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
|
|
205
|
-
DocItemLabel.CODE.value: "Code",
|
|
206
|
-
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
|
|
207
|
-
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
|
|
208
|
-
DocItemLabel.FORM.value: "Form",
|
|
209
|
-
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
|
|
210
|
-
DocItemLabel.PARAGRAPH.value: "paragraph",
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
title = ""
|
|
214
|
-
desc = DsDocumentDescription(logs=[])
|
|
215
|
-
|
|
216
|
-
page_hashes = [
|
|
217
|
-
PageReference(
|
|
218
|
-
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
|
|
219
|
-
page=p.page_no,
|
|
220
|
-
model="default",
|
|
221
|
-
)
|
|
222
|
-
for p in self.document.pages.values()
|
|
223
|
-
]
|
|
224
|
-
|
|
225
|
-
file_info = DsFileInfoObject(
|
|
226
|
-
filename=self.input.file.name,
|
|
227
|
-
document_hash=self.input.document_hash,
|
|
228
|
-
num_pages=self.input.page_count,
|
|
229
|
-
page_hashes=page_hashes,
|
|
230
|
-
)
|
|
231
|
-
|
|
232
|
-
main_text = []
|
|
233
|
-
tables = []
|
|
234
|
-
figures = []
|
|
235
|
-
equations = []
|
|
236
|
-
footnotes = []
|
|
237
|
-
page_headers = []
|
|
238
|
-
page_footers = []
|
|
239
|
-
|
|
240
|
-
embedded_captions = set()
|
|
241
|
-
for ix, (item, level) in enumerate(
|
|
242
|
-
self.document.iterate_items(self.document.body)
|
|
243
|
-
):
|
|
244
|
-
|
|
245
|
-
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
|
|
246
|
-
caption = item.caption_text(self.document)
|
|
247
|
-
if caption:
|
|
248
|
-
embedded_captions.add(caption)
|
|
249
|
-
|
|
250
|
-
for item, level in self.document.iterate_items():
|
|
251
|
-
if isinstance(item, DocItem):
|
|
252
|
-
item_type = item.label
|
|
253
|
-
|
|
254
|
-
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
|
|
255
|
-
|
|
256
|
-
if isinstance(item, ListItem) and item.marker:
|
|
257
|
-
text = f"{item.marker} {item.text}"
|
|
258
|
-
else:
|
|
259
|
-
text = item.text
|
|
260
|
-
|
|
261
|
-
# Can be empty.
|
|
262
|
-
prov = [
|
|
263
|
-
Prov(
|
|
264
|
-
bbox=p.bbox.as_tuple(),
|
|
265
|
-
page=p.page_no,
|
|
266
|
-
span=[0, len(item.text)],
|
|
267
|
-
)
|
|
268
|
-
for p in item.prov
|
|
269
|
-
]
|
|
270
|
-
main_text.append(
|
|
271
|
-
BaseText(
|
|
272
|
-
text=text,
|
|
273
|
-
obj_type=layout_label_to_ds_type.get(item.label),
|
|
274
|
-
name=reverse_label_mapping[item.label],
|
|
275
|
-
prov=prov,
|
|
276
|
-
)
|
|
277
|
-
)
|
|
278
|
-
|
|
279
|
-
# skip captions of they are embedded in the actual
|
|
280
|
-
# floating object
|
|
281
|
-
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
|
|
282
|
-
continue
|
|
283
|
-
|
|
284
|
-
elif isinstance(item, TableItem) and item.data:
|
|
285
|
-
index = len(tables)
|
|
286
|
-
ref_str = f"#/tables/{index}"
|
|
287
|
-
main_text.append(
|
|
288
|
-
Ref(
|
|
289
|
-
name=reverse_label_mapping[item.label],
|
|
290
|
-
obj_type=layout_label_to_ds_type.get(item.label),
|
|
291
|
-
ref=ref_str,
|
|
292
|
-
),
|
|
293
|
-
)
|
|
294
|
-
|
|
295
|
-
# Initialise empty table data grid (only empty cells)
|
|
296
|
-
table_data = [
|
|
297
|
-
[
|
|
298
|
-
TableCell(
|
|
299
|
-
text="",
|
|
300
|
-
# bbox=[0,0,0,0],
|
|
301
|
-
spans=[[i, j]],
|
|
302
|
-
obj_type="body",
|
|
303
|
-
)
|
|
304
|
-
for j in range(item.data.num_cols)
|
|
305
|
-
]
|
|
306
|
-
for i in range(item.data.num_rows)
|
|
307
|
-
]
|
|
308
|
-
|
|
309
|
-
# Overwrite cells in table data for which there is actual cell content.
|
|
310
|
-
for cell in item.data.table_cells:
|
|
311
|
-
for i in range(
|
|
312
|
-
min(cell.start_row_offset_idx, item.data.num_rows),
|
|
313
|
-
min(cell.end_row_offset_idx, item.data.num_rows),
|
|
314
|
-
):
|
|
315
|
-
for j in range(
|
|
316
|
-
min(cell.start_col_offset_idx, item.data.num_cols),
|
|
317
|
-
min(cell.end_col_offset_idx, item.data.num_cols),
|
|
318
|
-
):
|
|
319
|
-
celltype = "body"
|
|
320
|
-
if cell.column_header:
|
|
321
|
-
celltype = "col_header"
|
|
322
|
-
elif cell.row_header:
|
|
323
|
-
celltype = "row_header"
|
|
324
|
-
elif cell.row_section:
|
|
325
|
-
celltype = "row_section"
|
|
326
|
-
|
|
327
|
-
def make_spans(cell):
|
|
328
|
-
for rspan in range(
|
|
329
|
-
min(
|
|
330
|
-
cell.start_row_offset_idx,
|
|
331
|
-
item.data.num_rows,
|
|
332
|
-
),
|
|
333
|
-
min(
|
|
334
|
-
cell.end_row_offset_idx, item.data.num_rows
|
|
335
|
-
),
|
|
336
|
-
):
|
|
337
|
-
for cspan in range(
|
|
338
|
-
min(
|
|
339
|
-
cell.start_col_offset_idx,
|
|
340
|
-
item.data.num_cols,
|
|
341
|
-
),
|
|
342
|
-
min(
|
|
343
|
-
cell.end_col_offset_idx,
|
|
344
|
-
item.data.num_cols,
|
|
345
|
-
),
|
|
346
|
-
):
|
|
347
|
-
yield [rspan, cspan]
|
|
348
|
-
|
|
349
|
-
spans = list(make_spans(cell))
|
|
350
|
-
table_data[i][j] = GlmTableCell(
|
|
351
|
-
text=cell.text,
|
|
352
|
-
bbox=(
|
|
353
|
-
cell.bbox.as_tuple()
|
|
354
|
-
if cell.bbox is not None
|
|
355
|
-
else None
|
|
356
|
-
), # check if this is bottom-left
|
|
357
|
-
spans=spans,
|
|
358
|
-
obj_type=celltype,
|
|
359
|
-
col=j,
|
|
360
|
-
row=i,
|
|
361
|
-
row_header=cell.row_header,
|
|
362
|
-
row_section=cell.row_section,
|
|
363
|
-
col_header=cell.column_header,
|
|
364
|
-
row_span=[
|
|
365
|
-
cell.start_row_offset_idx,
|
|
366
|
-
cell.end_row_offset_idx,
|
|
367
|
-
],
|
|
368
|
-
col_span=[
|
|
369
|
-
cell.start_col_offset_idx,
|
|
370
|
-
cell.end_col_offset_idx,
|
|
371
|
-
],
|
|
372
|
-
)
|
|
373
|
-
|
|
374
|
-
# Compute the caption
|
|
375
|
-
caption = item.caption_text(self.document)
|
|
376
|
-
|
|
377
|
-
tables.append(
|
|
378
|
-
DsSchemaTable(
|
|
379
|
-
text=caption,
|
|
380
|
-
num_cols=item.data.num_cols,
|
|
381
|
-
num_rows=item.data.num_rows,
|
|
382
|
-
obj_type=layout_label_to_ds_type.get(item.label),
|
|
383
|
-
data=table_data,
|
|
384
|
-
prov=[
|
|
385
|
-
Prov(
|
|
386
|
-
bbox=p.bbox.as_tuple(),
|
|
387
|
-
page=p.page_no,
|
|
388
|
-
span=[0, 0],
|
|
389
|
-
)
|
|
390
|
-
for p in item.prov
|
|
391
|
-
],
|
|
392
|
-
)
|
|
393
|
-
)
|
|
394
|
-
|
|
395
|
-
elif isinstance(item, PictureItem):
|
|
396
|
-
index = len(figures)
|
|
397
|
-
ref_str = f"#/figures/{index}"
|
|
398
|
-
main_text.append(
|
|
399
|
-
Ref(
|
|
400
|
-
name=reverse_label_mapping[item.label],
|
|
401
|
-
obj_type=layout_label_to_ds_type.get(item.label),
|
|
402
|
-
ref=ref_str,
|
|
403
|
-
),
|
|
404
|
-
)
|
|
405
|
-
|
|
406
|
-
# Compute the caption
|
|
407
|
-
caption = item.caption_text(self.document)
|
|
408
|
-
|
|
409
|
-
figures.append(
|
|
410
|
-
Figure(
|
|
411
|
-
prov=[
|
|
412
|
-
Prov(
|
|
413
|
-
bbox=p.bbox.as_tuple(),
|
|
414
|
-
page=p.page_no,
|
|
415
|
-
span=[0, len(caption)],
|
|
416
|
-
)
|
|
417
|
-
for p in item.prov
|
|
418
|
-
],
|
|
419
|
-
obj_type=layout_label_to_ds_type.get(item.label),
|
|
420
|
-
text=caption,
|
|
421
|
-
# data=[[]],
|
|
422
|
-
)
|
|
423
|
-
)
|
|
424
|
-
|
|
425
|
-
page_dimensions = [
|
|
426
|
-
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
|
|
427
|
-
for p in self.document.pages.values()
|
|
428
|
-
]
|
|
429
|
-
|
|
430
|
-
ds_doc = DsDocument(
|
|
431
|
-
name=title,
|
|
432
|
-
description=desc,
|
|
433
|
-
file_info=file_info,
|
|
434
|
-
main_text=main_text,
|
|
435
|
-
equations=equations,
|
|
436
|
-
footnotes=footnotes,
|
|
437
|
-
page_headers=page_headers,
|
|
438
|
-
page_footers=page_footers,
|
|
439
|
-
tables=tables,
|
|
440
|
-
figures=figures,
|
|
441
|
-
page_dimensions=page_dimensions,
|
|
442
|
-
)
|
|
443
|
-
|
|
444
|
-
return ds_doc
|
|
193
|
+
return docling_document_to_legacy(self.document)
|
|
445
194
|
|
|
446
195
|
|
|
447
196
|
class _DummyBackend(AbstractDocumentBackend):
|
|
@@ -143,7 +143,11 @@ class PdfPipelineOptions(PipelineOptions):
|
|
|
143
143
|
|
|
144
144
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
|
145
145
|
ocr_options: Union[
|
|
146
|
-
EasyOcrOptions,
|
|
146
|
+
EasyOcrOptions,
|
|
147
|
+
TesseractCliOcrOptions,
|
|
148
|
+
TesseractOcrOptions,
|
|
149
|
+
OcrMacOptions,
|
|
150
|
+
RapidOcrOptions,
|
|
147
151
|
] = Field(EasyOcrOptions(), discriminator="kind")
|
|
148
152
|
|
|
149
153
|
images_scale: float = 1.0
|
docling/document_converter.py
CHANGED
|
@@ -9,7 +9,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
|
|
9
9
|
|
|
10
10
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
11
11
|
from docling.backend.asciidoc_backend import AsciiDocBackend
|
|
12
|
-
from docling.backend.
|
|
12
|
+
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
|
13
13
|
from docling.backend.html_backend import HTMLDocumentBackend
|
|
14
14
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
|
15
15
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
|
@@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):
|
|
|
84
84
|
|
|
85
85
|
class PdfFormatOption(FormatOption):
|
|
86
86
|
pipeline_cls: Type = StandardPdfPipeline
|
|
87
|
-
backend: Type[AbstractDocumentBackend] =
|
|
87
|
+
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
|
88
88
|
|
|
89
89
|
|
|
90
90
|
class ImageFormatOption(FormatOption):
|
|
91
91
|
pipeline_cls: Type = StandardPdfPipeline
|
|
92
|
-
backend: Type[AbstractDocumentBackend] =
|
|
92
|
+
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
|
93
93
|
|
|
94
94
|
|
|
95
95
|
def _get_default_option(format: InputFormat) -> FormatOption:
|
|
@@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
|
113
113
|
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
|
114
114
|
),
|
|
115
115
|
InputFormat.IMAGE: FormatOption(
|
|
116
|
-
pipeline_cls=StandardPdfPipeline, backend=
|
|
116
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
|
117
117
|
),
|
|
118
118
|
InputFormat.PDF: FormatOption(
|
|
119
|
-
pipeline_cls=StandardPdfPipeline, backend=
|
|
119
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
|
120
120
|
),
|
|
121
121
|
}
|
|
122
122
|
if (options := format_to_default_options.get(format)) is not None:
|
docling/models/ds_glm_model.py
CHANGED
|
@@ -4,7 +4,6 @@ from pathlib import Path
|
|
|
4
4
|
from typing import List, Union
|
|
5
5
|
|
|
6
6
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
|
7
|
-
from deepsearch_glm.utils.doc_utils import to_docling_document
|
|
8
7
|
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
|
9
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
|
|
10
9
|
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
|
@@ -29,6 +28,7 @@ from pydantic import BaseModel, ConfigDict
|
|
|
29
28
|
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
|
30
29
|
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
|
31
30
|
from docling.datamodel.settings import settings
|
|
31
|
+
from docling.utils.glm_utils import to_docling_document
|
|
32
32
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
|
33
33
|
from docling.utils.utils import create_hash
|
|
34
34
|
|
|
@@ -232,7 +232,7 @@ class GlmModel:
|
|
|
232
232
|
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
|
233
233
|
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
|
234
234
|
ds_doc = self._to_legacy_document(conv_res)
|
|
235
|
-
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
|
235
|
+
ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
|
|
236
236
|
|
|
237
237
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
|
238
238
|
|
|
@@ -97,7 +97,9 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
|
97
97
|
local_dir: Optional[Path] = None, force: bool = False
|
|
98
98
|
) -> Path:
|
|
99
99
|
from huggingface_hub import snapshot_download
|
|
100
|
+
from huggingface_hub.utils import disable_progress_bars
|
|
100
101
|
|
|
102
|
+
disable_progress_bars()
|
|
101
103
|
download_path = snapshot_download(
|
|
102
104
|
repo_id="ds4sd/docling-models",
|
|
103
105
|
force_download=force,
|
docling/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from docling_core.types.doc import (
|
|
7
|
+
BoundingBox,
|
|
8
|
+
CoordOrigin,
|
|
9
|
+
DocItemLabel,
|
|
10
|
+
DoclingDocument,
|
|
11
|
+
DocumentOrigin,
|
|
12
|
+
GroupLabel,
|
|
13
|
+
ProvenanceItem,
|
|
14
|
+
Size,
|
|
15
|
+
TableCell,
|
|
16
|
+
TableData,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def resolve_item(paths, obj):
    """Find an item in a nested document from a reference path.

    The path is walked one segment at a time, starting at ``obj``.

    Args:
        paths: Sequence of path segments, typically the result of
            ``"#/tables/0/data".split("/")``. ``"#"`` segments are skipped.
        obj: Root container (nested dicts / lists) to resolve against.

    Returns:
        The referenced item, ``obj`` itself when ``paths`` is empty, or
        ``None`` when any segment cannot be resolved.
    """
    current = obj

    for segment in paths:
        if segment == "#":
            # Document-root marker: it does not descend into anything.
            continue

        # Segments that look like integers address list positions.
        try:
            index = int(segment)
        except (TypeError, ValueError):
            index = None

        if isinstance(current, dict):
            if segment in current:
                current = current[segment]
            elif index is not None and index in current:
                # Keeps dicts keyed by real integers addressable.
                current = current[index]
            else:
                return None
        elif isinstance(current, (list, tuple)):
            # Reject non-numeric, negative and out-of-range positions instead
            # of raising or silently indexing from the end of the list.
            if index is None or not 0 <= index < len(current):
                return None
            current = current[index]
        else:
            # A scalar (or None) cannot be descended into.
            return None

    return current
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
|
|
55
|
+
unique_objects = []
|
|
56
|
+
seen_spans = set()
|
|
57
|
+
|
|
58
|
+
for sublist in grid:
|
|
59
|
+
for obj in sublist:
|
|
60
|
+
# Convert the spans list to a tuple of tuples for hashing
|
|
61
|
+
spans_tuple = tuple(tuple(span) for span in obj["spans"])
|
|
62
|
+
if spans_tuple not in seen_spans:
|
|
63
|
+
seen_spans.add(spans_tuple)
|
|
64
|
+
unique_objects.append(obj)
|
|
65
|
+
|
|
66
|
+
return unique_objects
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Page elements whose iref points at a figure/table caption are emitted
# together with their owning figure/table, so they are skipped in the main loop.
_CAPTION_IREF_PATTERN = re.compile(r"#/(?:figures|tables)/\d+/captions/.+")


def _page_element_prov(pelem, charspan):
    """Build the provenance of a page element from its page and bbox."""
    return ProvenanceItem(
        page_no=pelem["page"],
        charspan=charspan,
        bbox=BoundingBox.from_tuple(pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT),
    )


def _collect_caption_refs(doc, doc_glm, obj):
    """Add the captions of a GLM figure/table to ``doc``.

    Returns a ``(text, refs)`` tuple: the concatenated text of all captions
    and the references of the caption text items that were added.
    """
    caption_text = ""
    caption_refs = []

    for caption in obj["captions"]:
        caption_text += caption["text"]

        for nprov in caption["prov"]:
            nelem = resolve_item(nprov["$ref"].split("/"), doc_glm)
            if nelem is None:
                # Dangling reference: nothing to take page/bbox/span from.
                continue

            span_i, span_j = nelem["span"][0], nelem["span"][1]
            caption_obj = doc.add_text(
                label=DocItemLabel.CAPTION,
                text=caption["text"][span_i:span_j],
                prov=_page_element_prov(nelem, tuple(nelem["span"])),
            )
            caption_refs.append(caption_obj.get_ref())

    return caption_text, caption_refs


def _to_table_cell(tbl_cell_glm):
    """Convert one GLM table cell dictionary into a ``TableCell``."""
    # ``.get`` tolerates cells whose bbox key is absent as well as cells with
    # an explicit ``None`` bbox.
    # NOTE(review): absent keys look possible when the input was dumped with
    # ``exclude_none=True`` - confirm against the GLM output.
    raw_bbox = tbl_cell_glm.get("bbox")
    if raw_bbox is not None:
        bbox = BoundingBox.from_tuple(raw_bbox, origin=CoordOrigin.BOTTOMLEFT)
    else:
        bbox = None

    row_start, row_end = tbl_cell_glm["row-span"][0], tbl_cell_glm["row-span"][1]
    col_start, col_end = tbl_cell_glm["col-span"][0], tbl_cell_glm["col-span"][1]
    cell_type = tbl_cell_glm["type"]

    return TableCell(
        row_span=row_end - row_start,
        col_span=col_end - col_start,
        start_row_offset_idx=row_start,
        end_row_offset_idx=row_end,
        start_col_offset_idx=col_start,
        end_col_offset_idx=col_end,
        text=tbl_cell_glm["text"],
        bbox=bbox,
        column_header=cell_type == "col_header",
        row_header=cell_type == "row_header",
        row_section=cell_type == "row_section",
    )


def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
    """Convert a GLM document dictionary into a ``DoclingDocument``.

    Args:
        doc_glm: GLM document as a dictionary. The keys ``"file-info"``,
            ``"page-dimensions"`` and ``"page-elements"`` are read
            unconditionally; ``"properties"`` is optional.
        update_name_label: When true and properties are available, a
            ``"paragraph"`` element takes the label of its ``"semantic"``
            property if exactly one such property matches the element's
            ``iref`` and its confidence is above 0.85.

    Returns:
        The populated document. Page elements without an ``"iref"``, caption
        elements, and elements whose ``iref`` cannot be resolved are skipped;
        the latter are reported through the module logger.
    """
    # Imported locally so this function does not rely on the module header.
    import logging

    log = logging.getLogger(__name__)

    file_info = doc_glm["file-info"]
    origin = DocumentOrigin(
        mimetype="application/pdf",  # the origin is always recorded as PDF here
        filename=file_info["filename"],
        binary_hash=file_info["document-hash"],
    )
    doc: DoclingDocument = DoclingDocument(
        name=Path(origin.filename).stem, origin=origin
    )

    for page_dim in doc_glm["page-dimensions"]:
        doc.add_page(
            page_no=int(page_dim["page"]),
            size=Size(width=page_dim["width"], height=page_dim["height"]),
        )

    if "properties" in doc_glm:
        props = pd.DataFrame(
            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
        )
    else:
        props = pd.DataFrame()

    # Open list group; consecutive list items are attached to the same group.
    current_list = None

    for pelem in doc_glm["page-elements"]:
        ptype = pelem["type"]
        span_i = pelem["span"][0]
        span_j = pelem["span"][1]

        if "iref" not in pelem:
            continue

        iref = pelem["iref"]
        if _CAPTION_IREF_PATTERN.match(iref):
            continue

        path = iref.split("/")
        obj = resolve_item(path, doc_glm)

        if obj is None:
            current_list = None
            log.warning("undefined %s", path)
            continue

        if ptype == "figure":
            current_list = None
            text, caption_refs = _collect_caption_refs(doc, doc_glm, obj)

            pic = doc.add_picture(prov=_page_element_prov(pelem, (0, len(text))))
            pic.captions.extend(caption_refs)
            _add_child_elements(pic, doc, obj, pelem)

        elif ptype == "table":
            current_list = None
            _, caption_refs = _collect_caption_refs(doc, doc_glm, obj)

            tbl_data = TableData(
                num_rows=obj.get("#-rows", 0),
                num_cols=obj.get("#-cols", 0),
                table_cells=[
                    _to_table_cell(cell) for cell in _flatten_table_grid(obj["data"])
                ],
            )

            tbl = doc.add_table(
                data=tbl_data, prov=_page_element_prov(pelem, (0, 0))
            )
            tbl.captions.extend(caption_refs)

        elif ptype in ["form", "key_value_region"]:
            label = DocItemLabel(ptype)
            container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)

            _add_child_elements(container_el, doc, obj, pelem)

        elif "text" in obj:
            text = obj["text"][span_i:span_j]

            name_label = pelem["name"]
            if update_name_label and len(props) > 0 and ptype == "paragraph":
                prop = props[
                    (props["type"] == "semantic") & (props["subj_path"] == iref)
                ]
                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
                    name_label = prop.iloc[0]["label"]

            prov = _page_element_prov(pelem, (0, len(text)))
            label = DocItemLabel(name_label)

            if label == DocItemLabel.LIST_ITEM:
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")

                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )
            elif label == DocItemLabel.SECTION_HEADER:
                current_list = None

                doc.add_heading(text=text, prov=prov)
            else:
                current_list = None

                doc.add_text(label=label, text=text, prov=prov)

    return doc
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _add_child_elements(container_el, doc, obj, pelem):
    """Attach the payload children of ``obj`` to ``container_el`` in ``doc``.

    Nothing is added when ``obj`` has no ``"payload"``. Each child becomes a
    list item, a heading or a plain text item depending on its label; its
    text is the space-joined, stripped text of its non-blank cells with
    ``"\x02"`` replaced by ``"-"``.
    """
    payload = obj.get("payload")
    if payload is None:
        return

    for child in payload.get("children", []):
        child_label = DocItemLabel(child["label"])

        # Child boxes are converted to bottom-left origin using the height of
        # the page that holds the parent element.
        page_height = doc.pages[pelem["page"]].size.height
        child_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
            page_height
        )

        fragments = (cell["text"] for cell in child["cells"])
        child_text = " ".join(
            fragment.replace("\x02", "-").strip()
            for fragment in fragments
            if fragment.strip()
        )

        child_prov = ProvenanceItem(
            page_no=pelem["page"], charspan=(0, len(child_text)), bbox=child_bbox
        )

        if child_label == DocItemLabel.LIST_ITEM:
            # TODO: Infer if this is a numbered or a bullet list item
            doc.add_list_item(parent=container_el, text=child_text, prov=child_prov)
            continue

        if child_label == DocItemLabel.SECTION_HEADER:
            doc.add_heading(parent=container_el, text=child_text, prov=child_prov)
            continue

        doc.add_text(
            parent=container_el, label=child_label, text=child_text, prov=child_prov
        )
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.10.0
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
|
6
6
|
License: MIT
|
|
@@ -25,10 +25,10 @@ Provides-Extra: rapidocr
|
|
|
25
25
|
Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
|
28
|
-
Requires-Dist: deepsearch-glm (>=0.
|
|
29
|
-
Requires-Dist: docling-core (>=2.
|
|
28
|
+
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
|
29
|
+
Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
|
|
30
30
|
Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
|
|
31
|
-
Requires-Dist: docling-parse (>=
|
|
31
|
+
Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
|
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
|
33
33
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
|
34
34
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
|
@@ -39,7 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
|
|
|
39
39
|
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
|
40
40
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
|
41
41
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
42
|
-
Requires-Dist: pydantic (>=2.0.0,<
|
|
42
|
+
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
|
43
43
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
|
44
44
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
|
45
45
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
|
@@ -59,7 +59,7 @@ Description-Content-Type: text/markdown
|
|
|
59
59
|
</a>
|
|
60
60
|
</p>
|
|
61
61
|
|
|
62
|
-
#
|
|
62
|
+
# Docling
|
|
63
63
|
|
|
64
64
|
<p align="center">
|
|
65
65
|
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
|
@@ -81,7 +81,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
|
81
81
|
|
|
82
82
|
## Features
|
|
83
83
|
|
|
84
|
-
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
|
84
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
|
85
85
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
|
86
86
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
|
87
87
|
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
|
|
@@ -2,28 +2,29 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
|
|
4
4
|
docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
|
|
5
|
-
docling/backend/docling_parse_backend.py,sha256=
|
|
6
|
-
docling/backend/docling_parse_v2_backend.py,sha256=
|
|
5
|
+
docling/backend/docling_parse_backend.py,sha256=_jY5f5-KGI3hi5pcZAY6e7tPLocSi5JUWrxraDVszqI,7631
|
|
6
|
+
docling/backend/docling_parse_v2_backend.py,sha256=1TDUdMIp3fEjCWBNjusUHiCUmH1g6yZQ-b13scofP0Y,8637
|
|
7
7
|
docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
|
|
8
8
|
docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
|
|
9
9
|
docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
|
|
10
10
|
docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
|
|
11
|
-
docling/backend/msword_backend.py,sha256=
|
|
11
|
+
docling/backend/msword_backend.py,sha256=K1D_h0ulLA6KQsPe62327cDVkQqV1f7EetCHo66wCKw,19233
|
|
12
12
|
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
|
13
13
|
docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
|
|
14
|
+
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
|
14
15
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
docling/cli/main.py,sha256=
|
|
16
|
+
docling/cli/main.py,sha256=bLk1RG0jwM4dn6G5qa5Q-S4_N3agKnoE28pTfbpV4-k,14713
|
|
16
17
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
docling/datamodel/base_models.py,sha256=
|
|
18
|
-
docling/datamodel/document.py,sha256=
|
|
19
|
-
docling/datamodel/pipeline_options.py,sha256=
|
|
18
|
+
docling/datamodel/base_models.py,sha256=627IB8HZdXGmHNfsX4Qhf7kKSxx2btPjS7z8hitvhyE,5560
|
|
19
|
+
docling/datamodel/document.py,sha256=GNlTsgKgDqdqv2dfhpYmnqymxDQWWWC8HgE8uAta8V4,10265
|
|
20
|
+
docling/datamodel/pipeline_options.py,sha256=zQxLVioyBrldI4V9phQma1kTTgjmFQ6d3gVj2xq51gw,5010
|
|
20
21
|
docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
|
|
21
|
-
docling/document_converter.py,sha256=
|
|
22
|
+
docling/document_converter.py,sha256=Iz5eerBWFPVJoXAMlXEivRQX2VLBiUkA07BL4NNbaEs,11583
|
|
22
23
|
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
|
23
24
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
25
|
docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
|
|
25
26
|
docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
|
|
26
|
-
docling/models/ds_glm_model.py,sha256=
|
|
27
|
+
docling/models/ds_glm_model.py,sha256=3UpFu3Oavw9p0GItx2S9R7bPDdjY2NvpUQQDSVMctys,12045
|
|
27
28
|
docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
|
|
28
29
|
docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
|
|
29
30
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
|
@@ -36,14 +37,16 @@ docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUs
|
|
|
36
37
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
38
|
docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
|
|
38
39
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
|
39
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
|
40
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=B1q8xt3Dfecpi8s8DrcfPzdATh8TYgL43FDzzcS4vEA,8885
|
|
41
|
+
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
40
42
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
43
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
|
44
|
+
docling/utils/glm_utils.py,sha256=H1O_tDiRksMgw45rY9LhK6GjcZSOq5IyoGurGjoo-Ac,11211
|
|
42
45
|
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
|
43
46
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
|
44
47
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
|
45
|
-
docling-2.
|
|
46
|
-
docling-2.
|
|
47
|
-
docling-2.
|
|
48
|
-
docling-2.
|
|
49
|
-
docling-2.
|
|
48
|
+
docling-2.10.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
|
49
|
+
docling-2.10.0.dist-info/METADATA,sha256=YVI-dBKxqAxrLATigzeXPZvwDZUhLSl_doltc-HenQ4,7731
|
|
50
|
+
docling-2.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
51
|
+
docling-2.10.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
|
52
|
+
docling-2.10.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|