docling 2.18.0__py3-none-any.whl → 2.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/md_backend.py +62 -46
- docling/backend/msword_backend.py +1 -1
- docling/cli/main.py +8 -0
- docling/cli/models.py +105 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/settings.py +2 -0
- docling/models/base_model.py +3 -0
- docling/models/code_formula_model.py +15 -9
- docling/models/document_picture_classifier.py +11 -8
- docling/models/easyocr_model.py +50 -3
- docling/models/layout_model.py +49 -3
- docling/models/table_structure_model.py +44 -2
- docling/pipeline/base_pipeline.py +1 -1
- docling/pipeline/standard_pdf_pipeline.py +25 -24
- docling/utils/model_downloader.py +72 -0
- docling/utils/utils.py +24 -0
- {docling-2.18.0.dist-info → docling-2.19.0.dist-info}/METADATA +5 -4
- {docling-2.18.0.dist-info → docling-2.19.0.dist-info}/RECORD +21 -18
- {docling-2.18.0.dist-info → docling-2.19.0.dist-info}/entry_points.txt +1 -0
- {docling-2.18.0.dist-info → docling-2.19.0.dist-info}/LICENSE +0 -0
- {docling-2.18.0.dist-info → docling-2.19.0.dist-info}/WHEEL +0 -0
docling/backend/md_backend.py
CHANGED
@@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
|
36
36
|
|
37
37
|
|
38
38
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
39
|
-
def
|
39
|
+
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
40
40
|
# This regex will match any sequence of underscores
|
41
41
|
pattern = r"_+"
|
42
42
|
|
@@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
81
81
|
# very long sequences of underscores will lead to unnecessary long processing times.
|
82
82
|
# In any proper Markdown files, underscores have to be escaped,
|
83
83
|
# otherwise they represent emphasis (bold or italic)
|
84
|
-
self.markdown = self.
|
84
|
+
self.markdown = self._shorten_underscore_sequences(text_stream)
|
85
85
|
if isinstance(self.path_or_stream, Path):
|
86
86
|
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
87
87
|
md_content = f.read()
|
@@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
89
89
|
# very long sequences of underscores will lead to unnecessary long processing times.
|
90
90
|
# In any proper Markdown files, underscores have to be escaped,
|
91
91
|
# otherwise they represent emphasis (bold or italic)
|
92
|
-
self.markdown = self.
|
92
|
+
self.markdown = self._shorten_underscore_sequences(md_content)
|
93
93
|
self.valid = True
|
94
94
|
|
95
95
|
_log.debug(self.markdown)
|
@@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
99
99
|
) from e
|
100
100
|
return
|
101
101
|
|
102
|
-
def
|
102
|
+
def _close_table(self, doc: DoclingDocument):
|
103
103
|
if self.in_table:
|
104
104
|
_log.debug("=== TABLE START ===")
|
105
105
|
for md_table_row in self.md_table_buffer:
|
@@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
156
156
|
doc.add_table(data=table_data)
|
157
157
|
return
|
158
158
|
|
159
|
-
def
|
160
|
-
self,
|
159
|
+
def _process_inline_text(
|
160
|
+
self, parent_item: Optional[NodeItem], doc: DoclingDocument
|
161
161
|
):
|
162
162
|
txt = " ".join(self.inline_texts)
|
163
163
|
if len(txt) > 0:
|
164
164
|
doc.add_text(
|
165
165
|
label=DocItemLabel.PARAGRAPH,
|
166
|
-
parent=
|
166
|
+
parent=parent_item,
|
167
167
|
text=txt,
|
168
168
|
)
|
169
169
|
self.inline_texts = []
|
170
170
|
|
171
|
-
def
|
171
|
+
def _iterate_elements(
|
172
172
|
self,
|
173
173
|
element: marko.element.Element,
|
174
174
|
depth: int,
|
175
175
|
doc: DoclingDocument,
|
176
|
-
|
176
|
+
visited: Set[marko.element.Element],
|
177
|
+
parent_item: Optional[NodeItem] = None,
|
177
178
|
):
|
179
|
+
|
180
|
+
if element in visited:
|
181
|
+
return
|
182
|
+
|
178
183
|
# Iterates over all elements in the AST
|
179
184
|
# Check for different element types and process relevant details
|
180
185
|
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
181
|
-
self.
|
182
|
-
self.
|
186
|
+
self._close_table(doc)
|
187
|
+
self._process_inline_text(parent_item, doc)
|
183
188
|
_log.debug(
|
184
189
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
185
190
|
)
|
@@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
207
212
|
traverse(element)
|
208
213
|
snippet_text = "".join(strings)
|
209
214
|
if len(snippet_text) > 0:
|
210
|
-
|
211
|
-
label=doc_label, parent=
|
215
|
+
parent_item = doc.add_text(
|
216
|
+
label=doc_label, parent=parent_item, text=snippet_text
|
212
217
|
)
|
213
218
|
|
214
219
|
elif isinstance(element, marko.block.List):
|
@@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
218
223
|
has_non_empty_list_items = True
|
219
224
|
break
|
220
225
|
|
221
|
-
self.
|
222
|
-
self.
|
226
|
+
self._close_table(doc)
|
227
|
+
self._process_inline_text(parent_item, doc)
|
223
228
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
224
229
|
if has_non_empty_list_items:
|
225
230
|
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
226
|
-
|
227
|
-
label=label, name=f"list", parent=
|
231
|
+
parent_item = doc.add_group(
|
232
|
+
label=label, name=f"list", parent=parent_item
|
228
233
|
)
|
229
234
|
|
230
235
|
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
|
231
|
-
self.
|
232
|
-
self.
|
236
|
+
self._close_table(doc)
|
237
|
+
self._process_inline_text(parent_item, doc)
|
233
238
|
_log.debug(" - List item")
|
234
239
|
|
235
|
-
|
240
|
+
first_child = element.children[0]
|
241
|
+
snippet_text = str(first_child.children[0].children) # type: ignore
|
236
242
|
is_numbered = False
|
237
243
|
if (
|
238
|
-
|
239
|
-
and isinstance(
|
240
|
-
and
|
244
|
+
parent_item is not None
|
245
|
+
and isinstance(parent_item, DocItem)
|
246
|
+
and parent_item.label == GroupLabel.ORDERED_LIST
|
241
247
|
):
|
242
248
|
is_numbered = True
|
243
249
|
doc.add_list_item(
|
244
|
-
enumerated=is_numbered, parent=
|
250
|
+
enumerated=is_numbered, parent=parent_item, text=snippet_text
|
245
251
|
)
|
252
|
+
visited.add(first_child)
|
246
253
|
|
247
254
|
elif isinstance(element, marko.inline.Image):
|
248
|
-
self.
|
249
|
-
self.
|
255
|
+
self._close_table(doc)
|
256
|
+
self._process_inline_text(parent_item, doc)
|
250
257
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
251
258
|
|
252
259
|
fig_caption: Optional[TextItem] = None
|
@@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
255
262
|
label=DocItemLabel.CAPTION, text=element.title
|
256
263
|
)
|
257
264
|
|
258
|
-
doc.add_picture(parent=
|
265
|
+
doc.add_picture(parent=parent_item, caption=fig_caption)
|
259
266
|
|
260
267
|
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
|
261
|
-
self.
|
268
|
+
self._process_inline_text(parent_item, doc)
|
262
269
|
|
263
270
|
elif isinstance(element, marko.inline.RawText):
|
264
271
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
@@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
272
279
|
else:
|
273
280
|
self.md_table_buffer.append(snippet_text)
|
274
281
|
else:
|
275
|
-
self.
|
276
|
-
self.in_table = False
|
282
|
+
self._close_table(doc)
|
277
283
|
# most likely just inline text
|
278
284
|
self.inline_texts.append(str(element.children))
|
279
285
|
|
280
286
|
elif isinstance(element, marko.inline.CodeSpan):
|
281
|
-
self.
|
282
|
-
self.
|
287
|
+
self._close_table(doc)
|
288
|
+
self._process_inline_text(parent_item, doc)
|
283
289
|
_log.debug(f" - Code Span: {element.children}")
|
284
290
|
snippet_text = str(element.children).strip()
|
285
|
-
doc.add_code(parent=
|
291
|
+
doc.add_code(parent=parent_item, text=snippet_text)
|
286
292
|
|
287
293
|
elif (
|
288
294
|
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
@@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
290
296
|
and isinstance((first_child := element.children[0]), marko.inline.RawText)
|
291
297
|
and len(snippet_text := (first_child.children.strip())) > 0
|
292
298
|
):
|
293
|
-
self.
|
294
|
-
self.
|
299
|
+
self._close_table(doc)
|
300
|
+
self._process_inline_text(parent_item, doc)
|
295
301
|
_log.debug(f" - Code Block: {element.children}")
|
296
|
-
doc.add_code(parent=
|
302
|
+
doc.add_code(parent=parent_item, text=snippet_text)
|
297
303
|
|
298
304
|
elif isinstance(element, marko.inline.LineBreak):
|
299
305
|
if self.in_table:
|
@@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
302
308
|
|
303
309
|
elif isinstance(element, marko.block.HTMLBlock):
|
304
310
|
self._html_blocks += 1
|
305
|
-
self.
|
306
|
-
self.
|
311
|
+
self._process_inline_text(parent_item, doc)
|
312
|
+
self._close_table(doc)
|
307
313
|
_log.debug("HTML Block: {}".format(element))
|
308
314
|
if (
|
309
315
|
len(element.body) > 0
|
@@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
312
318
|
|
313
319
|
# wrap in markers to enable post-processing in convert()
|
314
320
|
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
315
|
-
doc.add_code(parent=
|
321
|
+
doc.add_code(parent=parent_item, text=text_to_add)
|
316
322
|
else:
|
317
323
|
if not isinstance(element, str):
|
318
|
-
self.
|
324
|
+
self._close_table(doc)
|
319
325
|
_log.debug("Some other element: {}".format(element))
|
320
326
|
|
321
327
|
processed_block_types = (
|
322
|
-
marko.block.ListItem,
|
323
328
|
marko.block.Heading,
|
324
329
|
marko.block.CodeBlock,
|
325
330
|
marko.block.FencedCode,
|
326
|
-
# marko.block.Paragraph,
|
327
331
|
marko.inline.RawText,
|
328
332
|
)
|
329
333
|
|
@@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
332
336
|
element, processed_block_types
|
333
337
|
):
|
334
338
|
for child in element.children:
|
335
|
-
self.
|
339
|
+
self._iterate_elements(
|
340
|
+
element=child,
|
341
|
+
depth=depth + 1,
|
342
|
+
doc=doc,
|
343
|
+
visited=visited,
|
344
|
+
parent_item=parent_item,
|
345
|
+
)
|
336
346
|
|
337
347
|
def is_valid(self) -> bool:
|
338
348
|
return self.valid
|
@@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
366
376
|
marko_parser = Markdown()
|
367
377
|
parsed_ast = marko_parser.parse(self.markdown)
|
368
378
|
# Start iterating from the root of the AST
|
369
|
-
self.
|
370
|
-
|
371
|
-
|
379
|
+
self._iterate_elements(
|
380
|
+
element=parsed_ast,
|
381
|
+
depth=0,
|
382
|
+
doc=doc,
|
383
|
+
parent_item=None,
|
384
|
+
visited=set(),
|
385
|
+
)
|
386
|
+
self._process_inline_text(None, doc) # handle last hanging inline text
|
387
|
+
self._close_table(doc=doc) # handle any last hanging table
|
372
388
|
|
373
389
|
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
374
390
|
if self._html_blocks > 0:
|
@@ -242,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
242
242
|
parts = label.split(":")
|
243
243
|
|
244
244
|
if len(parts) == 2:
|
245
|
-
return parts[0],
|
245
|
+
return parts[0], self.str_to_int(parts[1], None)
|
246
246
|
|
247
247
|
parts = self.split_text_and_number(label)
|
248
248
|
|
docling/cli/main.py
CHANGED
@@ -219,6 +219,13 @@ def convert(
|
|
219
219
|
bool,
|
220
220
|
typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
|
221
221
|
] = False,
|
222
|
+
enrich_picture_classes: Annotated[
|
223
|
+
bool,
|
224
|
+
typer.Option(
|
225
|
+
...,
|
226
|
+
help="Enable the picture classification enrichment model in the pipeline.",
|
227
|
+
),
|
228
|
+
] = False,
|
222
229
|
artifacts_path: Annotated[
|
223
230
|
Optional[Path],
|
224
231
|
typer.Option(..., help="If provided, the location of the model artifacts."),
|
@@ -375,6 +382,7 @@ def convert(
|
|
375
382
|
do_table_structure=True,
|
376
383
|
do_code_enrichment=enrich_code,
|
377
384
|
do_formula_enrichment=enrich_formula,
|
385
|
+
do_picture_classification=enrich_picture_classes,
|
378
386
|
document_timeout=document_timeout,
|
379
387
|
)
|
380
388
|
pipeline_options.table_structure_options.do_cell_matching = (
|
docling/cli/models.py
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
import logging
|
2
|
+
import warnings
|
3
|
+
from enum import Enum
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Annotated, Optional
|
6
|
+
|
7
|
+
import typer
|
8
|
+
from rich.console import Console
|
9
|
+
from rich.logging import RichHandler
|
10
|
+
|
11
|
+
from docling.datamodel.settings import settings
|
12
|
+
from docling.utils.model_downloader import download_models
|
13
|
+
|
14
|
+
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
15
|
+
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
16
|
+
|
17
|
+
console = Console()
|
18
|
+
err_console = Console(stderr=True)
|
19
|
+
|
20
|
+
|
21
|
+
app = typer.Typer(
|
22
|
+
name="Docling models helper",
|
23
|
+
no_args_is_help=True,
|
24
|
+
add_completion=False,
|
25
|
+
pretty_exceptions_enable=False,
|
26
|
+
)
|
27
|
+
|
28
|
+
|
29
|
+
class _AvailableModels(str, Enum):
|
30
|
+
LAYOUT = "layout"
|
31
|
+
TABLEFORMER = "tableformer"
|
32
|
+
CODE_FORMULA = "code_formula"
|
33
|
+
PICTURE_CLASSIFIER = "picture_classifier"
|
34
|
+
EASYOCR = "easyocr"
|
35
|
+
|
36
|
+
|
37
|
+
@app.command("download")
|
38
|
+
def download(
|
39
|
+
output_dir: Annotated[
|
40
|
+
Path,
|
41
|
+
typer.Option(
|
42
|
+
...,
|
43
|
+
"-o",
|
44
|
+
"--output-dir",
|
45
|
+
help="The directory where all the models are downloaded.",
|
46
|
+
),
|
47
|
+
] = (settings.cache_dir / "models"),
|
48
|
+
force: Annotated[
|
49
|
+
bool, typer.Option(..., help="If true, the download will be forced")
|
50
|
+
] = False,
|
51
|
+
models: Annotated[
|
52
|
+
Optional[list[_AvailableModels]],
|
53
|
+
typer.Argument(
|
54
|
+
help=f"Models to download (default behavior: all will be downloaded)",
|
55
|
+
),
|
56
|
+
] = None,
|
57
|
+
quiet: Annotated[
|
58
|
+
bool,
|
59
|
+
typer.Option(
|
60
|
+
...,
|
61
|
+
"-q",
|
62
|
+
"--quiet",
|
63
|
+
help="No extra output is generated, the CLI prints only the directory with the cached models.",
|
64
|
+
),
|
65
|
+
] = False,
|
66
|
+
):
|
67
|
+
if not quiet:
|
68
|
+
FORMAT = "%(message)s"
|
69
|
+
logging.basicConfig(
|
70
|
+
level=logging.INFO,
|
71
|
+
format="[blue]%(message)s[/blue]",
|
72
|
+
datefmt="[%X]",
|
73
|
+
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
|
74
|
+
)
|
75
|
+
to_download = models or [m for m in _AvailableModels]
|
76
|
+
output_dir = download_models(
|
77
|
+
output_dir=output_dir,
|
78
|
+
force=force,
|
79
|
+
progress=(not quiet),
|
80
|
+
with_layout=_AvailableModels.LAYOUT in to_download,
|
81
|
+
with_tableformer=_AvailableModels.TABLEFORMER in to_download,
|
82
|
+
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
|
83
|
+
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
|
84
|
+
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
85
|
+
)
|
86
|
+
|
87
|
+
if quiet:
|
88
|
+
typer.echo(output_dir)
|
89
|
+
else:
|
90
|
+
typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
|
91
|
+
|
92
|
+
console.print(
|
93
|
+
"\n",
|
94
|
+
"Docling can now be configured for running offline using the local artifacts.\n\n",
|
95
|
+
"Using the CLI:",
|
96
|
+
f"`docling --artifacts-path={output_dir} FILE`",
|
97
|
+
"\n",
|
98
|
+
"Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
|
99
|
+
)
|
100
|
+
|
101
|
+
|
102
|
+
click_app = typer.main.get_command(app)
|
103
|
+
|
104
|
+
if __name__ == "__main__":
|
105
|
+
app()
|
docling/cli/tools.py
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
import typer
|
2
|
+
|
3
|
+
from docling.cli.models import app as models_app
|
4
|
+
|
5
|
+
app = typer.Typer(
|
6
|
+
name="Docling helpers",
|
7
|
+
no_args_is_help=True,
|
8
|
+
add_completion=False,
|
9
|
+
pretty_exceptions_enable=False,
|
10
|
+
)
|
11
|
+
|
12
|
+
app.add_typer(models_app, name="models")
|
13
|
+
|
14
|
+
click_app = typer.main.get_command(app)
|
15
|
+
|
16
|
+
if __name__ == "__main__":
|
17
|
+
app()
|
docling/datamodel/settings.py
CHANGED
docling/models/base_model.py
CHANGED
@@ -6,6 +6,7 @@ from typing_extensions import TypeVar
|
|
6
6
|
|
7
7
|
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
|
8
8
|
from docling.datamodel.document import ConversionResult
|
9
|
+
from docling.datamodel.settings import settings
|
9
10
|
|
10
11
|
|
11
12
|
class BasePageModel(ABC):
|
@@ -21,6 +22,8 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
|
|
21
22
|
|
22
23
|
class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
|
23
24
|
|
25
|
+
elements_batch_size: int = settings.perf.elements_batch_size
|
26
|
+
|
24
27
|
@abstractmethod
|
25
28
|
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
26
29
|
pass
|
@@ -2,6 +2,7 @@ import re
|
|
2
2
|
from pathlib import Path
|
3
3
|
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
4
4
|
|
5
|
+
import numpy as np
|
5
6
|
from docling_core.types.doc import (
|
6
7
|
CodeItem,
|
7
8
|
DocItemLabel,
|
@@ -61,13 +62,15 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
61
62
|
Processes the given batch of elements and enriches them with predictions.
|
62
63
|
"""
|
63
64
|
|
65
|
+
_model_repo_folder = "ds4sd--CodeFormula"
|
66
|
+
elements_batch_size = 5
|
64
67
|
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
|
65
68
|
expansion_factor = 0.03
|
66
69
|
|
67
70
|
def __init__(
|
68
71
|
self,
|
69
72
|
enabled: bool,
|
70
|
-
artifacts_path: Optional[
|
73
|
+
artifacts_path: Optional[Path],
|
71
74
|
options: CodeFormulaModelOptions,
|
72
75
|
accelerator_options: AcceleratorOptions,
|
73
76
|
):
|
@@ -96,29 +99,32 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
96
99
|
)
|
97
100
|
|
98
101
|
if artifacts_path is None:
|
99
|
-
artifacts_path = self.
|
102
|
+
artifacts_path = self.download_models()
|
100
103
|
else:
|
101
|
-
artifacts_path =
|
104
|
+
artifacts_path = artifacts_path / self._model_repo_folder
|
102
105
|
|
103
106
|
self.code_formula_model = CodeFormulaPredictor(
|
104
|
-
artifacts_path=artifacts_path,
|
107
|
+
artifacts_path=str(artifacts_path),
|
105
108
|
device=device,
|
106
109
|
num_threads=accelerator_options.num_threads,
|
107
110
|
)
|
108
111
|
|
109
112
|
@staticmethod
|
110
|
-
def
|
111
|
-
local_dir: Optional[Path] = None,
|
113
|
+
def download_models(
|
114
|
+
local_dir: Optional[Path] = None,
|
115
|
+
force: bool = False,
|
116
|
+
progress: bool = False,
|
112
117
|
) -> Path:
|
113
118
|
from huggingface_hub import snapshot_download
|
114
119
|
from huggingface_hub.utils import disable_progress_bars
|
115
120
|
|
116
|
-
|
121
|
+
if not progress:
|
122
|
+
disable_progress_bars()
|
117
123
|
download_path = snapshot_download(
|
118
124
|
repo_id="ds4sd/CodeFormula",
|
119
125
|
force_download=force,
|
120
126
|
local_dir=local_dir,
|
121
|
-
revision="v1.0.
|
127
|
+
revision="v1.0.1",
|
122
128
|
)
|
123
129
|
|
124
130
|
return Path(download_path)
|
@@ -226,7 +232,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
226
232
|
return
|
227
233
|
|
228
234
|
labels: List[str] = []
|
229
|
-
images: List[Image.Image] = []
|
235
|
+
images: List[Union[Image.Image, np.ndarray]] = []
|
230
236
|
elements: List[TextItem] = []
|
231
237
|
for el in element_batch:
|
232
238
|
assert isinstance(el.item, TextItem)
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
3
3
|
|
4
|
+
import numpy as np
|
4
5
|
from docling_core.types.doc import (
|
5
6
|
DoclingDocument,
|
6
7
|
NodeItem,
|
@@ -55,12 +56,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
55
56
|
Processes a batch of elements and adds classification annotations.
|
56
57
|
"""
|
57
58
|
|
59
|
+
_model_repo_folder = "ds4sd--DocumentFigureClassifier"
|
58
60
|
images_scale = 2
|
59
61
|
|
60
62
|
def __init__(
|
61
63
|
self,
|
62
64
|
enabled: bool,
|
63
|
-
artifacts_path: Optional[
|
65
|
+
artifacts_path: Optional[Path],
|
64
66
|
options: DocumentPictureClassifierOptions,
|
65
67
|
accelerator_options: AcceleratorOptions,
|
66
68
|
):
|
@@ -88,24 +90,25 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
88
90
|
)
|
89
91
|
|
90
92
|
if artifacts_path is None:
|
91
|
-
artifacts_path = self.
|
93
|
+
artifacts_path = self.download_models()
|
92
94
|
else:
|
93
|
-
artifacts_path =
|
95
|
+
artifacts_path = artifacts_path / self._model_repo_folder
|
94
96
|
|
95
97
|
self.document_picture_classifier = DocumentFigureClassifierPredictor(
|
96
|
-
artifacts_path=artifacts_path,
|
98
|
+
artifacts_path=str(artifacts_path),
|
97
99
|
device=device,
|
98
100
|
num_threads=accelerator_options.num_threads,
|
99
101
|
)
|
100
102
|
|
101
103
|
@staticmethod
|
102
|
-
def
|
103
|
-
local_dir: Optional[Path] = None, force: bool = False
|
104
|
+
def download_models(
|
105
|
+
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
|
104
106
|
) -> Path:
|
105
107
|
from huggingface_hub import snapshot_download
|
106
108
|
from huggingface_hub.utils import disable_progress_bars
|
107
109
|
|
108
|
-
|
110
|
+
if not progress:
|
111
|
+
disable_progress_bars()
|
109
112
|
download_path = snapshot_download(
|
110
113
|
repo_id="ds4sd/DocumentFigureClassifier",
|
111
114
|
force_download=force,
|
@@ -159,7 +162,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
159
162
|
yield element
|
160
163
|
return
|
161
164
|
|
162
|
-
images: List[Image.Image] = []
|
165
|
+
images: List[Union[Image.Image, np.ndarray]] = []
|
163
166
|
elements: List[PictureItem] = []
|
164
167
|
for el in element_batch:
|
165
168
|
assert isinstance(el, PictureItem)
|
docling/models/easyocr_model.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
import logging
|
2
2
|
import warnings
|
3
|
-
|
3
|
+
import zipfile
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Iterable, List, Optional
|
4
6
|
|
7
|
+
import httpx
|
5
8
|
import numpy
|
6
9
|
import torch
|
7
10
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
@@ -17,14 +20,18 @@ from docling.datamodel.settings import settings
|
|
17
20
|
from docling.models.base_ocr_model import BaseOcrModel
|
18
21
|
from docling.utils.accelerator_utils import decide_device
|
19
22
|
from docling.utils.profiling import TimeRecorder
|
23
|
+
from docling.utils.utils import download_url_with_progress
|
20
24
|
|
21
25
|
_log = logging.getLogger(__name__)
|
22
26
|
|
23
27
|
|
24
28
|
class EasyOcrModel(BaseOcrModel):
|
29
|
+
_model_repo_folder = "EasyOcr"
|
30
|
+
|
25
31
|
def __init__(
|
26
32
|
self,
|
27
33
|
enabled: bool,
|
34
|
+
artifacts_path: Optional[Path],
|
28
35
|
options: EasyOcrOptions,
|
29
36
|
accelerator_options: AcceleratorOptions,
|
30
37
|
):
|
@@ -62,15 +69,55 @@ class EasyOcrModel(BaseOcrModel):
|
|
62
69
|
)
|
63
70
|
use_gpu = self.options.use_gpu
|
64
71
|
|
72
|
+
download_enabled = self.options.download_enabled
|
73
|
+
model_storage_directory = self.options.model_storage_directory
|
74
|
+
if artifacts_path is not None and model_storage_directory is None:
|
75
|
+
download_enabled = False
|
76
|
+
model_storage_directory = str(artifacts_path / self._model_repo_folder)
|
77
|
+
|
65
78
|
self.reader = easyocr.Reader(
|
66
79
|
lang_list=self.options.lang,
|
67
80
|
gpu=use_gpu,
|
68
|
-
model_storage_directory=
|
81
|
+
model_storage_directory=model_storage_directory,
|
69
82
|
recog_network=self.options.recog_network,
|
70
|
-
download_enabled=
|
83
|
+
download_enabled=download_enabled,
|
71
84
|
verbose=False,
|
72
85
|
)
|
73
86
|
|
87
|
+
@staticmethod
|
88
|
+
def download_models(
|
89
|
+
detection_models: List[str] = ["craft"],
|
90
|
+
recognition_models: List[str] = ["english_g2", "latin_g2"],
|
91
|
+
local_dir: Optional[Path] = None,
|
92
|
+
force: bool = False,
|
93
|
+
progress: bool = False,
|
94
|
+
) -> Path:
|
95
|
+
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
|
96
|
+
from easyocr.config import detection_models as det_models_dict
|
97
|
+
from easyocr.config import recognition_models as rec_models_dict
|
98
|
+
|
99
|
+
if local_dir is None:
|
100
|
+
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
|
101
|
+
|
102
|
+
local_dir.mkdir(parents=True, exist_ok=True)
|
103
|
+
|
104
|
+
# Collect models to download
|
105
|
+
download_list = []
|
106
|
+
for model_name in detection_models:
|
107
|
+
if model_name in det_models_dict:
|
108
|
+
download_list.append(det_models_dict[model_name])
|
109
|
+
for model_name in recognition_models:
|
110
|
+
if model_name in rec_models_dict["gen2"]:
|
111
|
+
download_list.append(rec_models_dict["gen2"][model_name])
|
112
|
+
|
113
|
+
# Download models
|
114
|
+
for model_details in download_list:
|
115
|
+
buf = download_url_with_progress(model_details["url"], progress=progress)
|
116
|
+
with zipfile.ZipFile(buf, "r") as zip_ref:
|
117
|
+
zip_ref.extractall(local_dir)
|
118
|
+
|
119
|
+
return local_dir
|
120
|
+
|
74
121
|
def __call__(
|
75
122
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
76
123
|
) -> Iterable[Page]:
|
docling/models/layout_model.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
import copy
|
2
2
|
import logging
|
3
|
+
import warnings
|
3
4
|
from pathlib import Path
|
4
|
-
from typing import Iterable
|
5
|
+
from typing import Iterable, Optional, Union
|
5
6
|
|
6
7
|
from docling_core.types.doc import DocItemLabel
|
7
8
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
@@ -21,6 +22,8 @@ _log = logging.getLogger(__name__)
|
|
21
22
|
|
22
23
|
|
23
24
|
class LayoutModel(BasePageModel):
|
25
|
+
_model_repo_folder = "ds4sd--docling-models"
|
26
|
+
_model_path = "model_artifacts/layout"
|
24
27
|
|
25
28
|
TEXT_ELEM_LABELS = [
|
26
29
|
DocItemLabel.TEXT,
|
@@ -42,15 +45,56 @@ class LayoutModel(BasePageModel):
|
|
42
45
|
FORMULA_LABEL = DocItemLabel.FORMULA
|
43
46
|
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
|
44
47
|
|
45
|
-
def __init__(
|
48
|
+
def __init__(
|
49
|
+
self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
|
50
|
+
):
|
46
51
|
device = decide_device(accelerator_options.device)
|
47
52
|
|
53
|
+
if artifacts_path is None:
|
54
|
+
artifacts_path = self.download_models() / self._model_path
|
55
|
+
else:
|
56
|
+
# will become the default in the future
|
57
|
+
if (artifacts_path / self._model_repo_folder).exists():
|
58
|
+
artifacts_path = (
|
59
|
+
artifacts_path / self._model_repo_folder / self._model_path
|
60
|
+
)
|
61
|
+
elif (artifacts_path / self._model_path).exists():
|
62
|
+
warnings.warn(
|
63
|
+
"The usage of artifacts_path containing directly "
|
64
|
+
f"{self._model_path} is deprecated. Please point "
|
65
|
+
"the artifacts_path to the parent containing "
|
66
|
+
f"the {self._model_repo_folder} folder.",
|
67
|
+
DeprecationWarning,
|
68
|
+
stacklevel=3,
|
69
|
+
)
|
70
|
+
artifacts_path = artifacts_path / self._model_path
|
71
|
+
|
48
72
|
self.layout_predictor = LayoutPredictor(
|
49
73
|
artifact_path=str(artifacts_path),
|
50
74
|
device=device,
|
51
75
|
num_threads=accelerator_options.num_threads,
|
52
76
|
)
|
53
77
|
|
78
|
+
@staticmethod
|
79
|
+
def download_models(
|
80
|
+
local_dir: Optional[Path] = None,
|
81
|
+
force: bool = False,
|
82
|
+
progress: bool = False,
|
83
|
+
) -> Path:
|
84
|
+
from huggingface_hub import snapshot_download
|
85
|
+
from huggingface_hub.utils import disable_progress_bars
|
86
|
+
|
87
|
+
if not progress:
|
88
|
+
disable_progress_bars()
|
89
|
+
download_path = snapshot_download(
|
90
|
+
repo_id="ds4sd/docling-models",
|
91
|
+
force_download=force,
|
92
|
+
local_dir=local_dir,
|
93
|
+
revision="v2.1.0",
|
94
|
+
)
|
95
|
+
|
96
|
+
return Path(download_path)
|
97
|
+
|
54
98
|
def draw_clusters_and_cells_side_by_side(
|
55
99
|
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
|
56
100
|
):
|
@@ -106,10 +150,12 @@ class LayoutModel(BasePageModel):
|
|
106
150
|
else:
|
107
151
|
with TimeRecorder(conv_res, "layout"):
|
108
152
|
assert page.size is not None
|
153
|
+
page_image = page.get_image(scale=1.0)
|
154
|
+
assert page_image is not None
|
109
155
|
|
110
156
|
clusters = []
|
111
157
|
for ix, pred_item in enumerate(
|
112
|
-
self.layout_predictor.predict(
|
158
|
+
self.layout_predictor.predict(page_image)
|
113
159
|
):
|
114
160
|
label = DocItemLabel(
|
115
161
|
pred_item["label"]
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import copy
|
2
|
+
import warnings
|
2
3
|
from pathlib import Path
|
3
|
-
from typing import Iterable
|
4
|
+
from typing import Iterable, Optional, Union
|
4
5
|
|
5
6
|
import numpy
|
6
7
|
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
@@ -22,10 +23,13 @@ from docling.utils.profiling import TimeRecorder
|
|
22
23
|
|
23
24
|
|
24
25
|
class TableStructureModel(BasePageModel):
|
26
|
+
_model_repo_folder = "ds4sd--docling-models"
|
27
|
+
_model_path = "model_artifacts/tableformer"
|
28
|
+
|
25
29
|
def __init__(
|
26
30
|
self,
|
27
31
|
enabled: bool,
|
28
|
-
artifacts_path: Path,
|
32
|
+
artifacts_path: Optional[Path],
|
29
33
|
options: TableStructureOptions,
|
30
34
|
accelerator_options: AcceleratorOptions,
|
31
35
|
):
|
@@ -35,6 +39,26 @@ class TableStructureModel(BasePageModel):
|
|
35
39
|
|
36
40
|
self.enabled = enabled
|
37
41
|
if self.enabled:
|
42
|
+
|
43
|
+
if artifacts_path is None:
|
44
|
+
artifacts_path = self.download_models() / self._model_path
|
45
|
+
else:
|
46
|
+
# will become the default in the future
|
47
|
+
if (artifacts_path / self._model_repo_folder).exists():
|
48
|
+
artifacts_path = (
|
49
|
+
artifacts_path / self._model_repo_folder / self._model_path
|
50
|
+
)
|
51
|
+
elif (artifacts_path / self._model_path).exists():
|
52
|
+
warnings.warn(
|
53
|
+
"The usage of artifacts_path containing directly "
|
54
|
+
f"{self._model_path} is deprecated. Please point "
|
55
|
+
"the artifacts_path to the parent containing "
|
56
|
+
f"the {self._model_repo_folder} folder.",
|
57
|
+
DeprecationWarning,
|
58
|
+
stacklevel=3,
|
59
|
+
)
|
60
|
+
artifacts_path = artifacts_path / self._model_path
|
61
|
+
|
38
62
|
if self.mode == TableFormerMode.ACCURATE:
|
39
63
|
artifacts_path = artifacts_path / "accurate"
|
40
64
|
else:
|
@@ -58,6 +82,24 @@ class TableStructureModel(BasePageModel):
|
|
58
82
|
)
|
59
83
|
self.scale = 2.0 # Scale up table input images to 144 dpi
|
60
84
|
|
85
|
+
@staticmethod
|
86
|
+
def download_models(
|
87
|
+
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
|
88
|
+
) -> Path:
|
89
|
+
from huggingface_hub import snapshot_download
|
90
|
+
from huggingface_hub.utils import disable_progress_bars
|
91
|
+
|
92
|
+
if not progress:
|
93
|
+
disable_progress_bars()
|
94
|
+
download_path = snapshot_download(
|
95
|
+
repo_id="ds4sd/docling-models",
|
96
|
+
force_download=force,
|
97
|
+
local_dir=local_dir,
|
98
|
+
revision="v2.1.0",
|
99
|
+
)
|
100
|
+
|
101
|
+
return Path(download_path)
|
102
|
+
|
61
103
|
def draw_table_and_cells(
|
62
104
|
self,
|
63
105
|
conv_res: ConversionResult,
|
@@ -79,7 +79,7 @@ class BasePipeline(ABC):
|
|
79
79
|
for model in self.enrichment_pipe:
|
80
80
|
for element_batch in chunkify(
|
81
81
|
_prepare_elements(conv_res, model),
|
82
|
-
|
82
|
+
model.elements_batch_size,
|
83
83
|
):
|
84
84
|
for element in model(
|
85
85
|
doc=conv_res.document, element_batch=element_batch
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import logging
|
2
2
|
import sys
|
3
|
+
import warnings
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Optional
|
5
6
|
|
@@ -17,6 +18,7 @@ from docling.datamodel.pipeline_options import (
|
|
17
18
|
TesseractCliOcrOptions,
|
18
19
|
TesseractOcrOptions,
|
19
20
|
)
|
21
|
+
from docling.datamodel.settings import settings
|
20
22
|
from docling.models.base_ocr_model import BaseOcrModel
|
21
23
|
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
22
24
|
from docling.models.document_picture_classifier import (
|
@@ -37,23 +39,23 @@ from docling.models.table_structure_model import TableStructureModel
|
|
37
39
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
38
40
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
39
41
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
42
|
+
from docling.utils.model_downloader import download_models
|
40
43
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
41
44
|
|
42
45
|
_log = logging.getLogger(__name__)
|
43
46
|
|
44
47
|
|
45
48
|
class StandardPdfPipeline(PaginatedPipeline):
|
46
|
-
_layout_model_path =
|
47
|
-
_table_model_path =
|
49
|
+
_layout_model_path = LayoutModel._model_path
|
50
|
+
_table_model_path = TableStructureModel._model_path
|
48
51
|
|
49
52
|
def __init__(self, pipeline_options: PdfPipelineOptions):
|
50
53
|
super().__init__(pipeline_options)
|
51
54
|
self.pipeline_options: PdfPipelineOptions
|
52
55
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
self.artifacts_path = Path(pipeline_options.artifacts_path)
|
56
|
+
artifacts_path: Optional[Path] = None
|
57
|
+
if pipeline_options.artifacts_path is not None:
|
58
|
+
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
57
59
|
|
58
60
|
self.keep_images = (
|
59
61
|
self.pipeline_options.generate_page_images
|
@@ -63,7 +65,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
63
65
|
|
64
66
|
self.glm_model = GlmModel(options=GlmOptions())
|
65
67
|
|
66
|
-
if (ocr_model := self.get_ocr_model()) is None:
|
68
|
+
if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
|
67
69
|
raise RuntimeError(
|
68
70
|
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
69
71
|
)
|
@@ -79,15 +81,13 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
79
81
|
ocr_model,
|
80
82
|
# Layout model
|
81
83
|
LayoutModel(
|
82
|
-
artifacts_path=
|
83
|
-
/ StandardPdfPipeline._layout_model_path,
|
84
|
+
artifacts_path=artifacts_path,
|
84
85
|
accelerator_options=pipeline_options.accelerator_options,
|
85
86
|
),
|
86
87
|
# Table structure model
|
87
88
|
TableStructureModel(
|
88
89
|
enabled=pipeline_options.do_table_structure,
|
89
|
-
artifacts_path=
|
90
|
-
/ StandardPdfPipeline._table_model_path,
|
90
|
+
artifacts_path=artifacts_path,
|
91
91
|
options=pipeline_options.table_structure_options,
|
92
92
|
accelerator_options=pipeline_options.accelerator_options,
|
93
93
|
),
|
@@ -101,7 +101,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
101
101
|
CodeFormulaModel(
|
102
102
|
enabled=pipeline_options.do_code_enrichment
|
103
103
|
or pipeline_options.do_formula_enrichment,
|
104
|
-
artifacts_path=
|
104
|
+
artifacts_path=artifacts_path,
|
105
105
|
options=CodeFormulaModelOptions(
|
106
106
|
do_code_enrichment=pipeline_options.do_code_enrichment,
|
107
107
|
do_formula_enrichment=pipeline_options.do_formula_enrichment,
|
@@ -111,7 +111,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
111
111
|
# Document Picture Classifier
|
112
112
|
DocumentPictureClassifier(
|
113
113
|
enabled=pipeline_options.do_picture_classification,
|
114
|
-
artifacts_path=
|
114
|
+
artifacts_path=artifacts_path,
|
115
115
|
options=DocumentPictureClassifierOptions(),
|
116
116
|
accelerator_options=pipeline_options.accelerator_options,
|
117
117
|
),
|
@@ -127,23 +127,24 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
127
127
|
def download_models_hf(
|
128
128
|
local_dir: Optional[Path] = None, force: bool = False
|
129
129
|
) -> Path:
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
force_download=force,
|
137
|
-
local_dir=local_dir,
|
138
|
-
revision="v2.1.0",
|
130
|
+
warnings.warn(
|
131
|
+
"The usage of StandardPdfPipeline.download_models_hf() is deprecated "
|
132
|
+
"use instead the utility `docling-tools models download`, or "
|
133
|
+
"the upstream method docling.utils.models_downloader.download_all()",
|
134
|
+
DeprecationWarning,
|
135
|
+
stacklevel=3,
|
139
136
|
)
|
140
137
|
|
141
|
-
|
138
|
+
output_dir = download_models(output_dir=local_dir, force=force, progress=False)
|
139
|
+
return output_dir
|
142
140
|
|
143
|
-
def get_ocr_model(
|
141
|
+
def get_ocr_model(
|
142
|
+
self, artifacts_path: Optional[Path] = None
|
143
|
+
) -> Optional[BaseOcrModel]:
|
144
144
|
if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
|
145
145
|
return EasyOcrModel(
|
146
146
|
enabled=self.pipeline_options.do_ocr,
|
147
|
+
artifacts_path=artifacts_path,
|
147
148
|
options=self.pipeline_options.ocr_options,
|
148
149
|
accelerator_options=self.pipeline_options.accelerator_options,
|
149
150
|
)
|
@@ -0,0 +1,72 @@
|
|
1
|
+
import logging
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from docling.datamodel.settings import settings
|
6
|
+
from docling.models.code_formula_model import CodeFormulaModel
|
7
|
+
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
8
|
+
from docling.models.easyocr_model import EasyOcrModel
|
9
|
+
from docling.models.layout_model import LayoutModel
|
10
|
+
from docling.models.table_structure_model import TableStructureModel
|
11
|
+
|
12
|
+
_log = logging.getLogger(__name__)
|
13
|
+
|
14
|
+
|
15
|
+
def download_models(
|
16
|
+
output_dir: Optional[Path] = None,
|
17
|
+
*,
|
18
|
+
force: bool = False,
|
19
|
+
progress: bool = False,
|
20
|
+
with_layout: bool = True,
|
21
|
+
with_tableformer: bool = True,
|
22
|
+
with_code_formula: bool = True,
|
23
|
+
with_picture_classifier: bool = True,
|
24
|
+
with_easyocr: bool = True,
|
25
|
+
):
|
26
|
+
if output_dir is None:
|
27
|
+
output_dir = settings.cache_dir / "models"
|
28
|
+
|
29
|
+
# Make sure the folder exists
|
30
|
+
output_dir.mkdir(exist_ok=True, parents=True)
|
31
|
+
|
32
|
+
if with_layout:
|
33
|
+
_log.info(f"Downloading layout model...")
|
34
|
+
LayoutModel.download_models(
|
35
|
+
local_dir=output_dir / LayoutModel._model_repo_folder,
|
36
|
+
force=force,
|
37
|
+
progress=progress,
|
38
|
+
)
|
39
|
+
|
40
|
+
if with_tableformer:
|
41
|
+
_log.info(f"Downloading tableformer model...")
|
42
|
+
TableStructureModel.download_models(
|
43
|
+
local_dir=output_dir / TableStructureModel._model_repo_folder,
|
44
|
+
force=force,
|
45
|
+
progress=progress,
|
46
|
+
)
|
47
|
+
|
48
|
+
if with_picture_classifier:
|
49
|
+
_log.info(f"Downloading picture classifier model...")
|
50
|
+
DocumentPictureClassifier.download_models(
|
51
|
+
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
|
52
|
+
force=force,
|
53
|
+
progress=progress,
|
54
|
+
)
|
55
|
+
|
56
|
+
if with_code_formula:
|
57
|
+
_log.info(f"Downloading code formula model...")
|
58
|
+
CodeFormulaModel.download_models(
|
59
|
+
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
60
|
+
force=force,
|
61
|
+
progress=progress,
|
62
|
+
)
|
63
|
+
|
64
|
+
if with_easyocr:
|
65
|
+
_log.info(f"Downloading easyocr models...")
|
66
|
+
EasyOcrModel.download_models(
|
67
|
+
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
68
|
+
force=force,
|
69
|
+
progress=progress,
|
70
|
+
)
|
71
|
+
|
72
|
+
return output_dir
|
docling/utils/utils.py
CHANGED
@@ -4,6 +4,9 @@ from itertools import islice
|
|
4
4
|
from pathlib import Path
|
5
5
|
from typing import List, Union
|
6
6
|
|
7
|
+
import requests
|
8
|
+
from tqdm import tqdm
|
9
|
+
|
7
10
|
|
8
11
|
def chunkify(iterator, chunk_size):
|
9
12
|
"""Yield successive chunks of chunk_size from the iterable."""
|
@@ -39,3 +42,24 @@ def create_hash(string: str):
|
|
39
42
|
hasher.update(string.encode("utf-8"))
|
40
43
|
|
41
44
|
return hasher.hexdigest()
|
45
|
+
|
46
|
+
|
47
|
+
def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
|
48
|
+
buf = BytesIO()
|
49
|
+
with requests.get(url, stream=True, allow_redirects=True) as response:
|
50
|
+
total_size = int(response.headers.get("content-length", 0))
|
51
|
+
progress_bar = tqdm(
|
52
|
+
total=total_size,
|
53
|
+
unit="B",
|
54
|
+
unit_scale=True,
|
55
|
+
unit_divisor=1024,
|
56
|
+
disable=(not progress),
|
57
|
+
)
|
58
|
+
|
59
|
+
for chunk in response.iter_content(10 * 1024):
|
60
|
+
buf.write(chunk)
|
61
|
+
progress_bar.update(len(chunk))
|
62
|
+
progress_bar.close()
|
63
|
+
|
64
|
+
buf.seek(0)
|
65
|
+
return buf
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.19.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -24,12 +24,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
24
|
Provides-Extra: ocrmac
|
25
25
|
Provides-Extra: rapidocr
|
26
26
|
Provides-Extra: tesserocr
|
27
|
-
Requires-Dist: beautifulsoup4 (>=4.12.3,<
|
27
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
|
28
28
|
Requires-Dist: certifi (>=2024.7.4)
|
29
29
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
30
|
-
Requires-Dist: docling-core[chunking] (>=2.17.
|
30
|
+
Requires-Dist: docling-core[chunking] (>=2.17.2,<3.0.0)
|
31
31
|
Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
|
32
|
-
Requires-Dist: docling-parse (>=3.
|
32
|
+
Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
|
33
33
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
34
34
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
35
35
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -52,6 +52,7 @@ Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
|
52
52
|
Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
|
53
53
|
Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
|
54
54
|
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
55
|
+
Requires-Dist: tqdm (>=4.65.0,<5.0.0)
|
55
56
|
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
56
57
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
57
58
|
Description-Content-Type: text/markdown
|
@@ -7,10 +7,10 @@ docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAk
|
|
7
7
|
docling/backend/html_backend.py,sha256=YTPLZiEEEuGaP6G62skK3wXJ0KftuqBCl8erNXeJyoE,15893
|
8
8
|
docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
9
|
docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
|
10
|
-
docling/backend/md_backend.py,sha256=
|
10
|
+
docling/backend/md_backend.py,sha256=NaVfcnEH-5bwVovjn76EobF6B6Wm8AhaTZ4E8k0TUPo,16826
|
11
11
|
docling/backend/msexcel_backend.py,sha256=lyJc4ShJGAN2ZfNTTuhdYTF-44cZsGyn_8Djstp3IEU,12700
|
12
12
|
docling/backend/mspowerpoint_backend.py,sha256=esAyaaQe17BQFweGAGJHvImKETefY0BpvfpUSECC49w,16424
|
13
|
-
docling/backend/msword_backend.py,sha256=
|
13
|
+
docling/backend/msword_backend.py,sha256=V4miLIcOH8DDlSCm25F_DALBW60Uf9JoSS0TB4yrQBw,20591
|
14
14
|
docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
|
15
15
|
docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
|
16
16
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -18,45 +18,48 @@ docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-
|
|
18
18
|
docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
|
19
19
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
20
20
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
docling/cli/main.py,sha256=
|
21
|
+
docling/cli/main.py,sha256=qShZI1f7WWn5T16YtFTeYY1CUucNjyGefIekCWvkAqc,16366
|
22
|
+
docling/cli/models.py,sha256=cjP13QZfgHZWPVJw3kJvSszJdDrRVWx-sJipZRfHEuQ,3102
|
23
|
+
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
22
24
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
25
|
docling/datamodel/base_models.py,sha256=vewP1X99qfAwiUsiC2m8CBDGiQPsGyp_WkKJHYPoYn4,7026
|
24
26
|
docling/datamodel/document.py,sha256=HkmvQKW3QSx3tAqPTnXiJbD_y1EVwR-WE3n6Gq8g1NY,13428
|
25
27
|
docling/datamodel/pipeline_options.py,sha256=f9-VQFgOdahyclGQgH_T8ZYBopkWsF_fbWbxo39ux3g,7888
|
26
|
-
docling/datamodel/settings.py,sha256=
|
28
|
+
docling/datamodel/settings.py,sha256=pJi9OBqZQhsNi7RwJWQFRDKGhm3u679iN76psA3VtsY,1817
|
27
29
|
docling/document_converter.py,sha256=qaldb7Thqk59RdE-RTGtj1M7l5UzaBdnxIvGoQ7lTeo,12876
|
28
30
|
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
29
31
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
|
-
docling/models/base_model.py,sha256=
|
32
|
+
docling/models/base_model.py,sha256=IIf_PA933bdwHst3g_MOC4oiYQcSCIVOnxnCnN1NxEQ,2681
|
31
33
|
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
32
|
-
docling/models/code_formula_model.py,sha256=
|
33
|
-
docling/models/document_picture_classifier.py,sha256=
|
34
|
+
docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
|
35
|
+
docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
|
34
36
|
docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
|
35
|
-
docling/models/easyocr_model.py,sha256=
|
36
|
-
docling/models/layout_model.py,sha256=
|
37
|
+
docling/models/easyocr_model.py,sha256=dDy0iaR4KUrq7eFIQclMqUYap1B06PG4nC6RMlGYhSw,6886
|
38
|
+
docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
|
37
39
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
38
40
|
docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
|
39
41
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
40
42
|
docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
|
41
|
-
docling/models/table_structure_model.py,sha256=
|
43
|
+
docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
|
42
44
|
docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
|
43
45
|
docling/models/tesseract_ocr_model.py,sha256=BN85u-4a-xzUY7Iw21Ib8L8kx4mgbDGiUtxBelLiJm8,8513
|
44
46
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
45
|
-
docling/pipeline/base_pipeline.py,sha256=
|
47
|
+
docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
|
46
48
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
47
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
49
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=xVGLYmh677hKBSRCoHYAVn7drmowba2QGI8f-eEC5gs,10624
|
48
50
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
49
51
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
52
|
docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
|
51
53
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
52
54
|
docling/utils/glm_utils.py,sha256=uyCoFTX9FbS1Ke0aSlkdzGLUt08dZfkgriWadkyLiiA,11856
|
53
55
|
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
56
|
+
docling/utils/model_downloader.py,sha256=5jChSE88byGj7LvGNnB01qBw6n9ODJjnAS66PobRSJc,2267
|
54
57
|
docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
|
55
58
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
56
|
-
docling/utils/utils.py,sha256=
|
59
|
+
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
57
60
|
docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
|
58
|
-
docling-2.
|
59
|
-
docling-2.
|
60
|
-
docling-2.
|
61
|
-
docling-2.
|
62
|
-
docling-2.
|
61
|
+
docling-2.19.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
62
|
+
docling-2.19.0.dist-info/METADATA,sha256=deXdwXb0i_n3pyEDbVGNQNw4APYoUVtXnkHmC-frXWI,8442
|
63
|
+
docling-2.19.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
64
|
+
docling-2.19.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
65
|
+
docling-2.19.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|