docling 2.18.0__tar.gz → 2.19.0__tar.gz
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- {docling-2.18.0 → docling-2.19.0}/PKG-INFO +5 -4
- {docling-2.18.0 → docling-2.19.0}/docling/backend/md_backend.py +62 -46
- {docling-2.18.0 → docling-2.19.0}/docling/backend/msword_backend.py +1 -1
- {docling-2.18.0 → docling-2.19.0}/docling/cli/main.py +8 -0
- docling-2.19.0/docling/cli/models.py +105 -0
- docling-2.19.0/docling/cli/tools.py +17 -0
- {docling-2.18.0 → docling-2.19.0}/docling/datamodel/settings.py +2 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/base_model.py +3 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/code_formula_model.py +15 -9
- {docling-2.18.0 → docling-2.19.0}/docling/models/document_picture_classifier.py +11 -8
- {docling-2.18.0 → docling-2.19.0}/docling/models/easyocr_model.py +50 -3
- {docling-2.18.0 → docling-2.19.0}/docling/models/layout_model.py +49 -3
- {docling-2.18.0 → docling-2.19.0}/docling/models/table_structure_model.py +44 -2
- {docling-2.18.0 → docling-2.19.0}/docling/pipeline/base_pipeline.py +1 -1
- {docling-2.18.0 → docling-2.19.0}/docling/pipeline/standard_pdf_pipeline.py +25 -24
- docling-2.19.0/docling/utils/model_downloader.py +72 -0
- {docling-2.18.0 → docling-2.19.0}/docling/utils/utils.py +24 -0
- {docling-2.18.0 → docling-2.19.0}/pyproject.toml +7 -4
- {docling-2.18.0 → docling-2.19.0}/LICENSE +0 -0
- {docling-2.18.0 → docling-2.19.0}/README.md +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/__init__.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/__init__.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/html_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/xml/pubmed_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/chunking/__init__.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/cli/__init__.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/datamodel/document.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/document_converter.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/exceptions.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/__init__.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/ds_glm_model.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/py.typed +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/utils/__init__.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/utils/export.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/utils/profiling.py +0 -0
- {docling-2.18.0 → docling-2.19.0}/docling/utils/visualization.py +0 -0
{docling-2.18.0 → docling-2.19.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.18.0
+Version: 2.19.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -24,12 +24,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: ocrmac
 Provides-Extra: rapidocr
 Provides-Extra: tesserocr
-Requires-Dist: beautifulsoup4 (>=4.12.3,<
+Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.17.
+Requires-Dist: docling-core[chunking] (>=2.17.2,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
-Requires-Dist: docling-parse (>=3.
+Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -52,6 +52,7 @@ Requires-Dist: rtree (>=1.3.0,<2.0.0)
 Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
 Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
+Requires-Dist: tqdm (>=4.65.0,<5.0.0)
 Requires-Dist: typer (>=0.12.5,<0.13.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
 Description-Content-Type: text/markdown
{docling-2.18.0 → docling-2.19.0}/docling/backend/md_backend.py
@@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
 
 
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-    def
+    def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"
 
@@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # very long sequences of underscores will lead to unnecessary long processing times.
                 # In any proper Markdown files, underscores have to be escaped,
                 # otherwise they represent emphasis (bold or italic)
-                self.markdown = self.
+                self.markdown = self._shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
                 with open(self.path_or_stream, "r", encoding="utf-8") as f:
                     md_content = f.read()
@@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # very long sequences of underscores will lead to unnecessary long processing times.
                 # In any proper Markdown files, underscores have to be escaped,
                 # otherwise they represent emphasis (bold or italic)
-                self.markdown = self.
+                self.markdown = self._shorten_underscore_sequences(md_content)
             self.valid = True
 
             _log.debug(self.markdown)
@@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             ) from e
         return
 
-    def
+    def _close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
@@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             doc.add_table(data=table_data)
         return
 
-    def
-        self,
+    def _process_inline_text(
+        self, parent_item: Optional[NodeItem], doc: DoclingDocument
     ):
         txt = " ".join(self.inline_texts)
         if len(txt) > 0:
             doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
-                parent=
+                parent=parent_item,
                 text=txt,
             )
         self.inline_texts = []
 
-    def
+    def _iterate_elements(
         self,
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
-
+        visited: Set[marko.element.Element],
+        parent_item: Optional[NodeItem] = None,
     ):
+
+        if element in visited:
+            return
+
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading) and len(element.children) > 0:
-            self.
-            self.
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(
                 f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
@@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             traverse(element)
             snippet_text = "".join(strings)
             if len(snippet_text) > 0:
-
-                    label=doc_label, parent=
+                parent_item = doc.add_text(
+                    label=doc_label, parent=parent_item, text=snippet_text
                 )
 
         elif isinstance(element, marko.block.List):
@@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     has_non_empty_list_items = True
                     break
 
-            self.
-            self.
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
-
-                    label=label, name=f"list", parent=
+                parent_item = doc.add_group(
+                    label=label, name=f"list", parent=parent_item
                 )
 
         elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
-            self.
-            self.
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(" - List item")
 
-
+            first_child = element.children[0]
+            snippet_text = str(first_child.children[0].children)  # type: ignore
             is_numbered = False
             if (
-
-                and isinstance(
-                and
+                parent_item is not None
+                and isinstance(parent_item, DocItem)
+                and parent_item.label == GroupLabel.ORDERED_LIST
             ):
                 is_numbered = True
             doc.add_list_item(
-                enumerated=is_numbered, parent=
+                enumerated=is_numbered, parent=parent_item, text=snippet_text
             )
+            visited.add(first_child)
 
         elif isinstance(element, marko.inline.Image):
-            self.
-            self.
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
 
             fig_caption: Optional[TextItem] = None
@@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     label=DocItemLabel.CAPTION, text=element.title
                 )
 
-            doc.add_picture(parent=
+            doc.add_picture(parent=parent_item, caption=fig_caption)
 
         elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
-            self.
+            self._process_inline_text(parent_item, doc)
 
         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 else:
                     self.md_table_buffer.append(snippet_text)
             else:
-                self.
-                self.in_table = False
+                self._close_table(doc)
                 # most likely just inline text
                 self.inline_texts.append(str(element.children))
 
         elif isinstance(element, marko.inline.CodeSpan):
-            self.
-            self.
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_code(parent=
+            doc.add_code(parent=parent_item, text=snippet_text)
 
         elif (
             isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
@@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             and isinstance((first_child := element.children[0]), marko.inline.RawText)
             and len(snippet_text := (first_child.children.strip())) > 0
         ):
-            self.
-            self.
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Block: {element.children}")
-            doc.add_code(parent=
+            doc.add_code(parent=parent_item, text=snippet_text)
 
         elif isinstance(element, marko.inline.LineBreak):
             if self.in_table:
@@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
 
         elif isinstance(element, marko.block.HTMLBlock):
             self._html_blocks += 1
-            self.
-            self.
+            self._process_inline_text(parent_item, doc)
+            self._close_table(doc)
             _log.debug("HTML Block: {}".format(element))
             if (
                 len(element.body) > 0
@@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
 
                 # wrap in markers to enable post-processing in convert()
                 text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
-                doc.add_code(parent=
+                doc.add_code(parent=parent_item, text=text_to_add)
         else:
             if not isinstance(element, str):
-                self.
+                self._close_table(doc)
                 _log.debug("Some other element: {}".format(element))
 
         processed_block_types = (
-            marko.block.ListItem,
             marko.block.Heading,
             marko.block.CodeBlock,
             marko.block.FencedCode,
-            # marko.block.Paragraph,
             marko.inline.RawText,
         )
 
@@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             element, processed_block_types
         ):
             for child in element.children:
-                self.
+                self._iterate_elements(
+                    element=child,
+                    depth=depth + 1,
+                    doc=doc,
+                    visited=visited,
+                    parent_item=parent_item,
+                )
 
     def is_valid(self) -> bool:
         return self.valid
@@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         marko_parser = Markdown()
         parsed_ast = marko_parser.parse(self.markdown)
         # Start iterating from the root of the AST
-        self.
-
-
+        self._iterate_elements(
+            element=parsed_ast,
+            depth=0,
+            doc=doc,
+            parent_item=None,
+            visited=set(),
+        )
+        self._process_inline_text(None, doc)  # handle last hanging inline text
+        self._close_table(doc=doc)  # handle any last hanging table
 
         # if HTML blocks were detected, export to HTML and delegate to HTML backend
         if self._html_blocks > 0:
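The `visited` set threaded through `_iterate_elements` above prevents double emission: when a `ListItem` is handled, its first child is consumed for the list item's text and recorded in `visited`, so the later recursion over children skips it. A minimal, self-contained sketch of the same guard pattern (the node class and handler below are illustrative stand-ins, not the backend's real marko types):

```python
from typing import List, Optional, Set


class Node:
    """Tiny stand-in for a marko AST element."""

    def __init__(self, kind: str, children: Optional[List["Node"]] = None, text: str = ""):
        self.kind = kind
        self.children = children or []
        self.text = text


def iterate(node: Node, visited: Set[int], depth: int = 0) -> None:
    # Skip nodes that a parent handler already consumed
    # (mirrors the backend's `if element in visited: return`).
    if id(node) in visited:
        return
    if node.kind == "list_item" and node.children:
        first_child = node.children[0]
        print("  " * depth + f"list item: {first_child.text}")
        # Mark the consumed child so the recursion below does not emit it twice.
        visited.add(id(first_child))
    for child in node.children:
        iterate(child, visited, depth + 1)


bullets = Node("list", [Node("list_item", [Node("raw_text", text="first bullet")])])
iterate(bullets, visited=set())  # prints the bullet exactly once
```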
{docling-2.18.0 → docling-2.19.0}/docling/backend/msword_backend.py
@@ -242,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         parts = label.split(":")
 
         if len(parts) == 2:
-            return parts[0],
+            return parts[0], self.str_to_int(parts[1], None)
 
         parts = self.split_text_and_number(label)
 
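The one-line change above routes the numeric part of the split label through the backend's `str_to_int` helper with a `None` fallback, so a non-numeric suffix no longer raises. A standalone sketch of that pattern, using a hypothetical helper of the same name rather than the backend's actual implementation:

```python
from typing import Optional, Tuple


def str_to_int(value: str, default: Optional[int] = None) -> Optional[int]:
    # Hypothetical stand-in: parse an int, fall back instead of raising.
    try:
        return int(value)
    except ValueError:
        return default


def label_to_name_and_level(label: str) -> Tuple[str, Optional[int]]:
    parts = label.split(":")
    if len(parts) == 2:
        # e.g. "Heading:2" -> ("Heading", 2); "Heading:two" -> ("Heading", None)
        return parts[0], str_to_int(parts[1], None)
    return label, None


print(label_to_name_and_level("Heading:2"))
print(label_to_name_and_level("Heading:two"))
```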
{docling-2.18.0 → docling-2.19.0}/docling/cli/main.py
@@ -219,6 +219,13 @@ def convert(
         bool,
         typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
     ] = False,
+    enrich_picture_classes: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="Enable the picture classification enrichment model in the pipeline.",
+        ),
+    ] = False,
     artifacts_path: Annotated[
         Optional[Path],
         typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -375,6 +382,7 @@ def convert(
         do_table_structure=True,
         do_code_enrichment=enrich_code,
         do_formula_enrichment=enrich_formula,
+        do_picture_classification=enrich_picture_classes,
         document_timeout=document_timeout,
     )
     pipeline_options.table_structure_options.do_cell_matching = (
docling-2.19.0/docling/cli/models.py (new file)
@@ -0,0 +1,105 @@
+import logging
+import warnings
+from enum import Enum
+from pathlib import Path
+from typing import Annotated, Optional
+
+import typer
+from rich.console import Console
+from rich.logging import RichHandler
+
+from docling.datamodel.settings import settings
+from docling.utils.model_downloader import download_models
+
+warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
+warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
+
+console = Console()
+err_console = Console(stderr=True)
+
+
+app = typer.Typer(
+    name="Docling models helper",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+
+
+class _AvailableModels(str, Enum):
+    LAYOUT = "layout"
+    TABLEFORMER = "tableformer"
+    CODE_FORMULA = "code_formula"
+    PICTURE_CLASSIFIER = "picture_classifier"
+    EASYOCR = "easyocr"
+
+
+@app.command("download")
+def download(
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            ...,
+            "-o",
+            "--output-dir",
+            help="The directory where all the models are downloaded.",
+        ),
+    ] = (settings.cache_dir / "models"),
+    force: Annotated[
+        bool, typer.Option(..., help="If true, the download will be forced")
+    ] = False,
+    models: Annotated[
+        Optional[list[_AvailableModels]],
+        typer.Argument(
+            help=f"Models to download (default behavior: all will be downloaded)",
+        ),
+    ] = None,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "-q",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
+        ),
+    ] = False,
+):
+    if not quiet:
+        FORMAT = "%(message)s"
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[blue]%(message)s[/blue]",
+            datefmt="[%X]",
+            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
+        )
+    to_download = models or [m for m in _AvailableModels]
+    output_dir = download_models(
+        output_dir=output_dir,
+        force=force,
+        progress=(not quiet),
+        with_layout=_AvailableModels.LAYOUT in to_download,
+        with_tableformer=_AvailableModels.TABLEFORMER in to_download,
+        with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
+        with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
+        with_easyocr=_AvailableModels.EASYOCR in to_download,
+    )
+
+    if quiet:
+        typer.echo(output_dir)
+    else:
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
+
+        console.print(
+            "\n",
+            "Docling can now be configured for running offline using the local artifacts.\n\n",
+            "Using the CLI:",
+            f"`docling --artifacts-path={output_dir} FILE`",
+            "\n",
+            "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
+        )
+
+
+click_app = typer.main.get_command(app)
+
+if __name__ == "__main__":
+    app()
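For reference, the same prefetch can be done programmatically with the helper this command calls; the keyword arguments below are exactly the ones the `download` command passes, while the output directory is a placeholder:

```python
from pathlib import Path

from docling.utils.model_downloader import download_models

# Mirror the `models download` command above: fetch a subset of the model
# weights into a local folder that can later be used as the artifacts path.
artifacts_dir = download_models(
    output_dir=Path("./docling-models"),  # placeholder location
    force=False,
    progress=True,
    with_layout=True,
    with_tableformer=True,
    with_code_formula=False,
    with_picture_classifier=False,
    with_easyocr=True,
)
print(f"Models cached in: {artifacts_dir}")
```

The resulting directory can then be used offline, e.g. `docling --artifacts-path=<dir> FILE`, as the CLI's own hint above suggests.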
docling-2.19.0/docling/cli/tools.py (new file)
@@ -0,0 +1,17 @@
+import typer
+
+from docling.cli.models import app as models_app
+
+app = typer.Typer(
+    name="Docling helpers",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+
+app.add_typer(models_app, name="models")
+
+click_app = typer.main.get_command(app)
+
+if __name__ == "__main__":
+    app()
{docling-2.18.0 → docling-2.19.0}/docling/models/base_model.py
@@ -6,6 +6,7 @@ from typing_extensions import TypeVar
 
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings
 
 
 class BasePageModel(ABC):
@@ -21,6 +22,8 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 
 class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
 
+    elements_batch_size: int = settings.perf.elements_batch_size
+
     @abstractmethod
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         pass
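The new class attribute pulls its default from docling's global settings object, and subclasses can override it directly (as `CodeFormulaModel` does further down with `elements_batch_size = 5`). A minimal sketch of where the value comes from, assuming only what the import above shows:

```python
from docling.datamodel.settings import settings

# The default batch size for enrichment models comes from docling's global
# settings; the class attribute above captures this value when the class is defined.
print(settings.perf.elements_batch_size)
```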
{docling-2.18.0 → docling-2.19.0}/docling/models/code_formula_model.py
@@ -2,6 +2,7 @@ import re
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union
 
+import numpy as np
 from docling_core.types.doc import (
     CodeItem,
     DocItemLabel,
@@ -61,13 +62,15 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         Processes the given batch of elements and enriches them with predictions.
     """
 
+    _model_repo_folder = "ds4sd--CodeFormula"
+    elements_batch_size = 5
     images_scale = 1.66  # = 120 dpi, aligned with training data resolution
     expansion_factor = 0.03
 
     def __init__(
         self,
         enabled: bool,
-        artifacts_path: Optional[
+        artifacts_path: Optional[Path],
         options: CodeFormulaModelOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -96,29 +99,32 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
             )
 
             if artifacts_path is None:
-                artifacts_path = self.
+                artifacts_path = self.download_models()
             else:
-                artifacts_path =
+                artifacts_path = artifacts_path / self._model_repo_folder
 
             self.code_formula_model = CodeFormulaPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                 device=device,
                 num_threads=accelerator_options.num_threads,
            )
 
     @staticmethod
-    def
-        local_dir: Optional[Path] = None,
+    def download_models(
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
     ) -> Path:
         from huggingface_hub import snapshot_download
         from huggingface_hub.utils import disable_progress_bars
 
-
+        if not progress:
+            disable_progress_bars()
         download_path = snapshot_download(
             repo_id="ds4sd/CodeFormula",
             force_download=force,
             local_dir=local_dir,
-            revision="v1.0.
+            revision="v1.0.1",
         )
 
         return Path(download_path)
@@ -226,7 +232,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
             return
 
         labels: List[str] = []
-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[TextItem] = []
         for el in element_batch:
             assert isinstance(el.item, TextItem)
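The reworked `download_models` signature above (now with `force` and `progress`) can also be called directly to prefetch just this model. In the sketch below, `local_dir` follows the `<artifacts-path>/<_model_repo_folder>` layout the constructor expects, and the base directory is a placeholder:

```python
from pathlib import Path

from docling.models.code_formula_model import CodeFormulaModel

artifacts_root = Path("./docling-models")  # placeholder artifacts path
model_dir = CodeFormulaModel.download_models(
    local_dir=artifacts_root / "ds4sd--CodeFormula",  # matches _model_repo_folder
    force=False,
    progress=True,  # keep the Hugging Face progress bars visible
)
print(model_dir)
```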
{docling-2.18.0 → docling-2.19.0}/docling/models/document_picture_classifier.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union
 
+import numpy as np
 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
@@ -55,12 +56,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         Processes a batch of elements and adds classification annotations.
     """
 
+    _model_repo_folder = "ds4sd--DocumentFigureClassifier"
     images_scale = 2
 
     def __init__(
         self,
         enabled: bool,
-        artifacts_path: Optional[
+        artifacts_path: Optional[Path],
         options: DocumentPictureClassifierOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -88,24 +90,25 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
             )
 
             if artifacts_path is None:
-                artifacts_path = self.
+                artifacts_path = self.download_models()
             else:
-                artifacts_path =
+                artifacts_path = artifacts_path / self._model_repo_folder
 
             self.document_picture_classifier = DocumentFigureClassifierPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                 device=device,
                 num_threads=accelerator_options.num_threads,
             )
 
     @staticmethod
-    def
-        local_dir: Optional[Path] = None, force: bool = False
+    def download_models(
+        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
     ) -> Path:
         from huggingface_hub import snapshot_download
         from huggingface_hub.utils import disable_progress_bars
 
-
+        if not progress:
+            disable_progress_bars()
         download_path = snapshot_download(
             repo_id="ds4sd/DocumentFigureClassifier",
             force_download=force,
@@ -159,7 +162,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                 yield element
             return
 
-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[PictureItem] = []
         for el in element_batch:
             assert isinstance(el, PictureItem)