docling 2.18.0__tar.gz → 2.19.0__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {docling-2.18.0 → docling-2.19.0}/PKG-INFO +5 -4
  2. {docling-2.18.0 → docling-2.19.0}/docling/backend/md_backend.py +62 -46
  3. {docling-2.18.0 → docling-2.19.0}/docling/backend/msword_backend.py +1 -1
  4. {docling-2.18.0 → docling-2.19.0}/docling/cli/main.py +8 -0
  5. docling-2.19.0/docling/cli/models.py +105 -0
  6. docling-2.19.0/docling/cli/tools.py +17 -0
  7. {docling-2.18.0 → docling-2.19.0}/docling/datamodel/settings.py +2 -0
  8. {docling-2.18.0 → docling-2.19.0}/docling/models/base_model.py +3 -0
  9. {docling-2.18.0 → docling-2.19.0}/docling/models/code_formula_model.py +15 -9
  10. {docling-2.18.0 → docling-2.19.0}/docling/models/document_picture_classifier.py +11 -8
  11. {docling-2.18.0 → docling-2.19.0}/docling/models/easyocr_model.py +50 -3
  12. {docling-2.18.0 → docling-2.19.0}/docling/models/layout_model.py +49 -3
  13. {docling-2.18.0 → docling-2.19.0}/docling/models/table_structure_model.py +44 -2
  14. {docling-2.18.0 → docling-2.19.0}/docling/pipeline/base_pipeline.py +1 -1
  15. {docling-2.18.0 → docling-2.19.0}/docling/pipeline/standard_pdf_pipeline.py +25 -24
  16. docling-2.19.0/docling/utils/model_downloader.py +72 -0
  17. {docling-2.18.0 → docling-2.19.0}/docling/utils/utils.py +24 -0
  18. {docling-2.18.0 → docling-2.19.0}/pyproject.toml +7 -4
  19. {docling-2.18.0 → docling-2.19.0}/LICENSE +0 -0
  20. {docling-2.18.0 → docling-2.19.0}/README.md +0 -0
  21. {docling-2.18.0 → docling-2.19.0}/docling/__init__.py +0 -0
  22. {docling-2.18.0 → docling-2.19.0}/docling/backend/__init__.py +0 -0
  23. {docling-2.18.0 → docling-2.19.0}/docling/backend/abstract_backend.py +0 -0
  24. {docling-2.18.0 → docling-2.19.0}/docling/backend/asciidoc_backend.py +0 -0
  25. {docling-2.18.0 → docling-2.19.0}/docling/backend/docling_parse_backend.py +0 -0
  26. {docling-2.18.0 → docling-2.19.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  27. {docling-2.18.0 → docling-2.19.0}/docling/backend/html_backend.py +0 -0
  28. {docling-2.18.0 → docling-2.19.0}/docling/backend/json/__init__.py +0 -0
  29. {docling-2.18.0 → docling-2.19.0}/docling/backend/json/docling_json_backend.py +0 -0
  30. {docling-2.18.0 → docling-2.19.0}/docling/backend/msexcel_backend.py +0 -0
  31. {docling-2.18.0 → docling-2.19.0}/docling/backend/mspowerpoint_backend.py +0 -0
  32. {docling-2.18.0 → docling-2.19.0}/docling/backend/pdf_backend.py +0 -0
  33. {docling-2.18.0 → docling-2.19.0}/docling/backend/pypdfium2_backend.py +0 -0
  34. {docling-2.18.0 → docling-2.19.0}/docling/backend/xml/__init__.py +0 -0
  35. {docling-2.18.0 → docling-2.19.0}/docling/backend/xml/pubmed_backend.py +0 -0
  36. {docling-2.18.0 → docling-2.19.0}/docling/backend/xml/uspto_backend.py +0 -0
  37. {docling-2.18.0 → docling-2.19.0}/docling/chunking/__init__.py +0 -0
  38. {docling-2.18.0 → docling-2.19.0}/docling/cli/__init__.py +0 -0
  39. {docling-2.18.0 → docling-2.19.0}/docling/datamodel/__init__.py +0 -0
  40. {docling-2.18.0 → docling-2.19.0}/docling/datamodel/base_models.py +0 -0
  41. {docling-2.18.0 → docling-2.19.0}/docling/datamodel/document.py +0 -0
  42. {docling-2.18.0 → docling-2.19.0}/docling/datamodel/pipeline_options.py +0 -0
  43. {docling-2.18.0 → docling-2.19.0}/docling/document_converter.py +0 -0
  44. {docling-2.18.0 → docling-2.19.0}/docling/exceptions.py +0 -0
  45. {docling-2.18.0 → docling-2.19.0}/docling/models/__init__.py +0 -0
  46. {docling-2.18.0 → docling-2.19.0}/docling/models/base_ocr_model.py +0 -0
  47. {docling-2.18.0 → docling-2.19.0}/docling/models/ds_glm_model.py +0 -0
  48. {docling-2.18.0 → docling-2.19.0}/docling/models/ocr_mac_model.py +0 -0
  49. {docling-2.18.0 → docling-2.19.0}/docling/models/page_assemble_model.py +0 -0
  50. {docling-2.18.0 → docling-2.19.0}/docling/models/page_preprocessing_model.py +0 -0
  51. {docling-2.18.0 → docling-2.19.0}/docling/models/rapid_ocr_model.py +0 -0
  52. {docling-2.18.0 → docling-2.19.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  53. {docling-2.18.0 → docling-2.19.0}/docling/models/tesseract_ocr_model.py +0 -0
  54. {docling-2.18.0 → docling-2.19.0}/docling/pipeline/__init__.py +0 -0
  55. {docling-2.18.0 → docling-2.19.0}/docling/pipeline/simple_pipeline.py +0 -0
  56. {docling-2.18.0 → docling-2.19.0}/docling/py.typed +0 -0
  57. {docling-2.18.0 → docling-2.19.0}/docling/utils/__init__.py +0 -0
  58. {docling-2.18.0 → docling-2.19.0}/docling/utils/accelerator_utils.py +0 -0
  59. {docling-2.18.0 → docling-2.19.0}/docling/utils/export.py +0 -0
  60. {docling-2.18.0 → docling-2.19.0}/docling/utils/glm_utils.py +0 -0
  61. {docling-2.18.0 → docling-2.19.0}/docling/utils/layout_postprocessor.py +0 -0
  62. {docling-2.18.0 → docling-2.19.0}/docling/utils/ocr_utils.py +0 -0
  63. {docling-2.18.0 → docling-2.19.0}/docling/utils/profiling.py +0 -0
  64. {docling-2.18.0 → docling-2.19.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.18.0
3
+ Version: 2.19.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -24,12 +24,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
24
  Provides-Extra: ocrmac
25
25
  Provides-Extra: rapidocr
26
26
  Provides-Extra: tesserocr
27
- Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
28
28
  Requires-Dist: certifi (>=2024.7.4)
29
29
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
30
- Requires-Dist: docling-core[chunking] (>=2.17.0,<3.0.0)
30
+ Requires-Dist: docling-core[chunking] (>=2.17.2,<3.0.0)
31
31
  Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
32
- Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
32
+ Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
33
33
  Requires-Dist: easyocr (>=1.7,<2.0)
34
34
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
35
35
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -52,6 +52,7 @@ Requires-Dist: rtree (>=1.3.0,<2.0.0)
52
52
  Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
53
53
  Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
54
54
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
55
+ Requires-Dist: tqdm (>=4.65.0,<5.0.0)
55
56
  Requires-Dist: typer (>=0.12.5,<0.13.0)
56
57
  Project-URL: Repository, https://github.com/DS4SD/docling
57
58
  Description-Content-Type: text/markdown
@@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
36
36
 
37
37
 
38
38
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
39
- def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
39
+ def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
40
40
  # This regex will match any sequence of underscores
41
41
  pattern = r"_+"
42
42
 
@@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
81
81
  # very long sequences of underscores will lead to unnecessary long processing times.
82
82
  # In any proper Markdown files, underscores have to be escaped,
83
83
  # otherwise they represent emphasis (bold or italic)
84
- self.markdown = self.shorten_underscore_sequences(text_stream)
84
+ self.markdown = self._shorten_underscore_sequences(text_stream)
85
85
  if isinstance(self.path_or_stream, Path):
86
86
  with open(self.path_or_stream, "r", encoding="utf-8") as f:
87
87
  md_content = f.read()
@@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
89
89
  # very long sequences of underscores will lead to unnecessary long processing times.
90
90
  # In any proper Markdown files, underscores have to be escaped,
91
91
  # otherwise they represent emphasis (bold or italic)
92
- self.markdown = self.shorten_underscore_sequences(md_content)
92
+ self.markdown = self._shorten_underscore_sequences(md_content)
93
93
  self.valid = True
94
94
 
95
95
  _log.debug(self.markdown)
@@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
99
99
  ) from e
100
100
  return
101
101
 
102
- def close_table(self, doc: DoclingDocument):
102
+ def _close_table(self, doc: DoclingDocument):
103
103
  if self.in_table:
104
104
  _log.debug("=== TABLE START ===")
105
105
  for md_table_row in self.md_table_buffer:
@@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
156
156
  doc.add_table(data=table_data)
157
157
  return
158
158
 
159
- def process_inline_text(
160
- self, parent_element: Optional[NodeItem], doc: DoclingDocument
159
+ def _process_inline_text(
160
+ self, parent_item: Optional[NodeItem], doc: DoclingDocument
161
161
  ):
162
162
  txt = " ".join(self.inline_texts)
163
163
  if len(txt) > 0:
164
164
  doc.add_text(
165
165
  label=DocItemLabel.PARAGRAPH,
166
- parent=parent_element,
166
+ parent=parent_item,
167
167
  text=txt,
168
168
  )
169
169
  self.inline_texts = []
170
170
 
171
- def iterate_elements(
171
+ def _iterate_elements(
172
172
  self,
173
173
  element: marko.element.Element,
174
174
  depth: int,
175
175
  doc: DoclingDocument,
176
- parent_element: Optional[NodeItem] = None,
176
+ visited: Set[marko.element.Element],
177
+ parent_item: Optional[NodeItem] = None,
177
178
  ):
179
+
180
+ if element in visited:
181
+ return
182
+
178
183
  # Iterates over all elements in the AST
179
184
  # Check for different element types and process relevant details
180
185
  if isinstance(element, marko.block.Heading) and len(element.children) > 0:
181
- self.close_table(doc)
182
- self.process_inline_text(parent_element, doc)
186
+ self._close_table(doc)
187
+ self._process_inline_text(parent_item, doc)
183
188
  _log.debug(
184
189
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
185
190
  )
@@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
207
212
  traverse(element)
208
213
  snippet_text = "".join(strings)
209
214
  if len(snippet_text) > 0:
210
- parent_element = doc.add_text(
211
- label=doc_label, parent=parent_element, text=snippet_text
215
+ parent_item = doc.add_text(
216
+ label=doc_label, parent=parent_item, text=snippet_text
212
217
  )
213
218
 
214
219
  elif isinstance(element, marko.block.List):
@@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
218
223
  has_non_empty_list_items = True
219
224
  break
220
225
 
221
- self.close_table(doc)
222
- self.process_inline_text(parent_element, doc)
226
+ self._close_table(doc)
227
+ self._process_inline_text(parent_item, doc)
223
228
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
224
229
  if has_non_empty_list_items:
225
230
  label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
226
- parent_element = doc.add_group(
227
- label=label, name=f"list", parent=parent_element
231
+ parent_item = doc.add_group(
232
+ label=label, name=f"list", parent=parent_item
228
233
  )
229
234
 
230
235
  elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
231
- self.close_table(doc)
232
- self.process_inline_text(parent_element, doc)
236
+ self._close_table(doc)
237
+ self._process_inline_text(parent_item, doc)
233
238
  _log.debug(" - List item")
234
239
 
235
- snippet_text = str(element.children[0].children[0].children) # type: ignore
240
+ first_child = element.children[0]
241
+ snippet_text = str(first_child.children[0].children) # type: ignore
236
242
  is_numbered = False
237
243
  if (
238
- parent_element is not None
239
- and isinstance(parent_element, DocItem)
240
- and parent_element.label == GroupLabel.ORDERED_LIST
244
+ parent_item is not None
245
+ and isinstance(parent_item, DocItem)
246
+ and parent_item.label == GroupLabel.ORDERED_LIST
241
247
  ):
242
248
  is_numbered = True
243
249
  doc.add_list_item(
244
- enumerated=is_numbered, parent=parent_element, text=snippet_text
250
+ enumerated=is_numbered, parent=parent_item, text=snippet_text
245
251
  )
252
+ visited.add(first_child)
246
253
 
247
254
  elif isinstance(element, marko.inline.Image):
248
- self.close_table(doc)
249
- self.process_inline_text(parent_element, doc)
255
+ self._close_table(doc)
256
+ self._process_inline_text(parent_item, doc)
250
257
  _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
251
258
 
252
259
  fig_caption: Optional[TextItem] = None
@@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
255
262
  label=DocItemLabel.CAPTION, text=element.title
256
263
  )
257
264
 
258
- doc.add_picture(parent=parent_element, caption=fig_caption)
265
+ doc.add_picture(parent=parent_item, caption=fig_caption)
259
266
 
260
267
  elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
261
- self.process_inline_text(parent_element, doc)
268
+ self._process_inline_text(parent_item, doc)
262
269
 
263
270
  elif isinstance(element, marko.inline.RawText):
264
271
  _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
272
279
  else:
273
280
  self.md_table_buffer.append(snippet_text)
274
281
  else:
275
- self.close_table(doc)
276
- self.in_table = False
282
+ self._close_table(doc)
277
283
  # most likely just inline text
278
284
  self.inline_texts.append(str(element.children))
279
285
 
280
286
  elif isinstance(element, marko.inline.CodeSpan):
281
- self.close_table(doc)
282
- self.process_inline_text(parent_element, doc)
287
+ self._close_table(doc)
288
+ self._process_inline_text(parent_item, doc)
283
289
  _log.debug(f" - Code Span: {element.children}")
284
290
  snippet_text = str(element.children).strip()
285
- doc.add_code(parent=parent_element, text=snippet_text)
291
+ doc.add_code(parent=parent_item, text=snippet_text)
286
292
 
287
293
  elif (
288
294
  isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
@@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
290
296
  and isinstance((first_child := element.children[0]), marko.inline.RawText)
291
297
  and len(snippet_text := (first_child.children.strip())) > 0
292
298
  ):
293
- self.close_table(doc)
294
- self.process_inline_text(parent_element, doc)
299
+ self._close_table(doc)
300
+ self._process_inline_text(parent_item, doc)
295
301
  _log.debug(f" - Code Block: {element.children}")
296
- doc.add_code(parent=parent_element, text=snippet_text)
302
+ doc.add_code(parent=parent_item, text=snippet_text)
297
303
 
298
304
  elif isinstance(element, marko.inline.LineBreak):
299
305
  if self.in_table:
@@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
302
308
 
303
309
  elif isinstance(element, marko.block.HTMLBlock):
304
310
  self._html_blocks += 1
305
- self.process_inline_text(parent_element, doc)
306
- self.close_table(doc)
311
+ self._process_inline_text(parent_item, doc)
312
+ self._close_table(doc)
307
313
  _log.debug("HTML Block: {}".format(element))
308
314
  if (
309
315
  len(element.body) > 0
@@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
312
318
 
313
319
  # wrap in markers to enable post-processing in convert()
314
320
  text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
315
- doc.add_code(parent=parent_element, text=text_to_add)
321
+ doc.add_code(parent=parent_item, text=text_to_add)
316
322
  else:
317
323
  if not isinstance(element, str):
318
- self.close_table(doc)
324
+ self._close_table(doc)
319
325
  _log.debug("Some other element: {}".format(element))
320
326
 
321
327
  processed_block_types = (
322
- marko.block.ListItem,
323
328
  marko.block.Heading,
324
329
  marko.block.CodeBlock,
325
330
  marko.block.FencedCode,
326
- # marko.block.Paragraph,
327
331
  marko.inline.RawText,
328
332
  )
329
333
 
@@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
332
336
  element, processed_block_types
333
337
  ):
334
338
  for child in element.children:
335
- self.iterate_elements(child, depth + 1, doc, parent_element)
339
+ self._iterate_elements(
340
+ element=child,
341
+ depth=depth + 1,
342
+ doc=doc,
343
+ visited=visited,
344
+ parent_item=parent_item,
345
+ )
336
346
 
337
347
  def is_valid(self) -> bool:
338
348
  return self.valid
@@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
366
376
  marko_parser = Markdown()
367
377
  parsed_ast = marko_parser.parse(self.markdown)
368
378
  # Start iterating from the root of the AST
369
- self.iterate_elements(parsed_ast, 0, doc, None)
370
- self.process_inline_text(None, doc) # handle last hanging inline text
371
- self.close_table(doc=doc) # handle any last hanging table
379
+ self._iterate_elements(
380
+ element=parsed_ast,
381
+ depth=0,
382
+ doc=doc,
383
+ parent_item=None,
384
+ visited=set(),
385
+ )
386
+ self._process_inline_text(None, doc) # handle last hanging inline text
387
+ self._close_table(doc=doc) # handle any last hanging table
372
388
 
373
389
  # if HTML blocks were detected, export to HTML and delegate to HTML backend
374
390
  if self._html_blocks > 0:
@@ -242,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
242
242
  parts = label.split(":")
243
243
 
244
244
  if len(parts) == 2:
245
- return parts[0], int(parts[1])
245
+ return parts[0], self.str_to_int(parts[1], None)
246
246
 
247
247
  parts = self.split_text_and_number(label)
248
248
 
@@ -219,6 +219,13 @@ def convert(
219
219
  bool,
220
220
  typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
221
221
  ] = False,
222
+ enrich_picture_classes: Annotated[
223
+ bool,
224
+ typer.Option(
225
+ ...,
226
+ help="Enable the picture classification enrichment model in the pipeline.",
227
+ ),
228
+ ] = False,
222
229
  artifacts_path: Annotated[
223
230
  Optional[Path],
224
231
  typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -375,6 +382,7 @@ def convert(
375
382
  do_table_structure=True,
376
383
  do_code_enrichment=enrich_code,
377
384
  do_formula_enrichment=enrich_formula,
385
+ do_picture_classification=enrich_picture_classes,
378
386
  document_timeout=document_timeout,
379
387
  )
380
388
  pipeline_options.table_structure_options.do_cell_matching = (
@@ -0,0 +1,105 @@
1
+ import logging
2
+ import warnings
3
+ from enum import Enum
4
+ from pathlib import Path
5
+ from typing import Annotated, Optional
6
+
7
+ import typer
8
+ from rich.console import Console
9
+ from rich.logging import RichHandler
10
+
11
+ from docling.datamodel.settings import settings
12
+ from docling.utils.model_downloader import download_models
13
+
14
+ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
15
+ warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
16
+
17
+ console = Console()
18
+ err_console = Console(stderr=True)
19
+
20
+
21
+ app = typer.Typer(
22
+ name="Docling models helper",
23
+ no_args_is_help=True,
24
+ add_completion=False,
25
+ pretty_exceptions_enable=False,
26
+ )
27
+
28
+
29
+ class _AvailableModels(str, Enum):
30
+ LAYOUT = "layout"
31
+ TABLEFORMER = "tableformer"
32
+ CODE_FORMULA = "code_formula"
33
+ PICTURE_CLASSIFIER = "picture_classifier"
34
+ EASYOCR = "easyocr"
35
+
36
+
37
+ @app.command("download")
38
+ def download(
39
+ output_dir: Annotated[
40
+ Path,
41
+ typer.Option(
42
+ ...,
43
+ "-o",
44
+ "--output-dir",
45
+ help="The directory where all the models are downloaded.",
46
+ ),
47
+ ] = (settings.cache_dir / "models"),
48
+ force: Annotated[
49
+ bool, typer.Option(..., help="If true, the download will be forced")
50
+ ] = False,
51
+ models: Annotated[
52
+ Optional[list[_AvailableModels]],
53
+ typer.Argument(
54
+ help=f"Models to download (default behavior: all will be downloaded)",
55
+ ),
56
+ ] = None,
57
+ quiet: Annotated[
58
+ bool,
59
+ typer.Option(
60
+ ...,
61
+ "-q",
62
+ "--quiet",
63
+ help="No extra output is generated, the CLI prints only the directory with the cached models.",
64
+ ),
65
+ ] = False,
66
+ ):
67
+ if not quiet:
68
+ FORMAT = "%(message)s"
69
+ logging.basicConfig(
70
+ level=logging.INFO,
71
+ format="[blue]%(message)s[/blue]",
72
+ datefmt="[%X]",
73
+ handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
74
+ )
75
+ to_download = models or [m for m in _AvailableModels]
76
+ output_dir = download_models(
77
+ output_dir=output_dir,
78
+ force=force,
79
+ progress=(not quiet),
80
+ with_layout=_AvailableModels.LAYOUT in to_download,
81
+ with_tableformer=_AvailableModels.TABLEFORMER in to_download,
82
+ with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
83
+ with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
84
+ with_easyocr=_AvailableModels.EASYOCR in to_download,
85
+ )
86
+
87
+ if quiet:
88
+ typer.echo(output_dir)
89
+ else:
90
+ typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
91
+
92
+ console.print(
93
+ "\n",
94
+ "Docling can now be configured for running offline using the local artifacts.\n\n",
95
+ "Using the CLI:",
96
+ f"`docling --artifacts-path={output_dir} FILE`",
97
+ "\n",
98
+ "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
99
+ )
100
+
101
+
102
+ click_app = typer.main.get_command(app)
103
+
104
+ if __name__ == "__main__":
105
+ app()
@@ -0,0 +1,17 @@
1
+ import typer
2
+
3
+ from docling.cli.models import app as models_app
4
+
5
+ app = typer.Typer(
6
+ name="Docling helpers",
7
+ no_args_is_help=True,
8
+ add_completion=False,
9
+ pretty_exceptions_enable=False,
10
+ )
11
+
12
+ app.add_typer(models_app, name="models")
13
+
14
+ click_app = typer.main.get_command(app)
15
+
16
+ if __name__ == "__main__":
17
+ app()
@@ -61,5 +61,7 @@ class AppSettings(BaseSettings):
61
61
  perf: BatchConcurrencySettings
62
62
  debug: DebugSettings
63
63
 
64
+ cache_dir: Path = Path.home() / ".cache" / "docling"
65
+
64
66
 
65
67
  settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
@@ -6,6 +6,7 @@ from typing_extensions import TypeVar
6
6
 
7
7
  from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
8
8
  from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.settings import settings
9
10
 
10
11
 
11
12
  class BasePageModel(ABC):
@@ -21,6 +22,8 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
21
22
 
22
23
  class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
23
24
 
25
+ elements_batch_size: int = settings.perf.elements_batch_size
26
+
24
27
  @abstractmethod
25
28
  def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
26
29
  pass
@@ -2,6 +2,7 @@ import re
2
2
  from pathlib import Path
3
3
  from typing import Iterable, List, Literal, Optional, Tuple, Union
4
4
 
5
+ import numpy as np
5
6
  from docling_core.types.doc import (
6
7
  CodeItem,
7
8
  DocItemLabel,
@@ -61,13 +62,15 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
61
62
  Processes the given batch of elements and enriches them with predictions.
62
63
  """
63
64
 
65
+ _model_repo_folder = "ds4sd--CodeFormula"
66
+ elements_batch_size = 5
64
67
  images_scale = 1.66 # = 120 dpi, aligned with training data resolution
65
68
  expansion_factor = 0.03
66
69
 
67
70
  def __init__(
68
71
  self,
69
72
  enabled: bool,
70
- artifacts_path: Optional[Union[Path, str]],
73
+ artifacts_path: Optional[Path],
71
74
  options: CodeFormulaModelOptions,
72
75
  accelerator_options: AcceleratorOptions,
73
76
  ):
@@ -96,29 +99,32 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
96
99
  )
97
100
 
98
101
  if artifacts_path is None:
99
- artifacts_path = self.download_models_hf()
102
+ artifacts_path = self.download_models()
100
103
  else:
101
- artifacts_path = Path(artifacts_path)
104
+ artifacts_path = artifacts_path / self._model_repo_folder
102
105
 
103
106
  self.code_formula_model = CodeFormulaPredictor(
104
- artifacts_path=artifacts_path,
107
+ artifacts_path=str(artifacts_path),
105
108
  device=device,
106
109
  num_threads=accelerator_options.num_threads,
107
110
  )
108
111
 
109
112
  @staticmethod
110
- def download_models_hf(
111
- local_dir: Optional[Path] = None, force: bool = False
113
+ def download_models(
114
+ local_dir: Optional[Path] = None,
115
+ force: bool = False,
116
+ progress: bool = False,
112
117
  ) -> Path:
113
118
  from huggingface_hub import snapshot_download
114
119
  from huggingface_hub.utils import disable_progress_bars
115
120
 
116
- disable_progress_bars()
121
+ if not progress:
122
+ disable_progress_bars()
117
123
  download_path = snapshot_download(
118
124
  repo_id="ds4sd/CodeFormula",
119
125
  force_download=force,
120
126
  local_dir=local_dir,
121
- revision="v1.0.0",
127
+ revision="v1.0.1",
122
128
  )
123
129
 
124
130
  return Path(download_path)
@@ -226,7 +232,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
226
232
  return
227
233
 
228
234
  labels: List[str] = []
229
- images: List[Image.Image] = []
235
+ images: List[Union[Image.Image, np.ndarray]] = []
230
236
  elements: List[TextItem] = []
231
237
  for el in element_batch:
232
238
  assert isinstance(el.item, TextItem)
@@ -1,6 +1,7 @@
1
1
  from pathlib import Path
2
2
  from typing import Iterable, List, Literal, Optional, Tuple, Union
3
3
 
4
+ import numpy as np
4
5
  from docling_core.types.doc import (
5
6
  DoclingDocument,
6
7
  NodeItem,
@@ -55,12 +56,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
55
56
  Processes a batch of elements and adds classification annotations.
56
57
  """
57
58
 
59
+ _model_repo_folder = "ds4sd--DocumentFigureClassifier"
58
60
  images_scale = 2
59
61
 
60
62
  def __init__(
61
63
  self,
62
64
  enabled: bool,
63
- artifacts_path: Optional[Union[Path, str]],
65
+ artifacts_path: Optional[Path],
64
66
  options: DocumentPictureClassifierOptions,
65
67
  accelerator_options: AcceleratorOptions,
66
68
  ):
@@ -88,24 +90,25 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
88
90
  )
89
91
 
90
92
  if artifacts_path is None:
91
- artifacts_path = self.download_models_hf()
93
+ artifacts_path = self.download_models()
92
94
  else:
93
- artifacts_path = Path(artifacts_path)
95
+ artifacts_path = artifacts_path / self._model_repo_folder
94
96
 
95
97
  self.document_picture_classifier = DocumentFigureClassifierPredictor(
96
- artifacts_path=artifacts_path,
98
+ artifacts_path=str(artifacts_path),
97
99
  device=device,
98
100
  num_threads=accelerator_options.num_threads,
99
101
  )
100
102
 
101
103
  @staticmethod
102
- def download_models_hf(
103
- local_dir: Optional[Path] = None, force: bool = False
104
+ def download_models(
105
+ local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
104
106
  ) -> Path:
105
107
  from huggingface_hub import snapshot_download
106
108
  from huggingface_hub.utils import disable_progress_bars
107
109
 
108
- disable_progress_bars()
110
+ if not progress:
111
+ disable_progress_bars()
109
112
  download_path = snapshot_download(
110
113
  repo_id="ds4sd/DocumentFigureClassifier",
111
114
  force_download=force,
@@ -159,7 +162,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
159
162
  yield element
160
163
  return
161
164
 
162
- images: List[Image.Image] = []
165
+ images: List[Union[Image.Image, np.ndarray]] = []
163
166
  elements: List[PictureItem] = []
164
167
  for el in element_batch:
165
168
  assert isinstance(el, PictureItem)