docling 2.18.0__py3-none-any.whl → 2.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/backend/md_backend.py CHANGED
@@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"


 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
+    def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"

@@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # very long sequences of underscores will lead to unnecessary long processing times.
                 # In any proper Markdown files, underscores have to be escaped,
                 # otherwise they represent emphasis (bold or italic)
-                self.markdown = self.shorten_underscore_sequences(text_stream)
+                self.markdown = self._shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
                 with open(self.path_or_stream, "r", encoding="utf-8") as f:
                     md_content = f.read()
@@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     # very long sequences of underscores will lead to unnecessary long processing times.
                     # In any proper Markdown files, underscores have to be escaped,
                     # otherwise they represent emphasis (bold or italic)
-                    self.markdown = self.shorten_underscore_sequences(md_content)
+                    self.markdown = self._shorten_underscore_sequences(md_content)
             self.valid = True

             _log.debug(self.markdown)
@@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             ) from e
         return

-    def close_table(self, doc: DoclingDocument):
+    def _close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
@@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             doc.add_table(data=table_data)
         return

-    def process_inline_text(
-        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    def _process_inline_text(
+        self, parent_item: Optional[NodeItem], doc: DoclingDocument
     ):
         txt = " ".join(self.inline_texts)
         if len(txt) > 0:
             doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
-                parent=parent_element,
+                parent=parent_item,
                 text=txt,
             )
         self.inline_texts = []

-    def iterate_elements(
+    def _iterate_elements(
         self,
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
-        parent_element: Optional[NodeItem] = None,
+        visited: Set[marko.element.Element],
+        parent_item: Optional[NodeItem] = None,
     ):
+
+        if element in visited:
+            return
+
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading) and len(element.children) > 0:
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(
                 f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
@@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             traverse(element)
             snippet_text = "".join(strings)
             if len(snippet_text) > 0:
-                parent_element = doc.add_text(
-                    label=doc_label, parent=parent_element, text=snippet_text
+                parent_item = doc.add_text(
+                    label=doc_label, parent=parent_item, text=snippet_text
                 )

         elif isinstance(element, marko.block.List):
@@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     has_non_empty_list_items = True
                     break

-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
-                parent_element = doc.add_group(
-                    label=label, name=f"list", parent=parent_element
+                parent_item = doc.add_group(
+                    label=label, name=f"list", parent=parent_item
                 )

         elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(" - List item")

-            snippet_text = str(element.children[0].children[0].children)  # type: ignore
+            first_child = element.children[0]
+            snippet_text = str(first_child.children[0].children)  # type: ignore
             is_numbered = False
             if (
-                parent_element is not None
-                and isinstance(parent_element, DocItem)
-                and parent_element.label == GroupLabel.ORDERED_LIST
+                parent_item is not None
+                and isinstance(parent_item, DocItem)
+                and parent_item.label == GroupLabel.ORDERED_LIST
             ):
                 is_numbered = True
             doc.add_list_item(
-                enumerated=is_numbered, parent=parent_element, text=snippet_text
+                enumerated=is_numbered, parent=parent_item, text=snippet_text
             )
+            visited.add(first_child)

         elif isinstance(element, marko.inline.Image):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")

             fig_caption: Optional[TextItem] = None
@@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     label=DocItemLabel.CAPTION, text=element.title
                 )

-            doc.add_picture(parent=parent_element, caption=fig_caption)
+            doc.add_picture(parent=parent_item, caption=fig_caption)

         elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
-            self.process_inline_text(parent_element, doc)
+            self._process_inline_text(parent_item, doc)

         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 else:
                     self.md_table_buffer.append(snippet_text)
             else:
-                self.close_table(doc)
-                self.in_table = False
+                self._close_table(doc)
                 # most likely just inline text
                 self.inline_texts.append(str(element.children))

         elif isinstance(element, marko.inline.CodeSpan):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_code(parent=parent_element, text=snippet_text)
+            doc.add_code(parent=parent_item, text=snippet_text)

         elif (
             isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
@@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             and isinstance((first_child := element.children[0]), marko.inline.RawText)
             and len(snippet_text := (first_child.children.strip())) > 0
         ):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Block: {element.children}")
-            doc.add_code(parent=parent_element, text=snippet_text)
+            doc.add_code(parent=parent_item, text=snippet_text)

         elif isinstance(element, marko.inline.LineBreak):
             if self.in_table:
@@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

         elif isinstance(element, marko.block.HTMLBlock):
             self._html_blocks += 1
-            self.process_inline_text(parent_element, doc)
-            self.close_table(doc)
+            self._process_inline_text(parent_item, doc)
+            self._close_table(doc)
             _log.debug("HTML Block: {}".format(element))
             if (
                 len(element.body) > 0
@@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

                 # wrap in markers to enable post-processing in convert()
                 text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
-                doc.add_code(parent=parent_element, text=text_to_add)
+                doc.add_code(parent=parent_item, text=text_to_add)
         else:
             if not isinstance(element, str):
-                self.close_table(doc)
+                self._close_table(doc)
             _log.debug("Some other element: {}".format(element))

         processed_block_types = (
-            marko.block.ListItem,
             marko.block.Heading,
             marko.block.CodeBlock,
             marko.block.FencedCode,
-            # marko.block.Paragraph,
             marko.inline.RawText,
         )

@@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             element, processed_block_types
         ):
             for child in element.children:
-                self.iterate_elements(child, depth + 1, doc, parent_element)
+                self._iterate_elements(
+                    element=child,
+                    depth=depth + 1,
+                    doc=doc,
+                    visited=visited,
+                    parent_item=parent_item,
+                )

     def is_valid(self) -> bool:
         return self.valid
@@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             marko_parser = Markdown()
             parsed_ast = marko_parser.parse(self.markdown)
             # Start iterating from the root of the AST
-            self.iterate_elements(parsed_ast, 0, doc, None)
-            self.process_inline_text(None, doc)  # handle last hanging inline text
-            self.close_table(doc=doc)  # handle any last hanging table
+            self._iterate_elements(
+                element=parsed_ast,
+                depth=0,
+                doc=doc,
+                parent_item=None,
+                visited=set(),
+            )
+            self._process_inline_text(None, doc)  # handle last hanging inline text
+            self._close_table(doc=doc)  # handle any last hanging table

             # if HTML blocks were detected, export to HTML and delegate to HTML backend
             if self._html_blocks > 0:

docling/backend/msword_backend.py CHANGED
@@ -242,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         parts = label.split(":")

         if len(parts) == 2:
-            return parts[0], int(parts[1])
+            return parts[0], self.str_to_int(parts[1], None)

         parts = self.split_text_and_number(label)

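The Markdown backend changes above are internal: the helper methods gain a leading underscore, and a visited set avoids emitting a list item's text twice now that list-item children are traversed. The public conversion API is unchanged; a minimal sketch of exercising the backend, assuming a local sample.md file (the file name is only an example):

import logging

from docling.document_converter import DocumentConverter

logging.basicConfig(level=logging.DEBUG)  # surfaces the backend's _log.debug() traces

# Converting a Markdown file routes through MarkdownDocumentBackend, which flushes
# inline text and closes any hanging table exactly as in the methods shown above.
result = DocumentConverter().convert("sample.md")
print(result.document.export_to_markdown())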
docling/cli/main.py CHANGED
@@ -219,6 +219,13 @@ def convert(
         bool,
         typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
     ] = False,
+    enrich_picture_classes: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="Enable the picture classification enrichment model in the pipeline.",
+        ),
+    ] = False,
     artifacts_path: Annotated[
         Optional[Path],
         typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -375,6 +382,7 @@ def convert(
         do_table_structure=True,
         do_code_enrichment=enrich_code,
         do_formula_enrichment=enrich_formula,
+        do_picture_classification=enrich_picture_classes,
         document_timeout=document_timeout,
     )
     pipeline_options.table_structure_options.do_cell_matching = (
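The new option (typer derives the flag name from the parameter, presumably --enrich-picture-classes) simply maps onto the existing do_picture_classification switch of PdfPipelineOptions, so the same enrichment can be enabled from Python. A minimal sketch, with the input file name as an example only:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Python equivalent of enabling the picture classification enrichment via the CLI flag
pipeline_options = PdfPipelineOptions(do_picture_classification=True)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("report.pdf")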
docling/cli/models.py ADDED
@@ -0,0 +1,105 @@
+import logging
+import warnings
+from enum import Enum
+from pathlib import Path
+from typing import Annotated, Optional
+
+import typer
+from rich.console import Console
+from rich.logging import RichHandler
+
+from docling.datamodel.settings import settings
+from docling.utils.model_downloader import download_models
+
+warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
+warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
+
+console = Console()
+err_console = Console(stderr=True)
+
+
+app = typer.Typer(
+    name="Docling models helper",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+
+
+class _AvailableModels(str, Enum):
+    LAYOUT = "layout"
+    TABLEFORMER = "tableformer"
+    CODE_FORMULA = "code_formula"
+    PICTURE_CLASSIFIER = "picture_classifier"
+    EASYOCR = "easyocr"
+
+
+@app.command("download")
+def download(
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            ...,
+            "-o",
+            "--output-dir",
+            help="The directory where all the models are downloaded.",
+        ),
+    ] = (settings.cache_dir / "models"),
+    force: Annotated[
+        bool, typer.Option(..., help="If true, the download will be forced")
+    ] = False,
+    models: Annotated[
+        Optional[list[_AvailableModels]],
+        typer.Argument(
+            help=f"Models to download (default behavior: all will be downloaded)",
+        ),
+    ] = None,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "-q",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
+        ),
+    ] = False,
+):
+    if not quiet:
+        FORMAT = "%(message)s"
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[blue]%(message)s[/blue]",
+            datefmt="[%X]",
+            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
+        )
+    to_download = models or [m for m in _AvailableModels]
+    output_dir = download_models(
+        output_dir=output_dir,
+        force=force,
+        progress=(not quiet),
+        with_layout=_AvailableModels.LAYOUT in to_download,
+        with_tableformer=_AvailableModels.TABLEFORMER in to_download,
+        with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
+        with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
+        with_easyocr=_AvailableModels.EASYOCR in to_download,
+    )
+
+    if quiet:
+        typer.echo(output_dir)
+    else:
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
+
+        console.print(
+            "\n",
+            "Docling can now be configured for running offline using the local artifacts.\n\n",
+            "Using the CLI:",
+            f"`docling --artifacts-path={output_dir} FILE`",
+            "\n",
+            "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
+        )
+
+
+click_app = typer.main.get_command(app)
+
+if __name__ == "__main__":
+    app()
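Together with the docling-tools entry point registered at the bottom of this diff, this file provides the `docling-tools models download` command for prefetching model artifacts. Once downloaded, the pipeline can run offline by pointing artifacts_path at that directory; a minimal sketch, where the directory is only an example (it mirrors the default settings.cache_dir / "models"):

from pathlib import Path

from docling.datamodel.pipeline_options import PdfPipelineOptions

# Directory previously populated by `docling-tools models download -o <dir>`
artifacts_path = Path.home() / ".cache" / "docling" / "models"

# Pass these options through PdfFormatOption to a DocumentConverter, as in the
# earlier sketch, to avoid any model downloads at conversion time.
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)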
docling/cli/tools.py ADDED
@@ -0,0 +1,17 @@
+import typer
+
+from docling.cli.models import app as models_app
+
+app = typer.Typer(
+    name="Docling helpers",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+
+app.add_typer(models_app, name="models")
+
+click_app = typer.main.get_command(app)
+
+if __name__ == "__main__":
+    app()
docling/datamodel/settings.py CHANGED
@@ -61,5 +61,7 @@ class AppSettings(BaseSettings):
     perf: BatchConcurrencySettings
     debug: DebugSettings

+    cache_dir: Path = Path.home() / ".cache" / "docling"
+

 settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
docling/models/base_model.py CHANGED
@@ -6,6 +6,7 @@ from typing_extensions import TypeVar

 from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings


 class BasePageModel(ABC):
@@ -21,6 +22,8 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)

 class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):

+    elements_batch_size: int = settings.perf.elements_batch_size
+
     @abstractmethod
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         pass
docling/models/code_formula_model.py CHANGED
@@ -2,6 +2,7 @@ import re
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union

+import numpy as np
 from docling_core.types.doc import (
     CodeItem,
     DocItemLabel,
@@ -61,13 +62,15 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         Processes the given batch of elements and enriches them with predictions.
     """

+    _model_repo_folder = "ds4sd--CodeFormula"
+    elements_batch_size = 5
     images_scale = 1.66  # = 120 dpi, aligned with training data resolution
     expansion_factor = 0.03

     def __init__(
         self,
         enabled: bool,
-        artifacts_path: Optional[Union[Path, str]],
+        artifacts_path: Optional[Path],
         options: CodeFormulaModelOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -96,29 +99,32 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
             )

             if artifacts_path is None:
-                artifacts_path = self.download_models_hf()
+                artifacts_path = self.download_models()
             else:
-                artifacts_path = Path(artifacts_path)
+                artifacts_path = artifacts_path / self._model_repo_folder

             self.code_formula_model = CodeFormulaPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                 device=device,
                 num_threads=accelerator_options.num_threads,
             )

     @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
+    def download_models(
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
     ) -> Path:
         from huggingface_hub import snapshot_download
         from huggingface_hub.utils import disable_progress_bars

-        disable_progress_bars()
+        if not progress:
+            disable_progress_bars()
         download_path = snapshot_download(
             repo_id="ds4sd/CodeFormula",
             force_download=force,
             local_dir=local_dir,
-            revision="v1.0.0",
+            revision="v1.0.1",
         )

         return Path(download_path)
@@ -226,7 +232,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
             return

         labels: List[str] = []
-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[TextItem] = []
         for el in element_batch:
             assert isinstance(el.item, TextItem)
docling/models/document_picture_classifier.py CHANGED
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union

+import numpy as np
 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
@@ -55,12 +56,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         Processes a batch of elements and adds classification annotations.
     """

+    _model_repo_folder = "ds4sd--DocumentFigureClassifier"
     images_scale = 2

     def __init__(
         self,
         enabled: bool,
-        artifacts_path: Optional[Union[Path, str]],
+        artifacts_path: Optional[Path],
         options: DocumentPictureClassifierOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -88,24 +90,25 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
             )

             if artifacts_path is None:
-                artifacts_path = self.download_models_hf()
+                artifacts_path = self.download_models()
             else:
-                artifacts_path = Path(artifacts_path)
+                artifacts_path = artifacts_path / self._model_repo_folder

             self.document_picture_classifier = DocumentFigureClassifierPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                 device=device,
                 num_threads=accelerator_options.num_threads,
             )

     @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
+    def download_models(
+        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
     ) -> Path:
         from huggingface_hub import snapshot_download
         from huggingface_hub.utils import disable_progress_bars

-        disable_progress_bars()
+        if not progress:
+            disable_progress_bars()
         download_path = snapshot_download(
             repo_id="ds4sd/DocumentFigureClassifier",
             force_download=force,
@@ -159,7 +162,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                 yield element
             return

-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[PictureItem] = []
         for el in element_batch:
             assert isinstance(el, PictureItem)
docling/models/easyocr_model.py CHANGED
@@ -1,7 +1,10 @@
 import logging
 import warnings
-from typing import Iterable
+import zipfile
+from pathlib import Path
+from typing import Iterable, List, Optional

+import httpx
 import numpy
 import torch
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -17,14 +20,18 @@ from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
+from docling.utils.utils import download_url_with_progress

 _log = logging.getLogger(__name__)


 class EasyOcrModel(BaseOcrModel):
+    _model_repo_folder = "EasyOcr"
+
     def __init__(
         self,
         enabled: bool,
+        artifacts_path: Optional[Path],
         options: EasyOcrOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -62,15 +69,55 @@ class EasyOcrModel(BaseOcrModel):
             )
             use_gpu = self.options.use_gpu

+            download_enabled = self.options.download_enabled
+            model_storage_directory = self.options.model_storage_directory
+            if artifacts_path is not None and model_storage_directory is None:
+                download_enabled = False
+                model_storage_directory = str(artifacts_path / self._model_repo_folder)
+
             self.reader = easyocr.Reader(
                 lang_list=self.options.lang,
                 gpu=use_gpu,
-                model_storage_directory=self.options.model_storage_directory,
+                model_storage_directory=model_storage_directory,
                 recog_network=self.options.recog_network,
-                download_enabled=self.options.download_enabled,
+                download_enabled=download_enabled,
                 verbose=False,
             )

+    @staticmethod
+    def download_models(
+        detection_models: List[str] = ["craft"],
+        recognition_models: List[str] = ["english_g2", "latin_g2"],
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
+        from easyocr.config import detection_models as det_models_dict
+        from easyocr.config import recognition_models as rec_models_dict
+
+        if local_dir is None:
+            local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
+
+        local_dir.mkdir(parents=True, exist_ok=True)
+
+        # Collect models to download
+        download_list = []
+        for model_name in detection_models:
+            if model_name in det_models_dict:
+                download_list.append(det_models_dict[model_name])
+        for model_name in recognition_models:
+            if model_name in rec_models_dict["gen2"]:
+                download_list.append(rec_models_dict["gen2"][model_name])
+
+        # Download models
+        for model_details in download_list:
+            buf = download_url_with_progress(model_details["url"], progress=progress)
+            with zipfile.ZipFile(buf, "r") as zip_ref:
+                zip_ref.extractall(local_dir)
+
+        return local_dir
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
docling/models/layout_model.py CHANGED
@@ -1,7 +1,8 @@
 import copy
 import logging
+import warnings
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Optional, Union

 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -21,6 +22,8 @@ _log = logging.getLogger(__name__)


 class LayoutModel(BasePageModel):
+    _model_repo_folder = "ds4sd--docling-models"
+    _model_path = "model_artifacts/layout"

     TEXT_ELEM_LABELS = [
         DocItemLabel.TEXT,
@@ -42,15 +45,56 @@ class LayoutModel(BasePageModel):
     FORMULA_LABEL = DocItemLabel.FORMULA
     CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]

-    def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
+    def __init__(
+        self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
+    ):
         device = decide_device(accelerator_options.device)

+        if artifacts_path is None:
+            artifacts_path = self.download_models() / self._model_path
+        else:
+            # will become the default in the future
+            if (artifacts_path / self._model_repo_folder).exists():
+                artifacts_path = (
+                    artifacts_path / self._model_repo_folder / self._model_path
+                )
+            elif (artifacts_path / self._model_path).exists():
+                warnings.warn(
+                    "The usage of artifacts_path containing directly "
+                    f"{self._model_path} is deprecated. Please point "
+                    "the artifacts_path to the parent containing "
+                    f"the {self._model_repo_folder} folder.",
+                    DeprecationWarning,
+                    stacklevel=3,
+                )
+                artifacts_path = artifacts_path / self._model_path
+
         self.layout_predictor = LayoutPredictor(
             artifact_path=str(artifacts_path),
             device=device,
             num_threads=accelerator_options.num_threads,
         )

+    @staticmethod
+    def download_models(
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id="ds4sd/docling-models",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v2.1.0",
+        )
+
+        return Path(download_path)
+
     def draw_clusters_and_cells_side_by_side(
         self, conv_res, page, clusters, mode_prefix: str, show: bool = False
     ):
@@ -106,10 +150,12 @@ class LayoutModel(BasePageModel):
             else:
                 with TimeRecorder(conv_res, "layout"):
                     assert page.size is not None
+                    page_image = page.get_image(scale=1.0)
+                    assert page_image is not None

                     clusters = []
                     for ix, pred_item in enumerate(
-                        self.layout_predictor.predict(page.get_image(scale=1.0))
+                        self.layout_predictor.predict(page_image)
                     ):
                         label = DocItemLabel(
                             pred_item["label"]
docling/models/table_structure_model.py CHANGED
@@ -1,6 +1,7 @@
 import copy
+import warnings
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Optional, Union

 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -22,10 +23,13 @@ from docling.utils.profiling import TimeRecorder


 class TableStructureModel(BasePageModel):
+    _model_repo_folder = "ds4sd--docling-models"
+    _model_path = "model_artifacts/tableformer"
+
     def __init__(
         self,
         enabled: bool,
-        artifacts_path: Path,
+        artifacts_path: Optional[Path],
         options: TableStructureOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -35,6 +39,26 @@ class TableStructureModel(BasePageModel):

         self.enabled = enabled
         if self.enabled:
+
+            if artifacts_path is None:
+                artifacts_path = self.download_models() / self._model_path
+            else:
+                # will become the default in the future
+                if (artifacts_path / self._model_repo_folder).exists():
+                    artifacts_path = (
+                        artifacts_path / self._model_repo_folder / self._model_path
+                    )
+                elif (artifacts_path / self._model_path).exists():
+                    warnings.warn(
+                        "The usage of artifacts_path containing directly "
+                        f"{self._model_path} is deprecated. Please point "
+                        "the artifacts_path to the parent containing "
+                        f"the {self._model_repo_folder} folder.",
+                        DeprecationWarning,
+                        stacklevel=3,
+                    )
+                    artifacts_path = artifacts_path / self._model_path
+
             if self.mode == TableFormerMode.ACCURATE:
                 artifacts_path = artifacts_path / "accurate"
             else:
@@ -58,6 +82,24 @@ class TableStructureModel(BasePageModel):
             )
             self.scale = 2.0  # Scale up table input images to 144 dpi

+    @staticmethod
+    def download_models(
+        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id="ds4sd/docling-models",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v2.1.0",
+        )
+
+        return Path(download_path)
+
     def draw_table_and_cells(
         self,
         conv_res: ConversionResult,
docling/pipeline/base_pipeline.py CHANGED
@@ -79,7 +79,7 @@ class BasePipeline(ABC):
         for model in self.enrichment_pipe:
             for element_batch in chunkify(
                 _prepare_elements(conv_res, model),
-                settings.perf.elements_batch_size,
+                model.elements_batch_size,
             ):
                 for element in model(
                     doc=conv_res.document, element_batch=element_batch
docling/pipeline/standard_pdf_pipeline.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 import sys
+import warnings
 from pathlib import Path
 from typing import Optional

@@ -17,6 +18,7 @@ from docling.datamodel.pipeline_options import (
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
 from docling.models.document_picture_classifier import (
@@ -37,23 +39,23 @@ from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
+from docling.utils.model_downloader import download_models
 from docling.utils.profiling import ProfilingScope, TimeRecorder

 _log = logging.getLogger(__name__)


 class StandardPdfPipeline(PaginatedPipeline):
-    _layout_model_path = "model_artifacts/layout"
-    _table_model_path = "model_artifacts/tableformer"
+    _layout_model_path = LayoutModel._model_path
+    _table_model_path = TableStructureModel._model_path

     def __init__(self, pipeline_options: PdfPipelineOptions):
         super().__init__(pipeline_options)
         self.pipeline_options: PdfPipelineOptions

-        if pipeline_options.artifacts_path is None:
-            self.artifacts_path = self.download_models_hf()
-        else:
-            self.artifacts_path = Path(pipeline_options.artifacts_path)
+        artifacts_path: Optional[Path] = None
+        if pipeline_options.artifacts_path is not None:
+            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()

         self.keep_images = (
             self.pipeline_options.generate_page_images
@@ -63,7 +65,7 @@ class StandardPdfPipeline(PaginatedPipeline):

         self.glm_model = GlmModel(options=GlmOptions())

-        if (ocr_model := self.get_ocr_model()) is None:
+        if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
             raise RuntimeError(
                 f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
             )
@@ -79,15 +81,13 @@ class StandardPdfPipeline(PaginatedPipeline):
             ocr_model,
             # Layout model
             LayoutModel(
-                artifacts_path=self.artifacts_path
-                / StandardPdfPipeline._layout_model_path,
+                artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
             ),
             # Table structure model
             TableStructureModel(
                 enabled=pipeline_options.do_table_structure,
-                artifacts_path=self.artifacts_path
-                / StandardPdfPipeline._table_model_path,
+                artifacts_path=artifacts_path,
                 options=pipeline_options.table_structure_options,
                 accelerator_options=pipeline_options.accelerator_options,
             ),
@@ -101,7 +101,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             CodeFormulaModel(
                 enabled=pipeline_options.do_code_enrichment
                 or pipeline_options.do_formula_enrichment,
-                artifacts_path=pipeline_options.artifacts_path,
+                artifacts_path=artifacts_path,
                 options=CodeFormulaModelOptions(
                     do_code_enrichment=pipeline_options.do_code_enrichment,
                     do_formula_enrichment=pipeline_options.do_formula_enrichment,
@@ -111,7 +111,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             # Document Picture Classifier
             DocumentPictureClassifier(
                 enabled=pipeline_options.do_picture_classification,
-                artifacts_path=pipeline_options.artifacts_path,
+                artifacts_path=artifacts_path,
                 options=DocumentPictureClassifierOptions(),
                 accelerator_options=pipeline_options.accelerator_options,
             ),
@@ -127,23 +127,24 @@ class StandardPdfPipeline(PaginatedPipeline):
     def download_models_hf(
         local_dir: Optional[Path] = None, force: bool = False
     ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        disable_progress_bars()
-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
-            revision="v2.1.0",
+        warnings.warn(
+            "The usage of StandardPdfPipeline.download_models_hf() is deprecated "
+            "use instead the utility `docling-tools models download`, or "
+            "the upstream method docling.utils.models_downloader.download_all()",
+            DeprecationWarning,
+            stacklevel=3,
         )

-        return Path(download_path)
+        output_dir = download_models(output_dir=local_dir, force=force, progress=False)
+        return output_dir

-    def get_ocr_model(self) -> Optional[BaseOcrModel]:
+    def get_ocr_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[BaseOcrModel]:
         if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
             return EasyOcrModel(
                 enabled=self.pipeline_options.do_ocr,
+                artifacts_path=artifacts_path,
                 options=self.pipeline_options.ocr_options,
                 accelerator_options=self.pipeline_options.accelerator_options,
             )
docling/utils/model_downloader.py ADDED
@@ -0,0 +1,72 @@
+import logging
+from pathlib import Path
+from typing import Optional
+
+from docling.datamodel.settings import settings
+from docling.models.code_formula_model import CodeFormulaModel
+from docling.models.document_picture_classifier import DocumentPictureClassifier
+from docling.models.easyocr_model import EasyOcrModel
+from docling.models.layout_model import LayoutModel
+from docling.models.table_structure_model import TableStructureModel
+
+_log = logging.getLogger(__name__)
+
+
+def download_models(
+    output_dir: Optional[Path] = None,
+    *,
+    force: bool = False,
+    progress: bool = False,
+    with_layout: bool = True,
+    with_tableformer: bool = True,
+    with_code_formula: bool = True,
+    with_picture_classifier: bool = True,
+    with_easyocr: bool = True,
+):
+    if output_dir is None:
+        output_dir = settings.cache_dir / "models"
+
+    # Make sure the folder exists
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    if with_layout:
+        _log.info(f"Downloading layout model...")
+        LayoutModel.download_models(
+            local_dir=output_dir / LayoutModel._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+
+    if with_tableformer:
+        _log.info(f"Downloading tableformer model...")
+        TableStructureModel.download_models(
+            local_dir=output_dir / TableStructureModel._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+
+    if with_picture_classifier:
+        _log.info(f"Downloading picture classifier model...")
+        DocumentPictureClassifier.download_models(
+            local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+
+    if with_code_formula:
+        _log.info(f"Downloading code formula model...")
+        CodeFormulaModel.download_models(
+            local_dir=output_dir / CodeFormulaModel._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+
+    if with_easyocr:
+        _log.info(f"Downloading easyocr models...")
+        EasyOcrModel.download_models(
+            local_dir=output_dir / EasyOcrModel._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+
+    return output_dir
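This helper is what both `docling-tools models download` and the deprecated StandardPdfPipeline.download_models_hf() delegate to, and it can be called directly. A minimal sketch, where the output folder is only an example:

from pathlib import Path

from docling.utils.model_downloader import download_models

# Prefetch all models except the EasyOCR weights into a custom folder, with progress bars
models_dir = download_models(
    output_dir=Path("./docling-artifacts"),
    progress=True,
    with_easyocr=False,
)
print(f"Models stored in {models_dir}")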
docling/utils/utils.py CHANGED
@@ -4,6 +4,9 @@ from itertools import islice
 from pathlib import Path
 from typing import List, Union

+import requests
+from tqdm import tqdm
+

 def chunkify(iterator, chunk_size):
     """Yield successive chunks of chunk_size from the iterable."""
@@ -39,3 +42,24 @@ def create_hash(string: str):
     hasher.update(string.encode("utf-8"))

     return hasher.hexdigest()
+
+
+def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
+    buf = BytesIO()
+    with requests.get(url, stream=True, allow_redirects=True) as response:
+        total_size = int(response.headers.get("content-length", 0))
+        progress_bar = tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            disable=(not progress),
+        )
+
+        for chunk in response.iter_content(10 * 1024):
+            buf.write(chunk)
+            progress_bar.update(len(chunk))
+        progress_bar.close()
+
+    buf.seek(0)
+    return buf
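The new helper streams a URL into an in-memory buffer with an optional tqdm progress bar; EasyOcrModel.download_models() above feeds the returned buffer straight into zipfile. A minimal sketch, with a placeholder URL and output file name:

from io import BytesIO
from pathlib import Path

from docling.utils.utils import download_url_with_progress

# Download into memory, then persist to disk (URL and file name are examples only)
buf: BytesIO = download_url_with_progress("https://example.com/model.zip", progress=True)
Path("model.zip").write_bytes(buf.getvalue())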
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.18.0
+Version: 2.19.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -24,12 +24,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: ocrmac
 Provides-Extra: rapidocr
 Provides-Extra: tesserocr
-Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
+Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.17.0,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.17.2,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
-Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
+Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -52,6 +52,7 @@ Requires-Dist: rtree (>=1.3.0,<2.0.0)
 Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
 Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
+Requires-Dist: tqdm (>=4.65.0,<5.0.0)
 Requires-Dist: typer (>=0.12.5,<0.13.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
 Description-Content-Type: text/markdown
@@ -7,10 +7,10 @@ docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAk
 docling/backend/html_backend.py,sha256=YTPLZiEEEuGaP6G62skK3wXJ0KftuqBCl8erNXeJyoE,15893
 docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
-docling/backend/md_backend.py,sha256=d7XAFHzFO9qhrCJA3raWEmZ8WXSYyy3KOE57oMeqKGc,16502
+docling/backend/md_backend.py,sha256=NaVfcnEH-5bwVovjn76EobF6B6Wm8AhaTZ4E8k0TUPo,16826
 docling/backend/msexcel_backend.py,sha256=lyJc4ShJGAN2ZfNTTuhdYTF-44cZsGyn_8Djstp3IEU,12700
 docling/backend/mspowerpoint_backend.py,sha256=esAyaaQe17BQFweGAGJHvImKETefY0BpvfpUSECC49w,16424
-docling/backend/msword_backend.py,sha256=0iR1l3eLplPv3CPT7iGwQb50LIVf3C32KZFzwAkARrc,20573
+docling/backend/msword_backend.py,sha256=V4miLIcOH8DDlSCm25F_DALBW60Uf9JoSS0TB4yrQBw,20591
 docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
 docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
 docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,45 +18,48 @@ docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-
 docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=K5C2yQIoM40_W3YU8a7SmneY-hWbNp_JOFPLk0NPcDI,16098
+docling/cli/main.py,sha256=qShZI1f7WWn5T16YtFTeYY1CUucNjyGefIekCWvkAqc,16366
+docling/cli/models.py,sha256=cjP13QZfgHZWPVJw3kJvSszJdDrRVWx-sJipZRfHEuQ,3102
+docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/base_models.py,sha256=vewP1X99qfAwiUsiC2m8CBDGiQPsGyp_WkKJHYPoYn4,7026
 docling/datamodel/document.py,sha256=HkmvQKW3QSx3tAqPTnXiJbD_y1EVwR-WE3n6Gq8g1NY,13428
 docling/datamodel/pipeline_options.py,sha256=f9-VQFgOdahyclGQgH_T8ZYBopkWsF_fbWbxo39ux3g,7888
-docling/datamodel/settings.py,sha256=uN9jeXMwx--tJb-DFU7nr77g0Iou13YAVDzsymTvbHg,1759
+docling/datamodel/settings.py,sha256=pJi9OBqZQhsNi7RwJWQFRDKGhm3u679iN76psA3VtsY,1817
 docling/document_converter.py,sha256=qaldb7Thqk59RdE-RTGtj1M7l5UzaBdnxIvGoQ7lTeo,12876
 docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/base_model.py,sha256=H5X-exVaAN-XMTzxpgUc-rwH-D8Uk7-VuZtq2soNGXI,2567
+docling/models/base_model.py,sha256=IIf_PA933bdwHst3g_MOC4oiYQcSCIVOnxnCnN1NxEQ,2681
 docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
-docling/models/code_formula_model.py,sha256=bOIKJvckZ0QpnDZ-CDiYv-CvuGvaGzJgp2PiYAidKBQ,8422
-docling/models/document_picture_classifier.py,sha256=RLB80ueqWZ86hdXtTKmSynCU13nT-T10vUp2sky9110,6078
+docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
+docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
 docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
-docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
-docling/models/layout_model.py,sha256=3Fw7OM6g0j7NgItKsQOgFOCd1q6lp1DacN_db7f6QCw,6090
+docling/models/easyocr_model.py,sha256=dDy0iaR4KUrq7eFIQclMqUYap1B06PG4nC6RMlGYhSw,6886
+docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
 docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
 docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
 docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
 docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
-docling/models/table_structure_model.py,sha256=qZgoBrBh7H-RJGCTtaRGcj79g2WzZiUBTPnHqJZ-bLA,9557
+docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
 docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
 docling/models/tesseract_ocr_model.py,sha256=BN85u-4a-xzUY7Iw21Ib8L8kx4mgbDGiUtxBelLiJm8,8513
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/pipeline/base_pipeline.py,sha256=lK8PQiydWJ9M16kIVL7U1A2iryTRFrN5WSucVo2ohFQ,8757
+docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
 docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
-docling/pipeline/standard_pdf_pipeline.py,sha256=Qefg1JSiFwipypi8TZPJ50WgXTLjwkC0wvYAl02RM2o,10480
+docling/pipeline/standard_pdf_pipeline.py,sha256=xVGLYmh677hKBSRCoHYAVn7drmowba2QGI8f-eEC5gs,10624
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
 docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
 docling/utils/glm_utils.py,sha256=uyCoFTX9FbS1Ke0aSlkdzGLUt08dZfkgriWadkyLiiA,11856
 docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
+docling/utils/model_downloader.py,sha256=5jChSE88byGj7LvGNnB01qBw6n9ODJjnAS66PobRSJc,2267
 docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
-docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
+docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
 docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
-docling-2.18.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.18.0.dist-info/METADATA,sha256=rBP1Z7m0HMpC-HjR360i2JNuIA9lqknRPjUab1mtVic,8403
-docling-2.18.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-docling-2.18.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.18.0.dist-info/RECORD,,
+docling-2.19.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.19.0.dist-info/METADATA,sha256=deXdwXb0i_n3pyEDbVGNQNw4APYoUVtXnkHmC-frXWI,8442
+docling-2.19.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+docling-2.19.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
+docling-2.19.0.dist-info/RECORD,,
@@ -1,3 +1,4 @@
 [console_scripts]
 docling=docling.cli.main:app
+docling-tools=docling.cli.tools:app
