docling 2.31.0__py3-none-any.whl → 2.31.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -287,7 +287,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
287
287
 
288
288
  # ========= Section headers
289
289
  def _is_section_header(self, line):
290
- return re.match(r"^==+", line)
290
+ return re.match(r"^==+\s+", line)
291
291
 
292
292
  def _parse_section_header(self, line):
293
293
  match = re.match(r"^(=+)\s+(.*)", line)
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import traceback
2
3
  from io import BytesIO
3
4
  from pathlib import Path
4
5
  from typing import Final, Optional, Union, cast
@@ -137,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
137
138
  self.analyze_tag(cast(Tag, element), doc)
138
139
  except Exception as exc_child:
139
140
  _log.error(
140
- f"Error processing child from tag {tag.name}: {exc_child!r}"
141
+ f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
141
142
  )
142
143
  raise exc_child
143
144
  elif isinstance(element, NavigableString) and not isinstance(
@@ -390,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
390
391
  _log.debug(f"list-item has no text: {element}")
391
392
 
392
393
  @staticmethod
393
- def parse_table_data(element: Tag) -> Optional[TableData]:
394
+ def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
394
395
  nested_tables = element.find("table")
395
396
  if nested_tables is not None:
396
397
  _log.debug("Skipping nested table.")
397
398
  return None
398
399
 
399
- # Count the number of rows (number of <tr> elements)
400
- num_rows = len(element("tr"))
401
-
402
- # Find the number of columns (taking into account colspan)
400
+ # Find the number of rows and columns (taking into account spans)
401
+ num_rows = 0
403
402
  num_cols = 0
404
403
  for row in element("tr"):
405
404
  col_count = 0
405
+ is_row_header = True
406
406
  if not isinstance(row, Tag):
407
407
  continue
408
408
  for cell in row(["td", "th"]):
409
409
  if not isinstance(row, Tag):
410
410
  continue
411
- val = cast(Tag, cell).get("colspan", "1")
411
+ cell_tag = cast(Tag, cell)
412
+ val = cell_tag.get("colspan", "1")
412
413
  colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
413
414
  col_count += colspan
415
+ if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
416
+ is_row_header = False
414
417
  num_cols = max(num_cols, col_count)
418
+ if not is_row_header:
419
+ num_rows += 1
420
+
421
+ _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
415
422
 
416
423
  grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
417
424
 
418
425
  data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
419
426
 
420
427
  # Iterate over the rows in the table
421
- for row_idx, row in enumerate(element("tr")):
428
+ start_row_span = 0
429
+ row_idx = -1
430
+ for row in element("tr"):
422
431
  if not isinstance(row, Tag):
423
432
  continue
424
433
 
425
434
  # For each row, find all the column cells (both <td> and <th>)
426
435
  cells = row(["td", "th"])
427
436
 
428
- # Check if each cell in the row is a header -> means it is a column header
437
+ # Check if cell is in a column header or row header
429
438
  col_header = True
439
+ row_header = True
430
440
  for html_cell in cells:
431
- if isinstance(html_cell, Tag) and html_cell.name == "td":
432
- col_header = False
441
+ if isinstance(html_cell, Tag):
442
+ if html_cell.name == "td":
443
+ col_header = False
444
+ row_header = False
445
+ elif html_cell.get("rowspan") is None:
446
+ row_header = False
447
+ if not row_header:
448
+ row_idx += 1
449
+ start_row_span = 0
450
+ else:
451
+ start_row_span += 1
433
452
 
434
453
  # Extract the text content of each cell
435
454
  col_idx = 0
@@ -460,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
460
479
  if isinstance(row_val, str) and row_val.isnumeric()
461
480
  else 1
462
481
  )
463
-
464
- while grid[row_idx][col_idx] is not None:
482
+ if row_header:
483
+ row_span -= 1
484
+ while (
485
+ col_idx < num_cols
486
+ and grid[row_idx + start_row_span][col_idx] is not None
487
+ ):
465
488
  col_idx += 1
466
- for r in range(row_span):
489
+ for r in range(start_row_span, start_row_span + row_span):
467
490
  for c in range(col_span):
468
- grid[row_idx + r][col_idx + c] = text
491
+ if row_idx + r < num_rows and col_idx + c < num_cols:
492
+ grid[row_idx + r][col_idx + c] = text
469
493
 
470
494
  table_cell = TableCell(
471
495
  text=text,
472
496
  row_span=row_span,
473
497
  col_span=col_span,
474
- start_row_offset_idx=row_idx,
475
- end_row_offset_idx=row_idx + row_span,
498
+ start_row_offset_idx=start_row_span + row_idx,
499
+ end_row_offset_idx=start_row_span + row_idx + row_span,
476
500
  start_col_offset_idx=col_idx,
477
501
  end_col_offset_idx=col_idx + col_span,
478
502
  column_header=col_header,
@@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
409
409
  )
410
410
  return _txt
411
411
 
412
- # restore original HTML by removing previouly added markers
412
+ # restore original HTML by removing previously added markers
413
413
  for regex in [
414
414
  rf"<pre>\s*<code>\s*{_START_MARKER}",
415
415
  rf"{_STOP_MARKER}\s*</code>\s*</pre>",
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
436
436
 
437
437
  # Common styles for bullet and numbered lists.
438
438
  # "List Bullet", "List Number", "List Paragraph"
439
- # Identify wether list is a numbered list or not
439
+ # Identify whether list is a numbered list or not
440
440
  # is_numbered = "List Bullet" not in paragraph.style.name
441
441
  is_numbered = False
442
442
  p_style_id, p_level = self._get_label_and_level(paragraph)
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
91
91
  super().__init__(in_doc, path_or_stream)
92
92
  self.path_or_stream = path_or_stream
93
93
 
94
- # Initialize the root of the document hiearchy
94
+ # Initialize the root of the document hierarchy
95
95
  self.root: Optional[NodeItem] = None
96
96
 
97
97
  self.valid = False
@@ -1,6 +1,6 @@
1
1
  """Backend to parse patents from the United States Patent Office (USPTO).
2
2
 
3
- The parsers included in this module can handle patent grants pubished since 1976 and
3
+ The parsers included in this module can handle patent grants published since 1976 and
4
4
  patent applications since 2001.
5
5
  The original files can be found in https://bulkdata.uspto.gov.
6
6
  """
@@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
440
440
  )
441
441
 
442
442
  elif name == self.Element.PARAGRAPH.value and text:
443
- # remmove blank spaces added in paragraphs
443
+ # remove blank spaces added in paragraphs
444
444
  text = re.sub("\\s+", " ", text)
445
445
  if self.Element.ABSTRACT.value in self.property:
446
446
  self.abstract = (
@@ -1697,7 +1697,7 @@ class XmlTable:
1697
1697
  class HtmlEntity:
1698
1698
  """Provide utility functions to get the HTML entities of styled characters.
1699
1699
 
1700
- This class has been developped from:
1700
+ This class has been developed from:
1701
1701
  https://unicode-table.com/en/html-entities/
1702
1702
  https://www.w3.org/TR/WD-math-970515/table03.html
1703
1703
  """
@@ -1896,7 +1896,7 @@ class HtmlEntity:
1896
1896
  """Get an HTML entity of a greek letter in ISO 8879.
1897
1897
 
1898
1898
  Args:
1899
- The text to transform, as an ISO 8879 entitiy.
1899
+ The text to transform, as an ISO 8879 entity.
1900
1900
 
1901
1901
  Returns:
1902
1902
  The HTML entity representing a greek letter. If the input text is not
docling/cli/main.py CHANGED
@@ -521,7 +521,7 @@ def convert( # noqa: C901
521
521
  if image_export_mode != ImageRefMode.PLACEHOLDER:
522
522
  pipeline_options.generate_page_images = True
523
523
  pipeline_options.generate_picture_images = (
524
- True # FIXME: to be deprecated in verson 3
524
+ True # FIXME: to be deprecated in version 3
525
525
  )
526
526
  pipeline_options.images_scale = 2
527
527
 
docling/cli/models.py CHANGED
@@ -32,6 +32,8 @@ class _AvailableModels(str, Enum):
32
32
  CODE_FORMULA = "code_formula"
33
33
  PICTURE_CLASSIFIER = "picture_classifier"
34
34
  SMOLVLM = "smolvlm"
35
+ SMOLDOCLING = "smoldocling"
36
+ SMOLDOCLING_MLX = "smoldocling_mlx"
35
37
  GRANITE_VISION = "granite_vision"
36
38
  EASYOCR = "easyocr"
37
39
 
@@ -105,6 +107,8 @@ def download(
105
107
  with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
106
108
  with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
107
109
  with_smolvlm=_AvailableModels.SMOLVLM in to_download,
110
+ with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
111
+ with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
108
112
  with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
109
113
  with_easyocr=_AvailableModels.EASYOCR in to_download,
110
114
  )
@@ -303,6 +303,14 @@ class _DocumentConversionInput(BaseModel):
303
303
  else ""
304
304
  )
305
305
  mime = _DocumentConversionInput._mime_from_extension(ext)
306
+ if mime is not None and mime.lower() == "application/zip":
307
+ objname = obj.name.lower()
308
+ if objname.endswith(".xlsx"):
309
+ mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
310
+ elif objname.endswith(".docx"):
311
+ mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
312
+ elif objname.endswith(".pptx"):
313
+ mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
306
314
 
307
315
  mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
308
316
  mime = mime or _DocumentConversionInput._detect_csv(content)
@@ -189,7 +189,9 @@ class DocumentConverter:
189
189
  def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
190
190
  """Generate a hash of pipeline options to use as part of the cache key."""
191
191
  options_str = str(pipeline_options.model_dump())
192
- return hashlib.md5(options_str.encode("utf-8")).hexdigest()
192
+ return hashlib.md5(
193
+ options_str.encode("utf-8"), usedforsecurity=False
194
+ ).hexdigest()
193
195
 
194
196
  def initialize_pipeline(self, format: InputFormat):
195
197
  """Initialize the conversion pipeline for the selected format."""
@@ -57,7 +57,10 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
57
57
  artifacts_path,
58
58
  torch_dtype=torch.bfloat16,
59
59
  _attn_implementation=(
60
- "flash_attention_2" if self.device.startswith("cuda") else "eager"
60
+ "flash_attention_2"
61
+ if self.device.startswith("cuda")
62
+ and accelerator_options.cuda_use_flash_attention2
63
+ else "eager"
61
64
  ),
62
65
  ).to(self.device)
63
66
 
@@ -346,7 +346,7 @@ class ReadingOrderModel:
346
346
  new_item.prov.append(prov)
347
347
 
348
348
  def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
349
- with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
349
+ with TimeRecorder(conv_res, "reading_order", scope=ProfilingScope.DOCUMENT):
350
350
  page_elements = self._assembled_to_readingorder_elements(conv_res)
351
351
 
352
352
  # Apply reading order
@@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
234
234
  tcells = table_cluster.cells
235
235
  tokens = []
236
236
  for c in tcells:
237
- # Only allow non empty stings (spaces) into the cells of a table
237
+ # Only allow non empty strings (spaces) into the cells of a table
238
238
  if len(c.text.strip()) > 0:
239
239
  new_cell = copy.deepcopy(c)
240
240
  new_cell.rect = BoundingRectangle.from_bounding_box(
@@ -267,7 +267,7 @@ class TableStructureModel(BasePageModel):
267
267
  element["bbox"]["token"] = text_piece
268
268
 
269
269
  tc = TableCell.model_validate(element)
270
- if self.do_cell_matching and tc.bbox is not None:
270
+ if tc.bbox is not None:
271
271
  tc.bbox = tc.bbox.scaled(1 / self.scale)
272
272
  table_cells.append(tc)
273
273
 
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  from collections.abc import Iterable
3
5
  from pathlib import Path
@@ -38,6 +40,8 @@ class TesseractOcrModel(BaseOcrModel):
38
40
  self.options: TesseractOcrOptions
39
41
 
40
42
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
43
+ self.reader = None
44
+ self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
41
45
 
42
46
  if self.enabled:
43
47
  install_errmsg = (
@@ -84,9 +88,7 @@ class TesseractOcrModel(BaseOcrModel):
84
88
  "oem": tesserocr.OEM.DEFAULT,
85
89
  }
86
90
 
87
- self.reader = None
88
91
  self.osd_reader = None
89
- self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
90
92
 
91
93
  if self.options.path is not None:
92
94
  tesserocr_kwargs["path"] = self.options.path
@@ -151,7 +153,7 @@ class TesseractOcrModel(BaseOcrModel):
151
153
  script = map_tesseract_script(script)
152
154
  lang = f"{self.script_prefix}{script}"
153
155
 
154
- # Check if the detected languge is present in the system
156
+ # Check if the detected language is present in the system
155
157
  if lang not in self._tesserocr_languages:
156
158
  msg = f"Tesseract detected the script '{script}' and language '{lang}'."
157
159
  msg += " However this language is not installed in your system and will be ignored."
@@ -4,12 +4,15 @@ from typing import Optional
4
4
 
5
5
  from docling.datamodel.pipeline_options import (
6
6
  granite_picture_description,
7
+ smoldocling_vlm_conversion_options,
8
+ smoldocling_vlm_mlx_conversion_options,
7
9
  smolvlm_picture_description,
8
10
  )
9
11
  from docling.datamodel.settings import settings
10
12
  from docling.models.code_formula_model import CodeFormulaModel
11
13
  from docling.models.document_picture_classifier import DocumentPictureClassifier
12
14
  from docling.models.easyocr_model import EasyOcrModel
15
+ from docling.models.hf_vlm_model import HuggingFaceVlmModel
13
16
  from docling.models.layout_model import LayoutModel
14
17
  from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
15
18
  from docling.models.table_structure_model import TableStructureModel
@@ -27,6 +30,8 @@ def download_models(
27
30
  with_code_formula: bool = True,
28
31
  with_picture_classifier: bool = True,
29
32
  with_smolvlm: bool = False,
33
+ with_smoldocling: bool = False,
34
+ with_smoldocling_mlx: bool = False,
30
35
  with_granite_vision: bool = False,
31
36
  with_easyocr: bool = True,
32
37
  ):
@@ -77,6 +82,25 @@ def download_models(
77
82
  progress=progress,
78
83
  )
79
84
 
85
+ if with_smoldocling:
86
+ _log.info("Downloading SmolDocling model...")
87
+ HuggingFaceVlmModel.download_models(
88
+ repo_id=smoldocling_vlm_conversion_options.repo_id,
89
+ local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
90
+ force=force,
91
+ progress=progress,
92
+ )
93
+
94
+ if with_smoldocling_mlx:
95
+ _log.info("Downloading SmolDocling MLX model...")
96
+ HuggingFaceVlmModel.download_models(
97
+ repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
98
+ local_dir=output_dir
99
+ / smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
100
+ force=force,
101
+ progress=progress,
102
+ )
103
+
80
104
  if with_granite_vision:
81
105
  _log.info("Downloading Granite Vision model...")
82
106
  PictureDescriptionVlmModel.download_models(
docling/utils/utils.py CHANGED
@@ -20,7 +20,7 @@ def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
20
20
  """Create a stable page_hash of the path_or_stream of a file"""
21
21
 
22
22
  block_size = 65536
23
- hasher = hashlib.sha256()
23
+ hasher = hashlib.sha256(usedforsecurity=False)
24
24
 
25
25
  def _hash_buf(binary_stream):
26
26
  buf = binary_stream.read(block_size) # read and page_hash in chunks
@@ -38,7 +38,7 @@ def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
38
38
 
39
39
 
40
40
  def create_hash(string: str):
41
- hasher = hashlib.sha256()
41
+ hasher = hashlib.sha256(usedforsecurity=False)
42
42
  hasher.update(string.encode("utf-8"))
43
43
 
44
44
  return hasher.hexdigest()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.31.0
3
+ Version: 2.31.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -28,6 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
+ Requires-Dist: click (<8.2.0)
31
32
  Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
32
33
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
34
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
@@ -1,7 +1,7 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
4
- docling/backend/asciidoc_backend.py,sha256=VZ8Xk1VHGHRqBo_TdtMzRAu1NFaFaJ8dk4CaEcBaEm0,14038
4
+ docling/backend/asciidoc_backend.py,sha256=W-4MRcID6AU9Ax23q8FwDwGG-OOCrBoqcNf2Ch_WPUc,14041
5
5
  docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
6
6
  docling/backend/docling_parse_backend.py,sha256=V_CsUdN5RkGQBBq7A_ReAiUW4CQVh0-1Ur157Ozurdg,8017
7
7
  docling/backend/docling_parse_v2_backend.py,sha256=6fokgqb1hMbZua33gL46EFamrwPTC7ms6ZuEHw-Dv28,9395
@@ -10,29 +10,29 @@ docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
10
10
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
12
12
  docling/backend/docx/latex/omml.py,sha256=nEpcfyyrOucJyj6cD7wfThrIa-q0CQCoqMb3dkrhCRg,12094
13
- docling/backend/html_backend.py,sha256=TBiMAp3s_QbQTRymFA7wScXECyHn_w-Kb8MbqRibTmE,20099
13
+ docling/backend/html_backend.py,sha256=3K-l5SUAAyqISNEb7nPst_I51xzYOVOkgmwXh3lv9sw,21063
14
14
  docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
16
- docling/backend/md_backend.py,sha256=EdGBXe0n8zniO1LSF3VIjviKs1VRUujpF8aFUpJ5D1k,17209
16
+ docling/backend/md_backend.py,sha256=JkY1qTvQFXjKSZGfD-83d-fZelorUG_l6mpJdYGqvX8,17210
17
17
  docling/backend/msexcel_backend.py,sha256=3j0WQfqDpgPXdPMCguefdv7arcNVDedPD6gl54cmLn8,18110
18
18
  docling/backend/mspowerpoint_backend.py,sha256=RwqfvvzrtM56L9uf7PR9lvlHJ-LyYGpkS1iVxkTl72Q,17203
19
- docling/backend/msword_backend.py,sha256=Xdrs_k160-tDUmhcFGZ7MBbpiYkwPLT3wl3FUO2Ui1A,32476
19
+ docling/backend/msword_backend.py,sha256=lVVMNwt0WIl4RD5wAf8pc8bJsb60x1BA8hTTkVmEVa8,32477
20
20
  docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
21
21
  docling/backend/pypdfium2_backend.py,sha256=pX8f0WbUb0KTDTKyQuLzP_lgHHubyGXWD33vmpefPy8,10805
22
22
  docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- docling/backend/xml/jats_backend.py,sha256=g9YNSS8kqhVL7ceZF2jR7Aaqbh1F1Zn6jmte0HyEH20,24926
24
- docling/backend/xml/uspto_backend.py,sha256=iE1PSAgXqtHkqcsC1RUJiwuyKNlf2elucCV1V2sk0kQ,70926
23
+ docling/backend/xml/jats_backend.py,sha256=ghGi9bHjx3BvaOtmzLw86-wZy4UxpQPOPQL4e73-BI8,24927
24
+ docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
25
25
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
26
26
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- docling/cli/main.py,sha256=Z6EdrwESOKUBHF6yJlzuwnznScBtdrlbU_xB0AT9cA4,26137
28
- docling/cli/models.py,sha256=Cyv7d_c8J62luGWsYvbcC9_3UpPp_TVsFo5vJAyr4kI,3940
27
+ docling/cli/main.py,sha256=D7WEY4x6pQCVFRy3peK9KUDOb0Y5IVc-vTDqPnHPK00,26138
28
+ docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
29
29
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
30
30
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
31
  docling/datamodel/base_models.py,sha256=DRE_XoldtCreWF4ucO0iK0l8uOnfvnhQaYjV0z1Qe0M,7921
32
- docling/datamodel/document.py,sha256=02QybqtnQ0genFU7UF9pVL3fIwguu9br0JbdtcUvu4o,14998
32
+ docling/datamodel/document.py,sha256=_0Z4zUgCB5677ZW8Y7C1fv75enLZJOJUjcUkGTSiTBA,15553
33
33
  docling/datamodel/pipeline_options.py,sha256=-1QG8dY0RZkTJb66lXErEAnPq4F_1vgnk_5AcIr3cgU,13350
34
34
  docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
35
- docling/document_converter.py,sha256=zSaGp2zx73kiE1KHmEHwnG-wxJvcMiyyn2fCAM2vdYk,13804
35
+ docling/document_converter.py,sha256=PRRr65nigQ3LZDl4G2fBMkOtJyswT7xyGt7fpUeDO3w,13849
36
36
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
37
37
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
38
  docling/models/api_vlm_model.py,sha256=w1SzdG3Ypz_0iZGiX-skMwV1E1JnOHH2BJiNkcEEIAA,2478
@@ -53,14 +53,14 @@ docling/models/page_assemble_model.py,sha256=GO7JI1D6T6EkSW94cLQobPGNQUahkxQqTPR
53
53
  docling/models/page_preprocessing_model.py,sha256=6pOGXiFQ-oz06UmJdcaYMdVyfZ0YVLWS6efGcx7Mxws,3105
54
54
  docling/models/picture_description_api_model.py,sha256=qs3n0smC9DXhzwJeK_iQG08Y6ZFHInKtdGPVhzgvxgU,2091
55
55
  docling/models/picture_description_base_model.py,sha256=FbBVXzAOB87xpJN28tuGCxoAdcf6mZNUOqJR7ljUg5g,2946
56
- docling/models/picture_description_vlm_model.py,sha256=nS68qbJQCP94-gbgFfAzMLaEC-wquSLEwuDix287c9c,4067
56
+ docling/models/picture_description_vlm_model.py,sha256=DiTjnehVy1n0N04xPUvZl8rx4TiNHzHn9Cnzy_ePGts,4177
57
57
  docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurHCZjp4,858
59
59
  docling/models/rapid_ocr_model.py,sha256=Tq_1Egu5Hjx7Y69Vox17QTtRXztSyflB1fhN08CWQwY,5894
60
- docling/models/readingorder_model.py,sha256=BxACJ-aIl2aUlyLcyl-uDtuSZH_mCLJgbkDG4Sx_www,14564
61
- docling/models/table_structure_model.py,sha256=dR3JkiPkdbScaNy6dia4_ZXPYESSiMDZztD-lLHE1uY,12591
60
+ docling/models/readingorder_model.py,sha256=S9ru2ApY9sE-Uue3hptWHmbmElwo36bUbAikxCFpHYs,14574
61
+ docling/models/table_structure_model.py,sha256=1gxLaooK0IKMrnmS8nT1BItKqt1GAKghfpmLKb3i53g,12566
62
62
  docling/models/tesseract_ocr_cli_model.py,sha256=iFdOud5ymoW9WV8bWLCDpd3LJBo9M5bTT5vc635zEDY,10229
63
- docling/models/tesseract_ocr_model.py,sha256=oPKOoTTcpYUTDNRteBG-MFcxB9SDC6dk4HuKjIODwMk,9310
63
+ docling/models/tesseract_ocr_model.py,sha256=72009TJL_7tXTEnhlsGRiw_KibrQ0LjZlCBtW8NtwUc,9339
64
64
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
65
  docling/pipeline/base_pipeline.py,sha256=DnuxAf7EQusdSRae0QUVth-0f2mSff8JZjX-2vazk00,8751
66
66
  docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
@@ -74,13 +74,13 @@ docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
74
74
  docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
75
75
  docling/utils/layout_postprocessor.py,sha256=x7exVG3HYzV9M_O78FfyoG43Y2L7PPMMydvSNwjqh8s,24528
76
76
  docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
77
- docling/utils/model_downloader.py,sha256=AMqfHTmZzzsPrlcHFdX7hhW-a3Ki6ndjnTjQQYrDSxU,3206
77
+ docling/utils/model_downloader.py,sha256=ocvud3G3qlBQhzMo69Q3RJMnvq5HPZ2DwNbMuEp8RCs,4142
78
78
  docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
79
79
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
80
- docling/utils/utils.py,sha256=xxmVMhQWr7oVBy93IGGpr2x8FBVRHNDwD31kwAF5xK4,1866
80
+ docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
81
81
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
82
- docling-2.31.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
83
- docling-2.31.0.dist-info/METADATA,sha256=tScsMgyfrwtIaCKPl-ygViccYxnRADeUNuHKDGjw7ww,10108
84
- docling-2.31.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
85
- docling-2.31.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
86
- docling-2.31.0.dist-info/RECORD,,
82
+ docling-2.31.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
83
+ docling-2.31.2.dist-info/METADATA,sha256=V11tJajepssRJ-ltuRsNThmo9_6U6Gc28wqZlgDzdz0,10138
84
+ docling-2.31.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
85
+ docling-2.31.2.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
86
+ docling-2.31.2.dist-info/RECORD,,