docling 2.31.0__py3-none-any.whl → 2.31.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +41 -17
- docling/backend/md_backend.py +1 -1
- docling/backend/msword_backend.py +1 -1
- docling/backend/xml/jats_backend.py +1 -1
- docling/backend/xml/uspto_backend.py +4 -4
- docling/cli/main.py +1 -1
- docling/cli/models.py +4 -0
- docling/datamodel/document.py +8 -0
- docling/document_converter.py +3 -1
- docling/models/picture_description_vlm_model.py +4 -1
- docling/models/readingorder_model.py +1 -1
- docling/models/table_structure_model.py +2 -2
- docling/models/tesseract_ocr_model.py +5 -3
- docling/utils/model_downloader.py +24 -0
- docling/utils/utils.py +2 -2
- {docling-2.31.0.dist-info → docling-2.31.1.dist-info}/METADATA +1 -1
- {docling-2.31.0.dist-info → docling-2.31.1.dist-info}/RECORD +20 -20
- {docling-2.31.0.dist-info → docling-2.31.1.dist-info}/LICENSE +0 -0
- {docling-2.31.0.dist-info → docling-2.31.1.dist-info}/WHEEL +0 -0
- {docling-2.31.0.dist-info → docling-2.31.1.dist-info}/entry_points.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
import traceback
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Final, Optional, Union, cast
|
@@ -137,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
137
138
|
self.analyze_tag(cast(Tag, element), doc)
|
138
139
|
except Exception as exc_child:
|
139
140
|
_log.error(
|
140
|
-
f"Error processing child from tag {tag.name}
|
141
|
+
f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
|
141
142
|
)
|
142
143
|
raise exc_child
|
143
144
|
elif isinstance(element, NavigableString) and not isinstance(
|
@@ -390,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
390
391
|
_log.debug(f"list-item has no text: {element}")
|
391
392
|
|
392
393
|
@staticmethod
|
393
|
-
def parse_table_data(element: Tag) -> Optional[TableData]:
|
394
|
+
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
|
394
395
|
nested_tables = element.find("table")
|
395
396
|
if nested_tables is not None:
|
396
397
|
_log.debug("Skipping nested table.")
|
397
398
|
return None
|
398
399
|
|
399
|
-
#
|
400
|
-
num_rows =
|
401
|
-
|
402
|
-
# Find the number of columns (taking into account colspan)
|
400
|
+
# Find the number of rows and columns (taking into account spans)
|
401
|
+
num_rows = 0
|
403
402
|
num_cols = 0
|
404
403
|
for row in element("tr"):
|
405
404
|
col_count = 0
|
405
|
+
is_row_header = True
|
406
406
|
if not isinstance(row, Tag):
|
407
407
|
continue
|
408
408
|
for cell in row(["td", "th"]):
|
409
409
|
if not isinstance(row, Tag):
|
410
410
|
continue
|
411
|
-
|
411
|
+
cell_tag = cast(Tag, cell)
|
412
|
+
val = cell_tag.get("colspan", "1")
|
412
413
|
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
|
413
414
|
col_count += colspan
|
415
|
+
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
|
416
|
+
is_row_header = False
|
414
417
|
num_cols = max(num_cols, col_count)
|
418
|
+
if not is_row_header:
|
419
|
+
num_rows += 1
|
420
|
+
|
421
|
+
_log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
|
415
422
|
|
416
423
|
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
417
424
|
|
418
425
|
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
419
426
|
|
420
427
|
# Iterate over the rows in the table
|
421
|
-
|
428
|
+
start_row_span = 0
|
429
|
+
row_idx = -1
|
430
|
+
for row in element("tr"):
|
422
431
|
if not isinstance(row, Tag):
|
423
432
|
continue
|
424
433
|
|
425
434
|
# For each row, find all the column cells (both <td> and <th>)
|
426
435
|
cells = row(["td", "th"])
|
427
436
|
|
428
|
-
# Check if
|
437
|
+
# Check if cell is in a column header or row header
|
429
438
|
col_header = True
|
439
|
+
row_header = True
|
430
440
|
for html_cell in cells:
|
431
|
-
if isinstance(html_cell, Tag)
|
432
|
-
|
441
|
+
if isinstance(html_cell, Tag):
|
442
|
+
if html_cell.name == "td":
|
443
|
+
col_header = False
|
444
|
+
row_header = False
|
445
|
+
elif html_cell.get("rowspan") is None:
|
446
|
+
row_header = False
|
447
|
+
if not row_header:
|
448
|
+
row_idx += 1
|
449
|
+
start_row_span = 0
|
450
|
+
else:
|
451
|
+
start_row_span += 1
|
433
452
|
|
434
453
|
# Extract the text content of each cell
|
435
454
|
col_idx = 0
|
@@ -460,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
460
479
|
if isinstance(row_val, str) and row_val.isnumeric()
|
461
480
|
else 1
|
462
481
|
)
|
463
|
-
|
464
|
-
|
482
|
+
if row_header:
|
483
|
+
row_span -= 1
|
484
|
+
while (
|
485
|
+
col_idx < num_cols
|
486
|
+
and grid[row_idx + start_row_span][col_idx] is not None
|
487
|
+
):
|
465
488
|
col_idx += 1
|
466
|
-
for r in range(row_span):
|
489
|
+
for r in range(start_row_span, start_row_span + row_span):
|
467
490
|
for c in range(col_span):
|
468
|
-
|
491
|
+
if row_idx + r < num_rows and col_idx + c < num_cols:
|
492
|
+
grid[row_idx + r][col_idx + c] = text
|
469
493
|
|
470
494
|
table_cell = TableCell(
|
471
495
|
text=text,
|
472
496
|
row_span=row_span,
|
473
497
|
col_span=col_span,
|
474
|
-
start_row_offset_idx=row_idx,
|
475
|
-
end_row_offset_idx=row_idx + row_span,
|
498
|
+
start_row_offset_idx=start_row_span + row_idx,
|
499
|
+
end_row_offset_idx=start_row_span + row_idx + row_span,
|
476
500
|
start_col_offset_idx=col_idx,
|
477
501
|
end_col_offset_idx=col_idx + col_span,
|
478
502
|
column_header=col_header,
|
docling/backend/md_backend.py
CHANGED
@@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
409
409
|
)
|
410
410
|
return _txt
|
411
411
|
|
412
|
-
# restore original HTML by removing
|
412
|
+
# restore original HTML by removing previously added markers
|
413
413
|
for regex in [
|
414
414
|
rf"<pre>\s*<code>\s*{_START_MARKER}",
|
415
415
|
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
|
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
436
436
|
|
437
437
|
# Common styles for bullet and numbered lists.
|
438
438
|
# "List Bullet", "List Number", "List Paragraph"
|
439
|
-
# Identify
|
439
|
+
# Identify whether list is a numbered list or not
|
440
440
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
441
441
|
is_numbered = False
|
442
442
|
p_style_id, p_level = self._get_label_and_level(paragraph)
|
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
91
91
|
super().__init__(in_doc, path_or_stream)
|
92
92
|
self.path_or_stream = path_or_stream
|
93
93
|
|
94
|
-
# Initialize the root of the document
|
94
|
+
# Initialize the root of the document hierarchy
|
95
95
|
self.root: Optional[NodeItem] = None
|
96
96
|
|
97
97
|
self.valid = False
|
@@ -1,6 +1,6 @@
|
|
1
1
|
"""Backend to parse patents from the United States Patent Office (USPTO).
|
2
2
|
|
3
|
-
The parsers included in this module can handle patent grants
|
3
|
+
The parsers included in this module can handle patent grants published since 1976 and
|
4
4
|
patent applications since 2001.
|
5
5
|
The original files can be found in https://bulkdata.uspto.gov.
|
6
6
|
"""
|
@@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
|
|
440
440
|
)
|
441
441
|
|
442
442
|
elif name == self.Element.PARAGRAPH.value and text:
|
443
|
-
#
|
443
|
+
# remove blank spaces added in paragraphs
|
444
444
|
text = re.sub("\\s+", " ", text)
|
445
445
|
if self.Element.ABSTRACT.value in self.property:
|
446
446
|
self.abstract = (
|
@@ -1697,7 +1697,7 @@ class XmlTable:
|
|
1697
1697
|
class HtmlEntity:
|
1698
1698
|
"""Provide utility functions to get the HTML entities of styled characters.
|
1699
1699
|
|
1700
|
-
This class has been
|
1700
|
+
This class has been developed from:
|
1701
1701
|
https://unicode-table.com/en/html-entities/
|
1702
1702
|
https://www.w3.org/TR/WD-math-970515/table03.html
|
1703
1703
|
"""
|
@@ -1896,7 +1896,7 @@ class HtmlEntity:
|
|
1896
1896
|
"""Get an HTML entity of a greek letter in ISO 8879.
|
1897
1897
|
|
1898
1898
|
Args:
|
1899
|
-
The text to transform, as an ISO 8879
|
1899
|
+
The text to transform, as an ISO 8879 entity.
|
1900
1900
|
|
1901
1901
|
Returns:
|
1902
1902
|
The HTML entity representing a greek letter. If the input text is not
|
docling/cli/main.py
CHANGED
@@ -521,7 +521,7 @@ def convert( # noqa: C901
|
|
521
521
|
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
522
522
|
pipeline_options.generate_page_images = True
|
523
523
|
pipeline_options.generate_picture_images = (
|
524
|
-
True # FIXME: to be deprecated in
|
524
|
+
True # FIXME: to be deprecated in version 3
|
525
525
|
)
|
526
526
|
pipeline_options.images_scale = 2
|
527
527
|
|
docling/cli/models.py
CHANGED
@@ -32,6 +32,8 @@ class _AvailableModels(str, Enum):
|
|
32
32
|
CODE_FORMULA = "code_formula"
|
33
33
|
PICTURE_CLASSIFIER = "picture_classifier"
|
34
34
|
SMOLVLM = "smolvlm"
|
35
|
+
SMOLDOCLING = "smoldocling"
|
36
|
+
SMOLDOCLING_MLX = "smoldocling_mlx"
|
35
37
|
GRANITE_VISION = "granite_vision"
|
36
38
|
EASYOCR = "easyocr"
|
37
39
|
|
@@ -105,6 +107,8 @@ def download(
|
|
105
107
|
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
|
106
108
|
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
|
107
109
|
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
|
110
|
+
with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
|
111
|
+
with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
|
108
112
|
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
|
109
113
|
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
110
114
|
)
|
docling/datamodel/document.py
CHANGED
@@ -303,6 +303,14 @@ class _DocumentConversionInput(BaseModel):
|
|
303
303
|
else ""
|
304
304
|
)
|
305
305
|
mime = _DocumentConversionInput._mime_from_extension(ext)
|
306
|
+
if mime is not None and mime.lower() == "application/zip":
|
307
|
+
objname = obj.name.lower()
|
308
|
+
if objname.endswith(".xlsx"):
|
309
|
+
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
310
|
+
elif objname.endswith(".docx"):
|
311
|
+
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
312
|
+
elif objname.endswith(".pptx"):
|
313
|
+
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
306
314
|
|
307
315
|
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
308
316
|
mime = mime or _DocumentConversionInput._detect_csv(content)
|
docling/document_converter.py
CHANGED
@@ -189,7 +189,9 @@ class DocumentConverter:
|
|
189
189
|
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
|
190
190
|
"""Generate a hash of pipeline options to use as part of the cache key."""
|
191
191
|
options_str = str(pipeline_options.model_dump())
|
192
|
-
return hashlib.md5(
|
192
|
+
return hashlib.md5(
|
193
|
+
options_str.encode("utf-8"), usedforsecurity=False
|
194
|
+
).hexdigest()
|
193
195
|
|
194
196
|
def initialize_pipeline(self, format: InputFormat):
|
195
197
|
"""Initialize the conversion pipeline for the selected format."""
|
@@ -57,7 +57,10 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
|
57
57
|
artifacts_path,
|
58
58
|
torch_dtype=torch.bfloat16,
|
59
59
|
_attn_implementation=(
|
60
|
-
"flash_attention_2"
|
60
|
+
"flash_attention_2"
|
61
|
+
if self.device.startswith("cuda")
|
62
|
+
and accelerator_options.cuda_use_flash_attention2
|
63
|
+
else "eager"
|
61
64
|
),
|
62
65
|
).to(self.device)
|
63
66
|
|
@@ -346,7 +346,7 @@ class ReadingOrderModel:
|
|
346
346
|
new_item.prov.append(prov)
|
347
347
|
|
348
348
|
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
349
|
-
with TimeRecorder(conv_res, "
|
349
|
+
with TimeRecorder(conv_res, "reading_order", scope=ProfilingScope.DOCUMENT):
|
350
350
|
page_elements = self._assembled_to_readingorder_elements(conv_res)
|
351
351
|
|
352
352
|
# Apply reading order
|
@@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
|
|
234
234
|
tcells = table_cluster.cells
|
235
235
|
tokens = []
|
236
236
|
for c in tcells:
|
237
|
-
# Only allow non empty
|
237
|
+
# Only allow non empty strings (spaces) into the cells of a table
|
238
238
|
if len(c.text.strip()) > 0:
|
239
239
|
new_cell = copy.deepcopy(c)
|
240
240
|
new_cell.rect = BoundingRectangle.from_bounding_box(
|
@@ -267,7 +267,7 @@ class TableStructureModel(BasePageModel):
|
|
267
267
|
element["bbox"]["token"] = text_piece
|
268
268
|
|
269
269
|
tc = TableCell.model_validate(element)
|
270
|
-
if
|
270
|
+
if tc.bbox is not None:
|
271
271
|
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
272
272
|
table_cells.append(tc)
|
273
273
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import logging
|
2
4
|
from collections.abc import Iterable
|
3
5
|
from pathlib import Path
|
@@ -38,6 +40,8 @@ class TesseractOcrModel(BaseOcrModel):
|
|
38
40
|
self.options: TesseractOcrOptions
|
39
41
|
|
40
42
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
43
|
+
self.reader = None
|
44
|
+
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
41
45
|
|
42
46
|
if self.enabled:
|
43
47
|
install_errmsg = (
|
@@ -84,9 +88,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
84
88
|
"oem": tesserocr.OEM.DEFAULT,
|
85
89
|
}
|
86
90
|
|
87
|
-
self.reader = None
|
88
91
|
self.osd_reader = None
|
89
|
-
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
90
92
|
|
91
93
|
if self.options.path is not None:
|
92
94
|
tesserocr_kwargs["path"] = self.options.path
|
@@ -151,7 +153,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
151
153
|
script = map_tesseract_script(script)
|
152
154
|
lang = f"{self.script_prefix}{script}"
|
153
155
|
|
154
|
-
# Check if the detected
|
156
|
+
# Check if the detected language is present in the system
|
155
157
|
if lang not in self._tesserocr_languages:
|
156
158
|
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
|
157
159
|
msg += " However this language is not installed in your system and will be ignored."
|
@@ -4,12 +4,15 @@ from typing import Optional
|
|
4
4
|
|
5
5
|
from docling.datamodel.pipeline_options import (
|
6
6
|
granite_picture_description,
|
7
|
+
smoldocling_vlm_conversion_options,
|
8
|
+
smoldocling_vlm_mlx_conversion_options,
|
7
9
|
smolvlm_picture_description,
|
8
10
|
)
|
9
11
|
from docling.datamodel.settings import settings
|
10
12
|
from docling.models.code_formula_model import CodeFormulaModel
|
11
13
|
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
12
14
|
from docling.models.easyocr_model import EasyOcrModel
|
15
|
+
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
13
16
|
from docling.models.layout_model import LayoutModel
|
14
17
|
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
15
18
|
from docling.models.table_structure_model import TableStructureModel
|
@@ -27,6 +30,8 @@ def download_models(
|
|
27
30
|
with_code_formula: bool = True,
|
28
31
|
with_picture_classifier: bool = True,
|
29
32
|
with_smolvlm: bool = False,
|
33
|
+
with_smoldocling: bool = False,
|
34
|
+
with_smoldocling_mlx: bool = False,
|
30
35
|
with_granite_vision: bool = False,
|
31
36
|
with_easyocr: bool = True,
|
32
37
|
):
|
@@ -77,6 +82,25 @@ def download_models(
|
|
77
82
|
progress=progress,
|
78
83
|
)
|
79
84
|
|
85
|
+
if with_smoldocling:
|
86
|
+
_log.info("Downloading SmolDocling model...")
|
87
|
+
HuggingFaceVlmModel.download_models(
|
88
|
+
repo_id=smoldocling_vlm_conversion_options.repo_id,
|
89
|
+
local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
|
90
|
+
force=force,
|
91
|
+
progress=progress,
|
92
|
+
)
|
93
|
+
|
94
|
+
if with_smoldocling_mlx:
|
95
|
+
_log.info("Downloading SmolDocling MLX model...")
|
96
|
+
HuggingFaceVlmModel.download_models(
|
97
|
+
repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
|
98
|
+
local_dir=output_dir
|
99
|
+
/ smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
|
100
|
+
force=force,
|
101
|
+
progress=progress,
|
102
|
+
)
|
103
|
+
|
80
104
|
if with_granite_vision:
|
81
105
|
_log.info("Downloading Granite Vision model...")
|
82
106
|
PictureDescriptionVlmModel.download_models(
|
docling/utils/utils.py
CHANGED
@@ -20,7 +20,7 @@ def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
|
|
20
20
|
"""Create a stable page_hash of the path_or_stream of a file"""
|
21
21
|
|
22
22
|
block_size = 65536
|
23
|
-
hasher = hashlib.sha256()
|
23
|
+
hasher = hashlib.sha256(usedforsecurity=False)
|
24
24
|
|
25
25
|
def _hash_buf(binary_stream):
|
26
26
|
buf = binary_stream.read(block_size) # read and page_hash in chunks
|
@@ -38,7 +38,7 @@ def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
|
|
38
38
|
|
39
39
|
|
40
40
|
def create_hash(string: str):
|
41
|
-
hasher = hashlib.sha256()
|
41
|
+
hasher = hashlib.sha256(usedforsecurity=False)
|
42
42
|
hasher.update(string.encode("utf-8"))
|
43
43
|
|
44
44
|
return hasher.hexdigest()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.31.
|
3
|
+
Version: 2.31.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -10,29 +10,29 @@ docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
|
|
10
10
|
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
11
|
docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
|
12
12
|
docling/backend/docx/latex/omml.py,sha256=nEpcfyyrOucJyj6cD7wfThrIa-q0CQCoqMb3dkrhCRg,12094
|
13
|
-
docling/backend/html_backend.py,sha256=
|
13
|
+
docling/backend/html_backend.py,sha256=3K-l5SUAAyqISNEb7nPst_I51xzYOVOkgmwXh3lv9sw,21063
|
14
14
|
docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
|
16
|
-
docling/backend/md_backend.py,sha256=
|
16
|
+
docling/backend/md_backend.py,sha256=JkY1qTvQFXjKSZGfD-83d-fZelorUG_l6mpJdYGqvX8,17210
|
17
17
|
docling/backend/msexcel_backend.py,sha256=3j0WQfqDpgPXdPMCguefdv7arcNVDedPD6gl54cmLn8,18110
|
18
18
|
docling/backend/mspowerpoint_backend.py,sha256=RwqfvvzrtM56L9uf7PR9lvlHJ-LyYGpkS1iVxkTl72Q,17203
|
19
|
-
docling/backend/msword_backend.py,sha256=
|
19
|
+
docling/backend/msword_backend.py,sha256=lVVMNwt0WIl4RD5wAf8pc8bJsb60x1BA8hTTkVmEVa8,32477
|
20
20
|
docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
|
21
21
|
docling/backend/pypdfium2_backend.py,sha256=pX8f0WbUb0KTDTKyQuLzP_lgHHubyGXWD33vmpefPy8,10805
|
22
22
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
-
docling/backend/xml/jats_backend.py,sha256=
|
24
|
-
docling/backend/xml/uspto_backend.py,sha256=
|
23
|
+
docling/backend/xml/jats_backend.py,sha256=ghGi9bHjx3BvaOtmzLw86-wZy4UxpQPOPQL4e73-BI8,24927
|
24
|
+
docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
|
25
25
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
26
26
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
-
docling/cli/main.py,sha256=
|
28
|
-
docling/cli/models.py,sha256=
|
27
|
+
docling/cli/main.py,sha256=D7WEY4x6pQCVFRy3peK9KUDOb0Y5IVc-vTDqPnHPK00,26138
|
28
|
+
docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
|
29
29
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
30
30
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
31
|
docling/datamodel/base_models.py,sha256=DRE_XoldtCreWF4ucO0iK0l8uOnfvnhQaYjV0z1Qe0M,7921
|
32
|
-
docling/datamodel/document.py,sha256=
|
32
|
+
docling/datamodel/document.py,sha256=_0Z4zUgCB5677ZW8Y7C1fv75enLZJOJUjcUkGTSiTBA,15553
|
33
33
|
docling/datamodel/pipeline_options.py,sha256=-1QG8dY0RZkTJb66lXErEAnPq4F_1vgnk_5AcIr3cgU,13350
|
34
34
|
docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
|
35
|
-
docling/document_converter.py,sha256=
|
35
|
+
docling/document_converter.py,sha256=PRRr65nigQ3LZDl4G2fBMkOtJyswT7xyGt7fpUeDO3w,13849
|
36
36
|
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
37
37
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
38
|
docling/models/api_vlm_model.py,sha256=w1SzdG3Ypz_0iZGiX-skMwV1E1JnOHH2BJiNkcEEIAA,2478
|
@@ -53,14 +53,14 @@ docling/models/page_assemble_model.py,sha256=GO7JI1D6T6EkSW94cLQobPGNQUahkxQqTPR
|
|
53
53
|
docling/models/page_preprocessing_model.py,sha256=6pOGXiFQ-oz06UmJdcaYMdVyfZ0YVLWS6efGcx7Mxws,3105
|
54
54
|
docling/models/picture_description_api_model.py,sha256=qs3n0smC9DXhzwJeK_iQG08Y6ZFHInKtdGPVhzgvxgU,2091
|
55
55
|
docling/models/picture_description_base_model.py,sha256=FbBVXzAOB87xpJN28tuGCxoAdcf6mZNUOqJR7ljUg5g,2946
|
56
|
-
docling/models/picture_description_vlm_model.py,sha256=
|
56
|
+
docling/models/picture_description_vlm_model.py,sha256=DiTjnehVy1n0N04xPUvZl8rx4TiNHzHn9Cnzy_ePGts,4177
|
57
57
|
docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
58
58
|
docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurHCZjp4,858
|
59
59
|
docling/models/rapid_ocr_model.py,sha256=Tq_1Egu5Hjx7Y69Vox17QTtRXztSyflB1fhN08CWQwY,5894
|
60
|
-
docling/models/readingorder_model.py,sha256=
|
61
|
-
docling/models/table_structure_model.py,sha256=
|
60
|
+
docling/models/readingorder_model.py,sha256=S9ru2ApY9sE-Uue3hptWHmbmElwo36bUbAikxCFpHYs,14574
|
61
|
+
docling/models/table_structure_model.py,sha256=1gxLaooK0IKMrnmS8nT1BItKqt1GAKghfpmLKb3i53g,12566
|
62
62
|
docling/models/tesseract_ocr_cli_model.py,sha256=iFdOud5ymoW9WV8bWLCDpd3LJBo9M5bTT5vc635zEDY,10229
|
63
|
-
docling/models/tesseract_ocr_model.py,sha256=
|
63
|
+
docling/models/tesseract_ocr_model.py,sha256=72009TJL_7tXTEnhlsGRiw_KibrQ0LjZlCBtW8NtwUc,9339
|
64
64
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
65
65
|
docling/pipeline/base_pipeline.py,sha256=DnuxAf7EQusdSRae0QUVth-0f2mSff8JZjX-2vazk00,8751
|
66
66
|
docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
|
@@ -74,13 +74,13 @@ docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
|
|
74
74
|
docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
|
75
75
|
docling/utils/layout_postprocessor.py,sha256=x7exVG3HYzV9M_O78FfyoG43Y2L7PPMMydvSNwjqh8s,24528
|
76
76
|
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
77
|
-
docling/utils/model_downloader.py,sha256=
|
77
|
+
docling/utils/model_downloader.py,sha256=ocvud3G3qlBQhzMo69Q3RJMnvq5HPZ2DwNbMuEp8RCs,4142
|
78
78
|
docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
|
79
79
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
80
|
-
docling/utils/utils.py,sha256=
|
80
|
+
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
81
81
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
82
|
-
docling-2.31.
|
83
|
-
docling-2.31.
|
84
|
-
docling-2.31.
|
85
|
-
docling-2.31.
|
86
|
-
docling-2.31.
|
82
|
+
docling-2.31.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
83
|
+
docling-2.31.1.dist-info/METADATA,sha256=31fTxA8TvMdw_KdThEyn3Z5GAHAhNEtvFYlrPdzqV4w,10108
|
84
|
+
docling-2.31.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
85
|
+
docling-2.31.1.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
|
86
|
+
docling-2.31.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|