docling 2.4.2__py3-none-any.whl → 2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +12 -0
- docling/backend/mspowerpoint_backend.py +17 -22
- docling/backend/msword_backend.py +8 -8
- docling/cli/main.py +10 -3
- docling/datamodel/pipeline_options.py +1 -0
- docling/datamodel/settings.py +3 -1
- docling/models/base_ocr_model.py +22 -3
- docling/models/easyocr_model.py +4 -11
- docling/models/tesseract_ocr_cli_model.py +3 -7
- docling/models/tesseract_ocr_model.py +3 -7
- {docling-2.4.2.dist-info → docling-2.5.1.dist-info}/METADATA +1 -1
- {docling-2.4.2.dist-info → docling-2.5.1.dist-info}/RECORD +15 -15
- {docling-2.4.2.dist-info → docling-2.5.1.dist-info}/LICENSE +0 -0
- {docling-2.4.2.dist-info → docling-2.5.1.dist-info}/WHEEL +0 -0
- {docling-2.4.2.dist-info → docling-2.5.1.dist-info}/entry_points.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -120,6 +120,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
120
120
|
self.handle_header(element, idx, doc)
|
121
121
|
elif element.name in ["p"]:
|
122
122
|
self.handle_paragraph(element, idx, doc)
|
123
|
+
elif element.name in ["pre"]:
|
124
|
+
self.handle_code(element, idx, doc)
|
123
125
|
elif element.name in ["ul", "ol"]:
|
124
126
|
self.handle_list(element, idx, doc)
|
125
127
|
elif element.name in ["li"]:
|
@@ -205,6 +207,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
205
207
|
level=hlevel,
|
206
208
|
)
|
207
209
|
|
210
|
+
def handle_code(self, element, idx, doc):
|
211
|
+
"""Handles monospace code snippets (pre)."""
|
212
|
+
if element.text is None:
|
213
|
+
return
|
214
|
+
text = element.text.strip()
|
215
|
+
label = DocItemLabel.CODE
|
216
|
+
if len(text) == 0:
|
217
|
+
return
|
218
|
+
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
219
|
+
|
208
220
|
def handle_paragraph(self, element, idx, doc):
|
209
221
|
"""Handles paragraph tags (p)."""
|
210
222
|
if element.text is None:
|
@@ -358,41 +358,36 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
358
358
|
|
359
359
|
size = Size(width=slide_width, height=slide_height)
|
360
360
|
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
|
361
|
-
# parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
|
362
|
-
|
363
|
-
# Loop through each shape in the slide
|
364
|
-
for shape in slide.shapes:
|
365
361
|
|
362
|
+
def handle_shapes(shape, parent_slide, slide_ind, doc):
|
363
|
+
handle_groups(shape, parent_slide, slide_ind, doc)
|
366
364
|
if shape.has_table:
|
367
365
|
# Handle Tables
|
368
366
|
self.handle_tables(shape, parent_slide, slide_ind, doc)
|
369
|
-
|
370
367
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
371
|
-
# Handle
|
368
|
+
# Handle Pictures
|
372
369
|
self.handle_pictures(shape, parent_slide, slide_ind, doc)
|
373
|
-
|
374
370
|
# If shape doesn't have any text, move on to the next shape
|
375
371
|
if not hasattr(shape, "text"):
|
376
|
-
|
372
|
+
return
|
377
373
|
if shape.text is None:
|
378
|
-
|
374
|
+
return
|
379
375
|
if len(shape.text.strip()) == 0:
|
380
|
-
|
376
|
+
return
|
381
377
|
if not shape.has_text_frame:
|
382
|
-
_log.
|
383
|
-
|
384
|
-
|
385
|
-
# if shape.is_placeholder:
|
386
|
-
# Handle Titles (Headers) and Subtitles
|
387
|
-
# Check if the shape is a placeholder (titles are placeholders)
|
388
|
-
# self.handle_title(shape, parent_slide, slide_ind, doc)
|
389
|
-
# self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
390
|
-
# else:
|
391
|
-
|
378
|
+
_log.warning("Warning: shape has text but not text_frame")
|
379
|
+
return
|
392
380
|
# Handle other text elements, including lists (bullet lists, numbered lists)
|
393
381
|
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
382
|
+
return
|
383
|
+
|
384
|
+
def handle_groups(shape, parent_slide, slide_ind, doc):
|
385
|
+
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
386
|
+
for groupedshape in shape.shapes:
|
387
|
+
handle_shapes(groupedshape, parent_slide, slide_ind, doc)
|
394
388
|
|
395
|
-
|
396
|
-
|
389
|
+
# Loop through each shape in the slide
|
390
|
+
for shape in slide.shapes:
|
391
|
+
handle_shapes(shape, parent_slide, slide_ind, doc)
|
397
392
|
|
398
393
|
return doc
|
@@ -130,7 +130,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
130
130
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
131
131
|
for element in body:
|
132
132
|
tag_name = etree.QName(element).localname
|
133
|
-
|
134
133
|
# Check for Inline Images (drawings or blip elements)
|
135
134
|
found_drawing = etree.ElementBase.xpath(
|
136
135
|
element, ".//w:drawing", namespaces=self.xml_namespaces
|
@@ -201,7 +200,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
201
200
|
label_str = ""
|
202
201
|
label_level = 0
|
203
202
|
if parts[0] == "Heading":
|
204
|
-
# print("{} - {}".format(parts[0], parts[1]))
|
205
203
|
label_str = parts[0]
|
206
204
|
label_level = self.str_to_int(parts[1], default=None)
|
207
205
|
if parts[1] == "Heading":
|
@@ -217,19 +215,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
217
215
|
if paragraph.text is None:
|
218
216
|
# _log.warn(f"paragraph has text==None")
|
219
217
|
return
|
220
|
-
|
221
218
|
text = paragraph.text.strip()
|
222
219
|
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
|
223
220
|
|
224
221
|
# Common styles for bullet and numbered lists.
|
225
222
|
# "List Bullet", "List Number", "List Paragraph"
|
226
|
-
#
|
223
|
+
# Identify whether list is a numbered list or not
|
227
224
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
228
225
|
is_numbered = False
|
229
|
-
|
230
226
|
p_style_name, p_level = self.get_label_and_level(paragraph)
|
231
227
|
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
232
|
-
# print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))
|
233
228
|
|
234
229
|
if numid == 0:
|
235
230
|
numid = None
|
@@ -450,8 +445,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
450
445
|
for row in table.rows:
|
451
446
|
# Calculate the max number of columns
|
452
447
|
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
453
|
-
|
454
|
-
|
448
|
+
|
449
|
+
if num_rows == 1 and num_cols == 1:
|
450
|
+
cell_element = table.rows[0].cells[0]
|
451
|
+
# In case we have a table of only 1 cell, we consider it furniture
|
452
|
+
# And proceed processing the content of the cell as though it's in the document body
|
453
|
+
self.walk_linear(cell_element._element, docx_obj, doc)
|
454
|
+
return
|
455
455
|
|
456
456
|
# Initialize the table grid
|
457
457
|
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
docling/cli/main.py
CHANGED
@@ -153,6 +153,13 @@ def convert(
|
|
153
153
|
..., help="If enabled, the bitmap content will be processed using OCR."
|
154
154
|
),
|
155
155
|
] = True,
|
156
|
+
force_ocr: Annotated[
|
157
|
+
bool,
|
158
|
+
typer.Option(
|
159
|
+
...,
|
160
|
+
help="Replace any existing text with OCR generated text over the full content.",
|
161
|
+
),
|
162
|
+
] = False,
|
156
163
|
ocr_engine: Annotated[
|
157
164
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
158
165
|
] = OcrEngine.EASYOCR,
|
@@ -219,11 +226,11 @@ def convert(
|
|
219
226
|
|
220
227
|
match ocr_engine:
|
221
228
|
case OcrEngine.EASYOCR:
|
222
|
-
ocr_options: OcrOptions = EasyOcrOptions()
|
229
|
+
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
223
230
|
case OcrEngine.TESSERACT_CLI:
|
224
|
-
ocr_options = TesseractCliOcrOptions()
|
231
|
+
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
225
232
|
case OcrEngine.TESSERACT:
|
226
|
-
ocr_options = TesseractOcrOptions()
|
233
|
+
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
227
234
|
case _:
|
228
235
|
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
229
236
|
|
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
|
|
22
22
|
|
23
23
|
class OcrOptions(BaseModel):
|
24
24
|
kind: str
|
25
|
+
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
25
26
|
bitmap_area_threshold: float = (
|
26
27
|
0.05 # percentage of the area for a bitmap to be processed with OCR
|
27
28
|
)
|
docling/datamodel/settings.py
CHANGED
@@ -2,7 +2,7 @@ import sys
|
|
2
2
|
from pathlib import Path
|
3
3
|
|
4
4
|
from pydantic import BaseModel
|
5
|
-
from pydantic_settings import BaseSettings
|
5
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
6
6
|
|
7
7
|
|
8
8
|
class DocumentLimits(BaseModel):
|
@@ -40,6 +40,8 @@ class DebugSettings(BaseModel):
|
|
40
40
|
|
41
41
|
|
42
42
|
class AppSettings(BaseSettings):
|
43
|
+
model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
|
44
|
+
|
43
45
|
perf: BatchConcurrencySettings
|
44
46
|
debug: DebugSettings
|
45
47
|
|
docling/models/base_ocr_model.py
CHANGED
@@ -10,7 +10,7 @@ from PIL import Image, ImageDraw
|
|
10
10
|
from rtree import index
|
11
11
|
from scipy.ndimage import find_objects, label
|
12
12
|
|
13
|
-
from docling.datamodel.base_models import OcrCell, Page
|
13
|
+
from docling.datamodel.base_models import Cell, OcrCell, Page
|
14
14
|
from docling.datamodel.document import ConversionResult
|
15
15
|
from docling.datamodel.pipeline_options import OcrOptions
|
16
16
|
from docling.datamodel.settings import settings
|
@@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel):
|
|
73
73
|
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
74
74
|
|
75
75
|
# return full-page rectangle if sufficiently covered with bitmaps
|
76
|
-
if coverage > max(
|
76
|
+
if self.options.force_full_page_ocr or coverage > max(
|
77
|
+
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
|
78
|
+
):
|
77
79
|
return [
|
78
80
|
BoundingBox(
|
79
81
|
l=0,
|
@@ -96,7 +98,7 @@ class BaseOcrModel(BasePageModel):
|
|
96
98
|
return ocr_rects
|
97
99
|
|
98
100
|
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
99
|
-
def
|
101
|
+
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
100
102
|
# Create R-tree index for programmatic cells
|
101
103
|
p = index.Property()
|
102
104
|
p.dimension = 2
|
@@ -117,6 +119,23 @@ class BaseOcrModel(BasePageModel):
|
|
117
119
|
]
|
118
120
|
return filtered_ocr_cells
|
119
121
|
|
122
|
+
def post_process_cells(self, ocr_cells, programmatic_cells):
|
123
|
+
r"""
|
124
|
+
Post-process the ocr and programmatic cells and return the final list of cells
|
125
|
+
"""
|
126
|
+
if self.options.force_full_page_ocr:
|
127
|
+
# If a full page OCR is forced, use only the OCR cells
|
128
|
+
cells = [
|
129
|
+
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
130
|
+
for c_ocr in ocr_cells
|
131
|
+
]
|
132
|
+
return cells
|
133
|
+
|
134
|
+
## Remove OCR cells which overlap with programmatic cells.
|
135
|
+
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
|
136
|
+
programmatic_cells.extend(filtered_ocr_cells)
|
137
|
+
return programmatic_cells
|
138
|
+
|
120
139
|
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
121
140
|
image = copy.deepcopy(page.image)
|
122
141
|
draw = ImageDraw.Draw(image, "RGBA")
|
docling/models/easyocr_model.py
CHANGED
@@ -5,7 +5,7 @@ import numpy
|
|
5
5
|
import torch
|
6
6
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
7
7
|
|
8
|
-
from docling.datamodel.base_models import OcrCell, Page
|
8
|
+
from docling.datamodel.base_models import Cell, OcrCell, Page
|
9
9
|
from docling.datamodel.document import ConversionResult
|
10
10
|
from docling.datamodel.pipeline_options import EasyOcrOptions
|
11
11
|
from docling.datamodel.settings import settings
|
@@ -31,12 +31,9 @@ class EasyOcrModel(BaseOcrModel):
|
|
31
31
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
32
32
|
)
|
33
33
|
|
34
|
-
use_gpu = (
|
35
|
-
False if torch.backends.mps.is_available() else self.options.use_gpu
|
36
|
-
)
|
37
34
|
self.reader = easyocr.Reader(
|
38
35
|
lang_list=self.options.lang,
|
39
|
-
gpu=use_gpu,
|
36
|
+
gpu=self.options.use_gpu,
|
40
37
|
model_storage_directory=self.options.model_storage_directory,
|
41
38
|
download_enabled=self.options.download_enabled,
|
42
39
|
)
|
@@ -91,12 +88,8 @@ class EasyOcrModel(BaseOcrModel):
|
|
91
88
|
]
|
92
89
|
all_ocr_cells.extend(cells)
|
93
90
|
|
94
|
-
|
95
|
-
|
96
|
-
all_ocr_cells, page.cells
|
97
|
-
)
|
98
|
-
|
99
|
-
page.cells.extend(filtered_ocr_cells)
|
91
|
+
# Post-process the cells
|
92
|
+
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
100
93
|
|
101
94
|
# DEBUG code:
|
102
95
|
if settings.debug.visualize_ocr:
|
@@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple
|
|
7
7
|
import pandas as pd
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
9
|
|
10
|
-
from docling.datamodel.base_models import OcrCell, Page
|
10
|
+
from docling.datamodel.base_models import Cell, OcrCell, Page
|
11
11
|
from docling.datamodel.document import ConversionResult
|
12
12
|
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
13
13
|
from docling.datamodel.settings import settings
|
@@ -170,12 +170,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
170
170
|
)
|
171
171
|
all_ocr_cells.append(cell)
|
172
172
|
|
173
|
-
|
174
|
-
|
175
|
-
all_ocr_cells, page.cells
|
176
|
-
)
|
177
|
-
|
178
|
-
page.cells.extend(filtered_ocr_cells)
|
173
|
+
# Post-process the cells
|
174
|
+
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
179
175
|
|
180
176
|
# DEBUG code:
|
181
177
|
if settings.debug.visualize_ocr:
|
@@ -3,7 +3,7 @@ from typing import Iterable
|
|
3
3
|
|
4
4
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
5
5
|
|
6
|
-
from docling.datamodel.base_models import OcrCell, Page
|
6
|
+
from docling.datamodel.base_models import Cell, OcrCell, Page
|
7
7
|
from docling.datamodel.document import ConversionResult
|
8
8
|
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
9
9
|
from docling.datamodel.settings import settings
|
@@ -140,12 +140,8 @@ class TesseractOcrModel(BaseOcrModel):
|
|
140
140
|
# del high_res_image
|
141
141
|
all_ocr_cells.extend(cells)
|
142
142
|
|
143
|
-
|
144
|
-
|
145
|
-
all_ocr_cells, page.cells
|
146
|
-
)
|
147
|
-
|
148
|
-
page.cells.extend(filtered_ocr_cells)
|
143
|
+
# Post-process the cells
|
144
|
+
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
149
145
|
|
150
146
|
# DEBUG code:
|
151
147
|
if settings.debug.visualize_ocr:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.4.2
|
3
|
+
Version: 2.5.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -4,31 +4,31 @@ docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq
|
|
4
4
|
docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
|
5
5
|
docling/backend/docling_parse_backend.py,sha256=csWy6ZGxDuZfNr0YTrUU40DXqelN_TJksWIYoXxZMjU,7633
|
6
6
|
docling/backend/docling_parse_v2_backend.py,sha256=gUr9_fwHbkj238oYQPJ9AxpjFL2jGvhjBlBQPblmSAg,8589
|
7
|
-
docling/backend/html_backend.py,sha256=
|
7
|
+
docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
|
8
8
|
docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
|
9
|
-
docling/backend/mspowerpoint_backend.py,sha256=
|
10
|
-
docling/backend/msword_backend.py,sha256=
|
9
|
+
docling/backend/mspowerpoint_backend.py,sha256=YaVJc6RXWmM1EPTp0TzAiXpGxu6K-MZdPNsmR_64LSg,15358
|
10
|
+
docling/backend/msword_backend.py,sha256=IEqGz-lUrQw0tgBly_gv_mYGC0X0iNnGhkwnDWaDtBY,17341
|
11
11
|
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
12
12
|
docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
|
13
13
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
docling/cli/main.py,sha256=
|
14
|
+
docling/cli/main.py,sha256=7stF4dMjGVp5R0Gvcawm21rff5RbEQnWj8ZzoAHvV9k,9619
|
15
15
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
16
|
docling/datamodel/base_models.py,sha256=fmkS6iTxGZCTtNCo2zsgMmBC11Ogf2Ht-mNIlZ9GP-o,5375
|
17
17
|
docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
|
18
|
-
docling/datamodel/pipeline_options.py,sha256
|
19
|
-
docling/datamodel/settings.py,sha256=
|
18
|
+
docling/datamodel/pipeline_options.py,sha256=-PXwqkdwSpWjIMCxyqwB8Q453szVNR1zVM-7d0PAOWQ,2530
|
19
|
+
docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
|
20
20
|
docling/document_converter.py,sha256=U52_rZQDm2wzrnsuUrvsfX2MnmOWFFhjBzfS8tEvt6Y,10595
|
21
21
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
|
23
|
-
docling/models/base_ocr_model.py,sha256=
|
23
|
+
docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
|
24
24
|
docling/models/ds_glm_model.py,sha256=2OpWW8MMzCIshrtP36gDSRPYOCjv1ex34FqxD2nYjP4,11986
|
25
|
-
docling/models/easyocr_model.py,sha256=
|
25
|
+
docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
|
26
26
|
docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
|
27
27
|
docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
|
28
28
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
29
29
|
docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
|
30
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=
|
31
|
-
docling/models/tesseract_ocr_model.py,sha256=
|
30
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=OfopQnt2FGwtLJTMtW9jbJZ9EN2G2QFkA_aACjuUuDs,6372
|
31
|
+
docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
|
32
32
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
33
33
|
docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
|
34
34
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
@@ -38,8 +38,8 @@ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
|
38
38
|
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
39
39
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
40
40
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
41
|
-
docling-2.
|
42
|
-
docling-2.
|
43
|
-
docling-2.
|
44
|
-
docling-2.
|
45
|
-
docling-2.
|
41
|
+
docling-2.5.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
42
|
+
docling-2.5.1.dist-info/METADATA,sha256=qOFYM-E7GjYUIaHtwPoef22zJEWAhIZW8tlIALD17u0,6530
|
43
|
+
docling-2.5.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
44
|
+
docling-2.5.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
45
|
+
docling-2.5.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|