docling 2.6.0__py3-none-any.whl → 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/msword_backend.py +22 -9
- docling/cli/main.py +20 -18
- docling/datamodel/pipeline_options.py +14 -3
- docling/document_converter.py +4 -4
- docling/models/ocr_mac_model.py +118 -0
- docling/pipeline/standard_pdf_pipeline.py +12 -0
- {docling-2.6.0.dist-info → docling-2.7.1.dist-info}/METADATA +13 -8
- {docling-2.6.0.dist-info → docling-2.7.1.dist-info}/RECORD +11 -10
- {docling-2.6.0.dist-info → docling-2.7.1.dist-info}/LICENSE +0 -0
- {docling-2.6.0.dist-info → docling-2.7.1.dist-info}/WHEEL +0 -0
- {docling-2.6.0.dist-info → docling-2.7.1.dist-info}/entry_points.txt +0 -0
docling/backend/msword_backend.py
CHANGED
@@ -14,7 +14,8 @@ from docling_core.types.doc import (
|
|
14
14
|
TableData,
|
15
15
|
)
|
16
16
|
from lxml import etree
|
17
|
-
from
|
17
|
+
from lxml.etree import XPath
|
18
|
+
from PIL import Image, UnidentifiedImageError
|
18
19
|
|
19
20
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
20
21
|
from docling.datamodel.base_models import InputFormat
|
@@ -132,8 +133,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
132
133
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
133
134
|
for element in body:
|
134
135
|
tag_name = etree.QName(element).localname
|
136
|
+
|
135
137
|
# Check for Inline Images (blip elements)
|
136
|
-
|
138
|
+
namespaces = {
|
139
|
+
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
140
|
+
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
141
|
+
}
|
142
|
+
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
|
143
|
+
drawing_blip = xpath_expr(element)
|
137
144
|
|
138
145
|
# Check for Tables
|
139
146
|
if element.tag.endswith("tbl"):
|
@@ -210,7 +217,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
210
217
|
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
211
218
|
|
212
219
|
if paragraph.text is None:
|
213
|
-
# _log.warn(f"paragraph has text==None")
|
214
220
|
return
|
215
221
|
text = paragraph.text.strip()
|
216
222
|
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
@@ -502,10 +508,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
502
508
|
image_data = get_docx_image(element, drawing_blip)
|
503
509
|
image_bytes = BytesIO(image_data)
|
504
510
|
# Open the BytesIO object with PIL to create an Image
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
+
try:
|
512
|
+
pil_image = Image.open(image_bytes)
|
513
|
+
doc.add_picture(
|
514
|
+
parent=self.parents[self.level],
|
515
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
516
|
+
caption=None,
|
517
|
+
)
|
518
|
+
except (UnidentifiedImageError, OSError) as e:
|
519
|
+
_log.warning("Warning: image cannot be loaded by Pillow")
|
520
|
+
doc.add_picture(
|
521
|
+
parent=self.parents[self.level],
|
522
|
+
caption=None,
|
523
|
+
)
|
511
524
|
return
|
docling/cli/main.py
CHANGED
@@ -24,6 +24,7 @@ from docling.datamodel.base_models import (
|
|
24
24
|
from docling.datamodel.document import ConversionResult
|
25
25
|
from docling.datamodel.pipeline_options import (
|
26
26
|
EasyOcrOptions,
|
27
|
+
OcrMacOptions,
|
27
28
|
OcrOptions,
|
28
29
|
PdfPipelineOptions,
|
29
30
|
TableFormerMode,
|
@@ -74,6 +75,7 @@ class OcrEngine(str, Enum):
|
|
74
75
|
EASYOCR = "easyocr"
|
75
76
|
TESSERACT_CLI = "tesseract_cli"
|
76
77
|
TESSERACT = "tesseract"
|
78
|
+
OCRMAC = "ocrmac"
|
77
79
|
|
78
80
|
|
79
81
|
def export_documents(
|
@@ -252,15 +254,16 @@ def convert(
|
|
252
254
|
export_txt = OutputFormat.TEXT in to_formats
|
253
255
|
export_doctags = OutputFormat.DOCTAGS in to_formats
|
254
256
|
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
257
|
+
if ocr_engine == OcrEngine.EASYOCR:
|
258
|
+
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
259
|
+
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
260
|
+
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
261
|
+
elif ocr_engine == OcrEngine.TESSERACT:
|
262
|
+
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
263
|
+
elif ocr_engine == OcrEngine.OCRMAC:
|
264
|
+
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
265
|
+
else:
|
266
|
+
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
264
267
|
|
265
268
|
ocr_lang_list = _split_list(ocr_lang)
|
266
269
|
if ocr_lang_list is not None:
|
@@ -277,15 +280,14 @@ def convert(
|
|
277
280
|
if artifacts_path is not None:
|
278
281
|
pipeline_options.artifacts_path = artifacts_path
|
279
282
|
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
283
|
+
if pdf_backend == PdfBackend.DLPARSE_V1:
|
284
|
+
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
285
|
+
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
286
|
+
backend = DoclingParseV2DocumentBackend
|
287
|
+
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
288
|
+
backend = PyPdfiumDocumentBackend
|
289
|
+
else:
|
290
|
+
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
289
291
|
|
290
292
|
format_options: Dict[InputFormat, FormatOption] = {
|
291
293
|
InputFormat.PDF: PdfFormatOption(
|
docling/datamodel/pipeline_options.py
CHANGED
@@ -63,6 +63,17 @@ class TesseractOcrOptions(OcrOptions):
|
|
63
63
|
)
|
64
64
|
|
65
65
|
|
66
|
+
class OcrMacOptions(OcrOptions):
|
67
|
+
kind: Literal["ocrmac"] = "ocrmac"
|
68
|
+
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
69
|
+
recognition: str = "accurate"
|
70
|
+
framework: str = "vision"
|
71
|
+
|
72
|
+
model_config = ConfigDict(
|
73
|
+
extra="forbid",
|
74
|
+
)
|
75
|
+
|
76
|
+
|
66
77
|
class PipelineOptions(BaseModel):
|
67
78
|
create_legacy_output: bool = (
|
68
79
|
True # This defautl will be set to False on a future version of docling
|
@@ -75,9 +86,9 @@ class PdfPipelineOptions(PipelineOptions):
|
|
75
86
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
76
87
|
|
77
88
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
78
|
-
ocr_options: Union[
|
79
|
-
|
80
|
-
)
|
89
|
+
ocr_options: Union[
|
90
|
+
EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
|
91
|
+
] = Field(EasyOcrOptions(), discriminator="kind")
|
81
92
|
|
82
93
|
images_scale: float = 1.0
|
83
94
|
generate_page_images: bool = False
|
docling/document_converter.py
CHANGED
@@ -3,7 +3,7 @@ import sys
|
|
3
3
|
import time
|
4
4
|
from functools import partial
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import Dict, Iterable, Iterator, List, Optional, Type
|
6
|
+
from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
|
7
7
|
|
8
8
|
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
9
9
|
|
@@ -155,7 +155,7 @@ class DocumentConverter:
|
|
155
155
|
@validate_call(config=ConfigDict(strict=True))
|
156
156
|
def convert(
|
157
157
|
self,
|
158
|
-
source: Path
|
158
|
+
source: Union[Path, str, DocumentStream], # TODO review naming
|
159
159
|
raises_on_error: bool = True,
|
160
160
|
max_num_pages: int = sys.maxsize,
|
161
161
|
max_file_size: int = sys.maxsize,
|
@@ -172,7 +172,7 @@ class DocumentConverter:
|
|
172
172
|
@validate_call(config=ConfigDict(strict=True))
|
173
173
|
def convert_all(
|
174
174
|
self,
|
175
|
-
source: Iterable[Path
|
175
|
+
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
176
176
|
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
177
177
|
max_num_pages: int = sys.maxsize,
|
178
178
|
max_file_size: int = sys.maxsize,
|
@@ -183,7 +183,7 @@ class DocumentConverter:
|
|
183
183
|
)
|
184
184
|
conv_input = _DocumentConversionInput(
|
185
185
|
path_or_stream_iterator=source,
|
186
|
-
|
186
|
+
limits=limits,
|
187
187
|
)
|
188
188
|
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
189
189
|
for conv_res in conv_res_iter:
|
docling/models/ocr_mac_model.py
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
import logging
|
2
|
+
import tempfile
|
3
|
+
from typing import Iterable, Optional, Tuple
|
4
|
+
|
5
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
6
|
+
|
7
|
+
from docling.datamodel.base_models import OcrCell, Page
|
8
|
+
from docling.datamodel.document import ConversionResult
|
9
|
+
from docling.datamodel.pipeline_options import OcrMacOptions
|
10
|
+
from docling.datamodel.settings import settings
|
11
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
12
|
+
from docling.utils.profiling import TimeRecorder
|
13
|
+
|
14
|
+
_log = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
class OcrMacModel(BaseOcrModel):
|
18
|
+
def __init__(self, enabled: bool, options: OcrMacOptions):
|
19
|
+
super().__init__(enabled=enabled, options=options)
|
20
|
+
self.options: OcrMacOptions
|
21
|
+
|
22
|
+
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
23
|
+
|
24
|
+
if self.enabled:
|
25
|
+
install_errmsg = (
|
26
|
+
"ocrmac is not correctly installed. "
|
27
|
+
"Please install it via `pip install ocrmac` to use this OCR engine. "
|
28
|
+
"Alternatively, Docling has support for other OCR engines. See the documentation: "
|
29
|
+
"https://ds4sd.github.io/docling/installation/"
|
30
|
+
)
|
31
|
+
try:
|
32
|
+
from ocrmac import ocrmac
|
33
|
+
except ImportError:
|
34
|
+
raise ImportError(install_errmsg)
|
35
|
+
|
36
|
+
self.reader_RIL = ocrmac.OCR
|
37
|
+
|
38
|
+
def __call__(
|
39
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
40
|
+
) -> Iterable[Page]:
|
41
|
+
|
42
|
+
if not self.enabled:
|
43
|
+
yield from page_batch
|
44
|
+
return
|
45
|
+
|
46
|
+
for page in page_batch:
|
47
|
+
assert page._backend is not None
|
48
|
+
if not page._backend.is_valid():
|
49
|
+
yield page
|
50
|
+
else:
|
51
|
+
with TimeRecorder(conv_res, "ocr"):
|
52
|
+
|
53
|
+
ocr_rects = self.get_ocr_rects(page)
|
54
|
+
|
55
|
+
all_ocr_cells = []
|
56
|
+
for ocr_rect in ocr_rects:
|
57
|
+
# Skip zero area boxes
|
58
|
+
if ocr_rect.area() == 0:
|
59
|
+
continue
|
60
|
+
high_res_image = page._backend.get_page_image(
|
61
|
+
scale=self.scale, cropbox=ocr_rect
|
62
|
+
)
|
63
|
+
|
64
|
+
with tempfile.NamedTemporaryFile(
|
65
|
+
suffix=".png", mode="w"
|
66
|
+
) as image_file:
|
67
|
+
fname = image_file.name
|
68
|
+
high_res_image.save(fname)
|
69
|
+
|
70
|
+
boxes = self.reader_RIL(
|
71
|
+
fname,
|
72
|
+
recognition_level=self.options.recognition,
|
73
|
+
framework=self.options.framework,
|
74
|
+
language_preference=self.options.lang,
|
75
|
+
).recognize()
|
76
|
+
|
77
|
+
im_width, im_height = high_res_image.size
|
78
|
+
cells = []
|
79
|
+
for ix, (text, confidence, box) in enumerate(boxes):
|
80
|
+
x = float(box[0])
|
81
|
+
y = float(box[1])
|
82
|
+
w = float(box[2])
|
83
|
+
h = float(box[3])
|
84
|
+
|
85
|
+
x1 = x * im_width
|
86
|
+
y2 = (1 - y) * im_height
|
87
|
+
|
88
|
+
x2 = x1 + w * im_width
|
89
|
+
y1 = y2 - h * im_height
|
90
|
+
|
91
|
+
left = x1 / self.scale
|
92
|
+
top = y1 / self.scale
|
93
|
+
right = x2 / self.scale
|
94
|
+
bottom = y2 / self.scale
|
95
|
+
|
96
|
+
cells.append(
|
97
|
+
OcrCell(
|
98
|
+
id=ix,
|
99
|
+
text=text,
|
100
|
+
confidence=confidence,
|
101
|
+
bbox=BoundingBox.from_tuple(
|
102
|
+
coord=(left, top, right, bottom),
|
103
|
+
origin=CoordOrigin.TOPLEFT,
|
104
|
+
),
|
105
|
+
)
|
106
|
+
)
|
107
|
+
|
108
|
+
# del high_res_image
|
109
|
+
all_ocr_cells.extend(cells)
|
110
|
+
|
111
|
+
# Post-process the cells
|
112
|
+
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
113
|
+
|
114
|
+
# DEBUG code:
|
115
|
+
if settings.debug.visualize_ocr:
|
116
|
+
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
117
|
+
|
118
|
+
yield page
|
docling/pipeline/standard_pdf_pipeline.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
import sys
|
2
3
|
from pathlib import Path
|
3
4
|
from typing import Optional
|
4
5
|
|
@@ -10,6 +11,7 @@ from docling.datamodel.base_models import AssembledUnit, Page
|
|
10
11
|
from docling.datamodel.document import ConversionResult
|
11
12
|
from docling.datamodel.pipeline_options import (
|
12
13
|
EasyOcrOptions,
|
14
|
+
OcrMacOptions,
|
13
15
|
PdfPipelineOptions,
|
14
16
|
TesseractCliOcrOptions,
|
15
17
|
TesseractOcrOptions,
|
@@ -18,6 +20,7 @@ from docling.models.base_ocr_model import BaseOcrModel
|
|
18
20
|
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
19
21
|
from docling.models.easyocr_model import EasyOcrModel
|
20
22
|
from docling.models.layout_model import LayoutModel
|
23
|
+
from docling.models.ocr_mac_model import OcrMacModel
|
21
24
|
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
22
25
|
from docling.models.page_preprocessing_model import (
|
23
26
|
PagePreprocessingModel,
|
@@ -118,6 +121,15 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
118
121
|
enabled=self.pipeline_options.do_ocr,
|
119
122
|
options=self.pipeline_options.ocr_options,
|
120
123
|
)
|
124
|
+
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
125
|
+
if "darwin" != sys.platform:
|
126
|
+
raise RuntimeError(
|
127
|
+
f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
|
128
|
+
)
|
129
|
+
return OcrMacModel(
|
130
|
+
enabled=self.pipeline_options.do_ocr,
|
131
|
+
options=self.pipeline_options.ocr_options,
|
132
|
+
)
|
121
133
|
return None
|
122
134
|
|
123
135
|
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
{docling-2.6.0.dist-info → docling-2.7.1.dist-info}/METADATA
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.6.0
|
3
|
+
Version: 2.7.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
7
7
|
Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
|
8
8
|
Author: Christoph Auer
|
9
9
|
Author-email: cau@zurich.ibm.com
|
10
|
-
Requires-Python: >=3.
|
10
|
+
Requires-Python: >=3.9,<4.0
|
11
11
|
Classifier: Development Status :: 5 - Production/Stable
|
12
12
|
Classifier: Intended Audience :: Developers
|
13
13
|
Classifier: Intended Audience :: Science/Research
|
@@ -15,32 +15,36 @@ Classifier: License :: OSI Approved :: MIT License
|
|
15
15
|
Classifier: Operating System :: MacOS :: MacOS X
|
16
16
|
Classifier: Operating System :: POSIX :: Linux
|
17
17
|
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
18
19
|
Classifier: Programming Language :: Python :: 3.10
|
19
20
|
Classifier: Programming Language :: Python :: 3.11
|
20
21
|
Classifier: Programming Language :: Python :: 3.12
|
21
22
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
23
|
+
Provides-Extra: ocrmac
|
22
24
|
Provides-Extra: tesserocr
|
23
25
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
24
26
|
Requires-Dist: certifi (>=2024.7.4)
|
25
27
|
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
26
28
|
Requires-Dist: docling-core (>=2.4.0,<3.0.0)
|
27
|
-
Requires-Dist: docling-ibm-models (>=2.0.
|
28
|
-
Requires-Dist: docling-parse (>=2.0.
|
29
|
+
Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
|
30
|
+
Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
|
29
31
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
30
32
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
33
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
34
|
+
Requires-Dist: lxml (>=4.0.0,<6.0.0)
|
32
35
|
Requires-Dist: marko (>=2.1.2,<3.0.0)
|
36
|
+
Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
|
33
37
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
34
38
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
35
39
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
36
|
-
Requires-Dist: pydantic (>=2.0.0,<
|
40
|
+
Requires-Dist: pydantic (>=2.0.0,<2.10)
|
37
41
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
38
42
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
39
43
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
40
44
|
Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
41
45
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
42
46
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
43
|
-
Requires-Dist: scipy (>=1.
|
47
|
+
Requires-Dist: scipy (>=1.6.0,<2.0.0)
|
44
48
|
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
45
49
|
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
46
50
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
@@ -61,19 +65,20 @@ Description-Content-Type: text/markdown
|
|
61
65
|
[](https://arxiv.org/abs/2408.09869)
|
62
66
|
[](https://ds4sd.github.io/docling/)
|
63
67
|
[](https://pypi.org/project/docling/)
|
64
|
-
](https://pypi.org/project/docling/)
|
65
69
|
[](https://python-poetry.org/)
|
66
70
|
[](https://github.com/psf/black)
|
67
71
|
[](https://pycqa.github.io/isort/)
|
68
72
|
[](https://pydantic.dev)
|
69
73
|
[](https://github.com/pre-commit/pre-commit)
|
70
74
|
[](https://opensource.org/licenses/MIT)
|
75
|
+
[](https://pepy.tech/projects/docling)
|
71
76
|
|
72
77
|
Docling parses documents and exports them to the desired format with ease and speed.
|
73
78
|
|
74
79
|
## Features
|
75
80
|
|
76
|
-
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc
|
81
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
77
82
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
78
83
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
79
84
|
* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
|
{docling-2.6.0.dist-info → docling-2.7.1.dist-info}/RECORD
CHANGED
@@ -8,23 +8,24 @@ docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaod
|
|
8
8
|
docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
|
9
9
|
docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
|
10
10
|
docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
|
11
|
-
docling/backend/msword_backend.py,sha256
|
11
|
+
docling/backend/msword_backend.py,sha256=sMumfB9Xa2Md1a8WO-fGPPAKf1s3mCvErMyZ-xnBC2E,18495
|
12
12
|
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
13
13
|
docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
|
14
14
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
docling/cli/main.py,sha256=
|
15
|
+
docling/cli/main.py,sha256=MpjbAXhOlbGnAnl5_OaKCdub61YPQBy1NOqroXQtNYE,10722
|
16
16
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
docling/datamodel/base_models.py,sha256=6qlwPamDZ3XUsE2kTAyGKG6O2IJClVjCqaE7DZ74KHU,5533
|
18
18
|
docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
|
19
|
-
docling/datamodel/pipeline_options.py,sha256=
|
19
|
+
docling/datamodel/pipeline_options.py,sha256=aC_CmtEhNLIbn9n3JuYhL_aA8UA0vFgw7HcGMUuOI4o,3117
|
20
20
|
docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
|
21
|
-
docling/document_converter.py,sha256=
|
21
|
+
docling/document_converter.py,sha256=L0A3g7IQBaKIK7dWpUFC72ZqKywIPYkyh71Qd6DiNPE,10940
|
22
22
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
23
|
docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
|
24
24
|
docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
|
25
25
|
docling/models/ds_glm_model.py,sha256=hBRCx6oFGhxBbKEJlRSWVndDwFtB5IpeLOowFAVqFM0,12033
|
26
26
|
docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
|
27
27
|
docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
|
28
|
+
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
28
29
|
docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
|
29
30
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
30
31
|
docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
|
@@ -33,14 +34,14 @@ docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUs
|
|
33
34
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
35
|
docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
|
35
36
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
36
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
37
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=btm_y1ZsjUrtWvMbF6RA8BVM0ENrK4z_rqF0jjdeZmU,8473
|
37
38
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
39
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
39
40
|
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
40
41
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
41
42
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
42
|
-
docling-2.
|
43
|
-
docling-2.
|
44
|
-
docling-2.
|
45
|
-
docling-2.
|
46
|
-
docling-2.
|
43
|
+
docling-2.7.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
44
|
+
docling-2.7.1.dist-info/METADATA,sha256=TvD3BGlbO1ci54NzwmLxqSITXIdMefyj71YjdZkD7Vs,6906
|
45
|
+
docling-2.7.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
46
|
+
docling-2.7.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
47
|
+
docling-2.7.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|