docling 2.15.1__py3-none-any.whl → 2.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +0 -1
- docling/backend/asciidoc_backend.py +0 -1
- docling/backend/docling_parse_backend.py +1 -1
- docling/backend/docling_parse_v2_backend.py +1 -1
- docling/backend/html_backend.py +1 -1
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +44 -27
- docling/backend/msexcel_backend.py +50 -38
- docling/backend/msword_backend.py +0 -1
- docling/backend/pdf_backend.py +0 -2
- docling/backend/pypdfium2_backend.py +1 -1
- docling/datamodel/base_models.py +30 -3
- docling/datamodel/document.py +2 -0
- docling/datamodel/pipeline_options.py +6 -9
- docling/document_converter.py +4 -0
- docling/models/base_model.py +62 -6
- docling/models/code_formula_model.py +245 -0
- docling/models/document_picture_classifier.py +187 -0
- docling/models/layout_model.py +10 -86
- docling/models/page_assemble_model.py +1 -33
- docling/models/tesseract_ocr_cli_model.py +0 -1
- docling/models/tesseract_ocr_model.py +63 -15
- docling/pipeline/base_pipeline.py +40 -17
- docling/pipeline/standard_pdf_pipeline.py +31 -2
- docling/utils/glm_utils.py +4 -1
- docling/utils/visualization.py +80 -0
- {docling-2.15.1.dist-info → docling-2.16.0.dist-info}/METADATA +5 -4
- docling-2.16.0.dist-info/RECORD +61 -0
- docling-2.15.1.dist-info/RECORD +0 -56
- {docling-2.15.1.dist-info → docling-2.16.0.dist-info}/LICENSE +0 -0
- {docling-2.15.1.dist-info → docling-2.16.0.dist-info}/WHEEL +0 -0
- {docling-2.15.1.dist-info → docling-2.16.0.dist-info}/entry_points.txt +0 -0
@@ -54,43 +54,56 @@ class TesseractOcrModel(BaseOcrModel):
|
|
54
54
|
# Initialize the tesseractAPI
|
55
55
|
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
56
56
|
lang = "+".join(self.options.lang)
|
57
|
+
|
58
|
+
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
59
|
+
|
60
|
+
if any([l.startswith("script/") for l in tesserocr_languages]):
|
61
|
+
self.script_prefix = "script/"
|
62
|
+
else:
|
63
|
+
self.script_prefix = ""
|
64
|
+
|
65
|
+
tesserocr_kwargs = {
|
66
|
+
"psm": tesserocr.PSM.AUTO,
|
67
|
+
"init": True,
|
68
|
+
"oem": tesserocr.OEM.DEFAULT,
|
69
|
+
}
|
70
|
+
|
57
71
|
if self.options.path is not None:
|
72
|
+
tesserocr_kwargs["path"] = self.options.path
|
73
|
+
|
74
|
+
if lang == "auto":
|
58
75
|
self.reader = tesserocr.PyTessBaseAPI(
|
59
|
-
|
60
|
-
lang=lang,
|
61
|
-
psm=tesserocr.PSM.AUTO,
|
62
|
-
init=True,
|
63
|
-
oem=tesserocr.OEM.DEFAULT,
|
76
|
+
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
64
77
|
)
|
65
78
|
else:
|
66
79
|
self.reader = tesserocr.PyTessBaseAPI(
|
67
|
-
lang
|
68
|
-
psm=tesserocr.PSM.AUTO,
|
69
|
-
init=True,
|
70
|
-
oem=tesserocr.OEM.DEFAULT,
|
80
|
+
**{"lang": lang} | tesserocr_kwargs,
|
71
81
|
)
|
82
|
+
|
72
83
|
self.reader_RIL = tesserocr.RIL
|
73
84
|
|
74
85
|
def __del__(self):
|
75
86
|
if self.reader is not None:
|
76
87
|
# Finalize the tesseractAPI
|
77
88
|
self.reader.End()
|
89
|
+
for script in self.script_readers:
|
90
|
+
self.script_readers[script].End()
|
78
91
|
|
79
92
|
def __call__(
|
80
93
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
81
94
|
) -> Iterable[Page]:
|
82
|
-
|
83
95
|
if not self.enabled:
|
84
96
|
yield from page_batch
|
85
97
|
return
|
86
98
|
|
99
|
+
import tesserocr
|
100
|
+
|
87
101
|
for page in page_batch:
|
88
102
|
assert page._backend is not None
|
89
103
|
if not page._backend.is_valid():
|
90
104
|
yield page
|
91
105
|
else:
|
92
106
|
with TimeRecorder(conv_res, "ocr"):
|
93
|
-
|
94
107
|
assert self.reader is not None
|
95
108
|
|
96
109
|
ocr_rects = self.get_ocr_rects(page)
|
@@ -106,20 +119,55 @@ class TesseractOcrModel(BaseOcrModel):
|
|
106
119
|
|
107
120
|
# Retrieve text snippets with their bounding boxes
|
108
121
|
self.reader.SetImage(high_res_image)
|
109
|
-
|
122
|
+
|
123
|
+
if self.options.lang == ["auto"]:
|
124
|
+
osd = self.reader.DetectOrientationScript()
|
125
|
+
|
126
|
+
# No text, probably
|
127
|
+
if osd is None:
|
128
|
+
continue
|
129
|
+
|
130
|
+
script = osd["script_name"]
|
131
|
+
|
132
|
+
if script == "Katakana" or script == "Hiragana":
|
133
|
+
script = "Japanese"
|
134
|
+
elif script == "Han":
|
135
|
+
script = "HanS"
|
136
|
+
elif script == "Korean":
|
137
|
+
script = "Hangul"
|
138
|
+
|
139
|
+
_log.debug(
|
140
|
+
f'Using model for the detected script "{script}"'
|
141
|
+
)
|
142
|
+
|
143
|
+
if script not in self.script_readers:
|
144
|
+
self.script_readers[script] = tesserocr.PyTessBaseAPI(
|
145
|
+
path=self.reader.GetDatapath(),
|
146
|
+
lang=f"{self.script_prefix}{script}",
|
147
|
+
psm=tesserocr.PSM.AUTO,
|
148
|
+
init=True,
|
149
|
+
oem=tesserocr.OEM.DEFAULT,
|
150
|
+
)
|
151
|
+
|
152
|
+
local_reader = self.script_readers[script]
|
153
|
+
local_reader.SetImage(high_res_image)
|
154
|
+
else:
|
155
|
+
local_reader = self.reader
|
156
|
+
|
157
|
+
boxes = local_reader.GetComponentImages(
|
110
158
|
self.reader_RIL.TEXTLINE, True
|
111
159
|
)
|
112
160
|
|
113
161
|
cells = []
|
114
162
|
for ix, (im, box, _, _) in enumerate(boxes):
|
115
163
|
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
116
|
-
|
164
|
+
local_reader.SetRectangle(
|
117
165
|
box["x"], box["y"], box["w"], box["h"]
|
118
166
|
)
|
119
167
|
|
120
168
|
# Extract text within the bounding box
|
121
|
-
text =
|
122
|
-
confidence =
|
169
|
+
text = local_reader.GetUTF8Text().strip()
|
170
|
+
confidence = local_reader.MeanTextConf()
|
123
171
|
left = box["x"] / self.scale
|
124
172
|
bottom = box["y"] / self.scale
|
125
173
|
right = (box["x"] + box["w"]) / self.scale
|
@@ -3,7 +3,7 @@ import logging
|
|
3
3
|
import time
|
4
4
|
import traceback
|
5
5
|
from abc import ABC, abstractmethod
|
6
|
-
from typing import Callable, Iterable, List
|
6
|
+
from typing import Any, Callable, Iterable, List
|
7
7
|
|
8
8
|
from docling_core.types.doc import DoclingDocument, NodeItem
|
9
9
|
|
@@ -18,7 +18,7 @@ from docling.datamodel.base_models import (
|
|
18
18
|
from docling.datamodel.document import ConversionResult, InputDocument
|
19
19
|
from docling.datamodel.pipeline_options import PipelineOptions
|
20
20
|
from docling.datamodel.settings import settings
|
21
|
-
from docling.models.base_model import
|
21
|
+
from docling.models.base_model import GenericEnrichmentModel
|
22
22
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
23
23
|
from docling.utils.utils import chunkify
|
24
24
|
|
@@ -28,8 +28,9 @@ _log = logging.getLogger(__name__)
|
|
28
28
|
class BasePipeline(ABC):
|
29
29
|
def __init__(self, pipeline_options: PipelineOptions):
|
30
30
|
self.pipeline_options = pipeline_options
|
31
|
+
self.keep_images = False
|
31
32
|
self.build_pipe: List[Callable] = []
|
32
|
-
self.enrichment_pipe: List[
|
33
|
+
self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
|
33
34
|
|
34
35
|
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
|
35
36
|
conv_res = ConversionResult(input=in_doc)
|
@@ -40,7 +41,7 @@ class BasePipeline(ABC):
|
|
40
41
|
conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
|
41
42
|
):
|
42
43
|
# These steps are building and assembling the structure of the
|
43
|
-
# output DoclingDocument
|
44
|
+
# output DoclingDocument.
|
44
45
|
conv_res = self._build_document(conv_res)
|
45
46
|
conv_res = self._assemble_document(conv_res)
|
46
47
|
# From this stage, all operations should rely only on conv_res.output
|
@@ -50,6 +51,8 @@ class BasePipeline(ABC):
|
|
50
51
|
conv_res.status = ConversionStatus.FAILURE
|
51
52
|
if raises_on_error:
|
52
53
|
raise e
|
54
|
+
finally:
|
55
|
+
self._unload(conv_res)
|
53
56
|
|
54
57
|
return conv_res
|
55
58
|
|
@@ -62,21 +65,22 @@ class BasePipeline(ABC):
|
|
62
65
|
|
63
66
|
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
64
67
|
|
65
|
-
def
|
66
|
-
|
68
|
+
def _prepare_elements(
|
69
|
+
conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
|
67
70
|
) -> Iterable[NodeItem]:
|
68
|
-
for
|
69
|
-
|
70
|
-
|
71
|
+
for doc_element, _level in conv_res.document.iterate_items():
|
72
|
+
prepared_element = model.prepare_element(
|
73
|
+
conv_res=conv_res, element=doc_element
|
74
|
+
)
|
75
|
+
if prepared_element is not None:
|
76
|
+
yield prepared_element
|
71
77
|
|
72
78
|
with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
|
73
79
|
for model in self.enrichment_pipe:
|
74
80
|
for element_batch in chunkify(
|
75
|
-
|
81
|
+
_prepare_elements(conv_res, model),
|
76
82
|
settings.perf.elements_batch_size,
|
77
83
|
):
|
78
|
-
# TODO: currently we assume the element itself is modified, because
|
79
|
-
# we don't have an interface to save the element back to the document
|
80
84
|
for element in model(
|
81
85
|
doc=conv_res.document, element_batch=element_batch
|
82
86
|
): # Must exhaust!
|
@@ -88,6 +92,9 @@ class BasePipeline(ABC):
|
|
88
92
|
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
89
93
|
pass
|
90
94
|
|
95
|
+
def _unload(self, conv_res: ConversionResult):
|
96
|
+
pass
|
97
|
+
|
91
98
|
@classmethod
|
92
99
|
@abstractmethod
|
93
100
|
def get_default_options(cls) -> PipelineOptions:
|
@@ -107,6 +114,10 @@ class BasePipeline(ABC):
|
|
107
114
|
|
108
115
|
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
109
116
|
|
117
|
+
def __init__(self, pipeline_options: PipelineOptions):
|
118
|
+
super().__init__(pipeline_options)
|
119
|
+
self.keep_backend = False
|
120
|
+
|
110
121
|
def _apply_on_pages(
|
111
122
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
112
123
|
) -> Iterable[Page]:
|
@@ -148,7 +159,14 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
148
159
|
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
149
160
|
|
150
161
|
for p in pipeline_pages: # Must exhaust!
|
151
|
-
|
162
|
+
|
163
|
+
# Cleanup cached images
|
164
|
+
if not self.keep_images:
|
165
|
+
p._image_cache = {}
|
166
|
+
|
167
|
+
# Cleanup page backends
|
168
|
+
if not self.keep_backend and p._backend is not None:
|
169
|
+
p._backend.unload()
|
152
170
|
|
153
171
|
end_batch_time = time.monotonic()
|
154
172
|
total_elapsed_time += end_batch_time - start_batch_time
|
@@ -177,10 +195,15 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
177
195
|
)
|
178
196
|
raise e
|
179
197
|
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
198
|
+
return conv_res
|
199
|
+
|
200
|
+
def _unload(self, conv_res: ConversionResult) -> ConversionResult:
|
201
|
+
for page in conv_res.pages:
|
202
|
+
if page._backend is not None:
|
203
|
+
page._backend.unload()
|
204
|
+
|
205
|
+
if conv_res.input._backend:
|
206
|
+
conv_res.input._backend.unload()
|
184
207
|
|
185
208
|
return conv_res
|
186
209
|
|
@@ -18,6 +18,11 @@ from docling.datamodel.pipeline_options import (
|
|
18
18
|
TesseractOcrOptions,
|
19
19
|
)
|
20
20
|
from docling.models.base_ocr_model import BaseOcrModel
|
21
|
+
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
22
|
+
from docling.models.document_picture_classifier import (
|
23
|
+
DocumentPictureClassifier,
|
24
|
+
DocumentPictureClassifierOptions,
|
25
|
+
)
|
21
26
|
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
22
27
|
from docling.models.easyocr_model import EasyOcrModel
|
23
28
|
from docling.models.layout_model import LayoutModel
|
@@ -50,7 +55,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
50
55
|
else:
|
51
56
|
self.artifacts_path = Path(pipeline_options.artifacts_path)
|
52
57
|
|
53
|
-
keep_images = (
|
58
|
+
self.keep_images = (
|
54
59
|
self.pipeline_options.generate_page_images
|
55
60
|
or self.pipeline_options.generate_picture_images
|
56
61
|
or self.pipeline_options.generate_table_images
|
@@ -87,13 +92,37 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
87
92
|
accelerator_options=pipeline_options.accelerator_options,
|
88
93
|
),
|
89
94
|
# Page assemble
|
90
|
-
PageAssembleModel(options=PageAssembleOptions(
|
95
|
+
PageAssembleModel(options=PageAssembleOptions()),
|
91
96
|
]
|
92
97
|
|
93
98
|
self.enrichment_pipe = [
|
94
99
|
# Other models working on `NodeItem` elements in the DoclingDocument
|
100
|
+
# Code Formula Enrichment Model
|
101
|
+
CodeFormulaModel(
|
102
|
+
enabled=pipeline_options.do_code_enrichment
|
103
|
+
or pipeline_options.do_formula_enrichment,
|
104
|
+
artifacts_path=pipeline_options.artifacts_path,
|
105
|
+
options=CodeFormulaModelOptions(
|
106
|
+
do_code_enrichment=pipeline_options.do_code_enrichment,
|
107
|
+
do_formula_enrichment=pipeline_options.do_formula_enrichment,
|
108
|
+
),
|
109
|
+
accelerator_options=pipeline_options.accelerator_options,
|
110
|
+
),
|
111
|
+
# Document Picture Classifier
|
112
|
+
DocumentPictureClassifier(
|
113
|
+
enabled=pipeline_options.do_picture_classification,
|
114
|
+
artifacts_path=pipeline_options.artifacts_path,
|
115
|
+
options=DocumentPictureClassifierOptions(),
|
116
|
+
accelerator_options=pipeline_options.accelerator_options,
|
117
|
+
),
|
95
118
|
]
|
96
119
|
|
120
|
+
if (
|
121
|
+
self.pipeline_options.do_formula_enrichment
|
122
|
+
or self.pipeline_options.do_code_enrichment
|
123
|
+
):
|
124
|
+
self.keep_backend = True
|
125
|
+
|
97
126
|
@staticmethod
|
98
127
|
def download_models_hf(
|
99
128
|
local_dir: Optional[Path] = None, force: bool = False
|
docling/utils/glm_utils.py
CHANGED
@@ -270,7 +270,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|
270
270
|
container_el = doc.add_group(label=group_label)
|
271
271
|
|
272
272
|
_add_child_elements(container_el, doc, obj, pelem)
|
273
|
-
|
274
273
|
elif "text" in obj:
|
275
274
|
text = obj["text"][span_i:span_j]
|
276
275
|
|
@@ -304,6 +303,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|
304
303
|
current_list = None
|
305
304
|
|
306
305
|
doc.add_heading(text=text, prov=prov)
|
306
|
+
elif label == DocItemLabel.CODE:
|
307
|
+
current_list = None
|
308
|
+
|
309
|
+
doc.add_code(text=text, prov=prov)
|
307
310
|
else:
|
308
311
|
current_list = None
|
309
312
|
|
@@ -0,0 +1,80 @@
|
|
1
|
+
from docling_core.types.doc import DocItemLabel
|
2
|
+
from PIL import Image, ImageDraw, ImageFont
|
3
|
+
from PIL.ImageFont import FreeTypeFont
|
4
|
+
|
5
|
+
from docling.datamodel.base_models import Cluster
|
6
|
+
|
7
|
+
|
8
|
+
def draw_clusters(
|
9
|
+
image: Image.Image, clusters: list[Cluster], scale_x: float, scale_y: float
|
10
|
+
) -> None:
|
11
|
+
"""
|
12
|
+
Draw clusters on an image
|
13
|
+
"""
|
14
|
+
draw = ImageDraw.Draw(image, "RGBA")
|
15
|
+
# Create a smaller font for the labels
|
16
|
+
font: ImageFont.ImageFont | FreeTypeFont
|
17
|
+
try:
|
18
|
+
font = ImageFont.truetype("arial.ttf", 12)
|
19
|
+
except OSError:
|
20
|
+
# Fallback to default font if arial is not available
|
21
|
+
font = ImageFont.load_default()
|
22
|
+
for c_tl in clusters:
|
23
|
+
all_clusters = [c_tl, *c_tl.children]
|
24
|
+
for c in all_clusters:
|
25
|
+
# Draw cells first (underneath)
|
26
|
+
cell_color = (0, 0, 0, 40) # Transparent black for cells
|
27
|
+
for tc in c.cells:
|
28
|
+
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
|
29
|
+
cx0 *= scale_x
|
30
|
+
cx1 *= scale_x
|
31
|
+
cy0 *= scale_x
|
32
|
+
cy1 *= scale_y
|
33
|
+
|
34
|
+
draw.rectangle(
|
35
|
+
[(cx0, cy0), (cx1, cy1)],
|
36
|
+
outline=None,
|
37
|
+
fill=cell_color,
|
38
|
+
)
|
39
|
+
# Draw cluster rectangle
|
40
|
+
x0, y0, x1, y1 = c.bbox.as_tuple()
|
41
|
+
x0 *= scale_x
|
42
|
+
x1 *= scale_x
|
43
|
+
y0 *= scale_x
|
44
|
+
y1 *= scale_y
|
45
|
+
|
46
|
+
cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
|
47
|
+
cluster_outline_color = (
|
48
|
+
*list(DocItemLabel.get_color(c.label)),
|
49
|
+
255,
|
50
|
+
)
|
51
|
+
draw.rectangle(
|
52
|
+
[(x0, y0), (x1, y1)],
|
53
|
+
outline=cluster_outline_color,
|
54
|
+
fill=cluster_fill_color,
|
55
|
+
)
|
56
|
+
# Add label name and confidence
|
57
|
+
label_text = f"{c.label.name} ({c.confidence:.2f})"
|
58
|
+
# Create semi-transparent background for text
|
59
|
+
text_bbox = draw.textbbox((x0, y0), label_text, font=font)
|
60
|
+
text_bg_padding = 2
|
61
|
+
draw.rectangle(
|
62
|
+
[
|
63
|
+
(
|
64
|
+
text_bbox[0] - text_bg_padding,
|
65
|
+
text_bbox[1] - text_bg_padding,
|
66
|
+
),
|
67
|
+
(
|
68
|
+
text_bbox[2] + text_bg_padding,
|
69
|
+
text_bbox[3] + text_bg_padding,
|
70
|
+
),
|
71
|
+
],
|
72
|
+
fill=(255, 255, 255, 180), # Semi-transparent white
|
73
|
+
)
|
74
|
+
# Draw text
|
75
|
+
draw.text(
|
76
|
+
(x0, y0),
|
77
|
+
label_text,
|
78
|
+
fill=(0, 0, 0, 255), # Solid black
|
79
|
+
font=font,
|
80
|
+
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.15.1
|
3
|
+
Version: 2.16.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -26,9 +26,9 @@ Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
29
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
30
|
-
Requires-Dist: docling-ibm-models (>=3.
|
31
|
-
Requires-Dist: docling-parse (>=3.
|
29
|
+
Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
|
30
|
+
Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
|
31
|
+
Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
33
33
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
34
34
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -39,6 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
|
|
39
39
|
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
40
40
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
41
41
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
42
|
+
Requires-Dist: pillow (>=10.0.0,<11.0.0)
|
42
43
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
43
44
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
44
45
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
@@ -0,0 +1,61 @@
|
|
1
|
+
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
|
4
|
+
docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQQknNXk,14050
|
5
|
+
docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
|
6
|
+
docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
|
7
|
+
docling/backend/html_backend.py,sha256=vUEfx0h24gEaHO2taQyWNs8zCkDox7kopEeMbWBXss0,15560
|
8
|
+
docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
+
docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
|
10
|
+
docling/backend/md_backend.py,sha256=ajEooDWNnWPHnPQMgUDh-K44Ch1X-sTBHqa1xBp7yJs,14645
|
11
|
+
docling/backend/msexcel_backend.py,sha256=lyJc4ShJGAN2ZfNTTuhdYTF-44cZsGyn_8Djstp3IEU,12700
|
12
|
+
docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
|
13
|
+
docling/backend/msword_backend.py,sha256=WcQmRYmpH8o2snGoWGxNRkCtUI3mf2JL3-9CxAfDAJg,19232
|
14
|
+
docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
|
15
|
+
docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
|
16
|
+
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
|
18
|
+
docling/backend/xml/uspto_backend.py,sha256=2YsnB-WRARIAaHPL6gxHePP24GQGi-Up2_K8ZapD3k4,70974
|
19
|
+
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
20
|
+
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
+
docling/cli/main.py,sha256=NR7NEt8Sf3FE9D7sHpEmABM9mFMTMO5w0VPwYIIvVsk,15481
|
22
|
+
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
+
docling/datamodel/base_models.py,sha256=vewP1X99qfAwiUsiC2m8CBDGiQPsGyp_WkKJHYPoYn4,7026
|
24
|
+
docling/datamodel/document.py,sha256=R748mLCFai4MeiE8ougQrQVJF_16t3f4CUrrEes5AV0,13202
|
25
|
+
docling/datamodel/pipeline_options.py,sha256=GA5LwywfOkcBDvG2LhDHikqDQYlFlUPJa93tPSx-vFw,7820
|
26
|
+
docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
|
27
|
+
docling/document_converter.py,sha256=qtYPEkWuMUUGmFko2in38iSHdYrjAFf_GHNoXRRvEVs,12631
|
28
|
+
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
29
|
+
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
|
+
docling/models/base_model.py,sha256=H5X-exVaAN-XMTzxpgUc-rwH-D8Uk7-VuZtq2soNGXI,2567
|
31
|
+
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
32
|
+
docling/models/code_formula_model.py,sha256=bOIKJvckZ0QpnDZ-CDiYv-CvuGvaGzJgp2PiYAidKBQ,8422
|
33
|
+
docling/models/document_picture_classifier.py,sha256=RLB80ueqWZ86hdXtTKmSynCU13nT-T10vUp2sky9110,6078
|
34
|
+
docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
|
35
|
+
docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
|
36
|
+
docling/models/layout_model.py,sha256=3Fw7OM6g0j7NgItKsQOgFOCd1q6lp1DacN_db7f6QCw,6090
|
37
|
+
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
38
|
+
docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
|
39
|
+
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
40
|
+
docling/models/rapid_ocr_model.py,sha256=LOIvczJs3_db2o8mtrKk-pIXgC-xqWqRLu2cjA3wvy4,4980
|
41
|
+
docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
|
42
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=FP9cnSkSyj6-EETHtabV720Fr3x9K_oBP2UuJi4VUwE,6621
|
43
|
+
docling/models/tesseract_ocr_model.py,sha256=N27xjo8aPb5x276wKHkf_6VFwJObfosdHLo5_hCuf94,8055
|
44
|
+
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
45
|
+
docling/pipeline/base_pipeline.py,sha256=J0ZjtincsJr-BbRgqoQozxIhDWxWFlWaS9CTPwypJFk,8621
|
46
|
+
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
47
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=Qefg1JSiFwipypi8TZPJ50WgXTLjwkC0wvYAl02RM2o,10480
|
48
|
+
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
49
|
+
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
|
+
docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
|
51
|
+
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
52
|
+
docling/utils/glm_utils.py,sha256=Nfxdx0W-sl1owYncTeJmZdiPcn-jpTqK8f8TeQlDOMY,11683
|
53
|
+
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
54
|
+
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
55
|
+
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
56
|
+
docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
|
57
|
+
docling-2.16.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
58
|
+
docling-2.16.0.dist-info/METADATA,sha256=wJgRO2R9Szl69jFE8gj-VGIBpkwwMWPfgytz9nDsT_E,7780
|
59
|
+
docling-2.16.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
60
|
+
docling-2.16.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
61
|
+
docling-2.16.0.dist-info/RECORD,,
|
docling-2.15.1.dist-info/RECORD
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
|
4
|
-
docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
|
5
|
-
docling/backend/docling_parse_backend.py,sha256=cJLkuOmfCtshRrwsv7WWayRNeMQASZv76v3nUHucqgM,7636
|
6
|
-
docling/backend/docling_parse_v2_backend.py,sha256=-lLsorxhK_Awrql_zXPen2LX0Gt9UvcDLMcmXf7_LKc,8642
|
7
|
-
docling/backend/html_backend.py,sha256=O8qXaw7MzOIdaxbBcjHieM9Ce4GEdtBj9YW0vpJspuA,15560
|
8
|
-
docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
|
9
|
-
docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
|
10
|
-
docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
|
11
|
-
docling/backend/msword_backend.py,sha256=K1D_h0ulLA6KQsPe62327cDVkQqV1f7EetCHo66wCKw,19233
|
12
|
-
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
13
|
-
docling/backend/pypdfium2_backend.py,sha256=Exb3NBp3x2YSLoNfmXq4NefShgooJXsxTXrJ4JbTzcc,9001
|
14
|
-
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
|
16
|
-
docling/backend/xml/uspto_backend.py,sha256=2YsnB-WRARIAaHPL6gxHePP24GQGi-Up2_K8ZapD3k4,70974
|
17
|
-
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
18
|
-
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
-
docling/cli/main.py,sha256=NR7NEt8Sf3FE9D7sHpEmABM9mFMTMO5w0VPwYIIvVsk,15481
|
20
|
-
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
docling/datamodel/base_models.py,sha256=50Jf5zk9c4-zmnOzZLoPBnHQhTX0_OFQzIkKgnKK1o4,6229
|
22
|
-
docling/datamodel/document.py,sha256=OHM6bm0a-62xnAZ8DFlMHzATmbgNcfMxQoQO2udaW5Q,13071
|
23
|
-
docling/datamodel/pipeline_options.py,sha256=wKFzw8sAim6emQGsjuS12n7FfpMo8HVNoMOPhkXTkVo,7734
|
24
|
-
docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
|
25
|
-
docling/document_converter.py,sha256=_pk0sHuPXJ14NEutatf5bK2VyNiU5cvYsVbh1HIgrIw,12431
|
26
|
-
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
27
|
-
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
|
-
docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
|
29
|
-
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
30
|
-
docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
|
31
|
-
docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
|
32
|
-
docling/models/layout_model.py,sha256=Xo8sclRTOO_V8Cr4RwuxB67vSWKF0LZ5nJRYU1WI--k,9063
|
33
|
-
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
34
|
-
docling/models/page_assemble_model.py,sha256=qdEX0AIb76ZOqJV6O9j-7r67WmuIkUlwbb2PsL7eFK4,7608
|
35
|
-
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
36
|
-
docling/models/rapid_ocr_model.py,sha256=LOIvczJs3_db2o8mtrKk-pIXgC-xqWqRLu2cjA3wvy4,4980
|
37
|
-
docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
|
38
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
|
39
|
-
docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
|
40
|
-
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
41
|
-
docling/pipeline/base_pipeline.py,sha256=W1HgNp0vPp8vVY0vTX47Xe4m7j0lWT8XroCkTx5PyMw,7949
|
42
|
-
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
43
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=iXjVLy-9q82jrU_0AZTkbz3ccrqz4WiRLYD-epxG5BQ,9174
|
44
|
-
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
45
|
-
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
46
|
-
docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
|
47
|
-
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
48
|
-
docling/utils/glm_utils.py,sha256=IB19wToGath97gD3jAA3G_rQSptnZKhQCWLvPUCnkww,11551
|
49
|
-
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
50
|
-
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
51
|
-
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
52
|
-
docling-2.15.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
53
|
-
docling-2.15.1.dist-info/METADATA,sha256=6WRzA633us43nw7RHwhX_jwizh2JSpGWxNh0pJq2ZYs,7739
|
54
|
-
docling-2.15.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
55
|
-
docling-2.15.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
56
|
-
docling-2.15.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|