docling 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +0 -4
- docling/backend/html_backend.py +53 -56
- docling/backend/md_backend.py +59 -6
- docling/backend/msword_backend.py +9 -15
- docling/datamodel/base_models.py +1 -1
- docling/datamodel/document.py +3 -1
- docling/datamodel/settings.py +15 -1
- docling/document_converter.py +12 -8
- docling/models/base_model.py +4 -1
- docling/models/base_ocr_model.py +21 -4
- docling/models/ds_glm_model.py +27 -11
- docling/models/easyocr_model.py +49 -39
- docling/models/layout_model.py +87 -61
- docling/models/page_assemble_model.py +102 -100
- docling/models/page_preprocessing_model.py +25 -7
- docling/models/table_structure_model.py +125 -90
- docling/models/tesseract_ocr_cli_model.py +62 -52
- docling/models/tesseract_ocr_model.py +57 -45
- docling/pipeline/base_pipeline.py +68 -69
- docling/pipeline/simple_pipeline.py +8 -11
- docling/pipeline/standard_pdf_pipeline.py +59 -56
- docling/utils/profiling.py +62 -0
- {docling-2.2.0.dist-info → docling-2.3.0.dist-info}/METADATA +5 -4
- docling-2.3.0.dist-info/RECORD +45 -0
- docling-2.2.0.dist-info/RECORD +0 -44
- {docling-2.2.0.dist-info → docling-2.3.0.dist-info}/LICENSE +0 -0
- {docling-2.2.0.dist-info → docling-2.3.0.dist-info}/WHEEL +0 -0
- {docling-2.2.0.dist-info → docling-2.3.0.dist-info}/entry_points.txt +0 -0
@@ -19,6 +19,7 @@ from docling.datamodel.document import ConversionResult, InputDocument
|
|
19
19
|
from docling.datamodel.pipeline_options import PipelineOptions
|
20
20
|
from docling.datamodel.settings import settings
|
21
21
|
from docling.models.base_model import BaseEnrichmentModel
|
22
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
22
23
|
from docling.utils.utils import chunkify
|
23
24
|
|
24
25
|
_log = logging.getLogger(__name__)
|
@@ -35,13 +36,16 @@ class BasePipeline(ABC):
|
|
35
36
|
|
36
37
|
_log.info(f"Processing document {in_doc.file.name}")
|
37
38
|
try:
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
39
|
+
with TimeRecorder(
|
40
|
+
conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
|
41
|
+
):
|
42
|
+
# These steps are building and assembling the structure of the
|
43
|
+
# output DoclingDocument
|
44
|
+
conv_res = self._build_document(conv_res)
|
45
|
+
conv_res = self._assemble_document(conv_res)
|
46
|
+
# From this stage, all operations should rely only on conv_res.output
|
47
|
+
conv_res = self._enrich_document(conv_res)
|
48
|
+
conv_res.status = self._determine_status(conv_res)
|
45
49
|
except Exception as e:
|
46
50
|
conv_res.status = ConversionStatus.FAILURE
|
47
51
|
if raises_on_error:
|
@@ -50,19 +54,13 @@ class BasePipeline(ABC):
|
|
50
54
|
return conv_res
|
51
55
|
|
52
56
|
@abstractmethod
|
53
|
-
def _build_document(
|
54
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
55
|
-
) -> ConversionResult:
|
57
|
+
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
56
58
|
pass
|
57
59
|
|
58
|
-
def _assemble_document(
|
59
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
60
|
-
) -> ConversionResult:
|
60
|
+
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
61
61
|
return conv_res
|
62
62
|
|
63
|
-
def _enrich_document(
|
64
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
65
|
-
) -> ConversionResult:
|
63
|
+
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
66
64
|
|
67
65
|
def _filter_elements(
|
68
66
|
doc: DoclingDocument, model: BaseEnrichmentModel
|
@@ -71,24 +69,23 @@ class BasePipeline(ABC):
|
|
71
69
|
if model.is_processable(doc=doc, element=element):
|
72
70
|
yield element
|
73
71
|
|
74
|
-
|
75
|
-
for
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
72
|
+
with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
|
73
|
+
for model in self.enrichment_pipe:
|
74
|
+
for element_batch in chunkify(
|
75
|
+
_filter_elements(conv_res.document, model),
|
76
|
+
settings.perf.elements_batch_size,
|
77
|
+
):
|
78
|
+
# TODO: currently we assume the element itself is modified, because
|
79
|
+
# we don't have an interface to save the element back to the document
|
80
|
+
for element in model(
|
81
|
+
doc=conv_res.document, element_batch=element_batch
|
82
|
+
): # Must exhaust!
|
83
|
+
pass
|
85
84
|
|
86
85
|
return conv_res
|
87
86
|
|
88
87
|
@abstractmethod
|
89
|
-
def _determine_status(
|
90
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
91
|
-
) -> ConversionStatus:
|
88
|
+
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
92
89
|
pass
|
93
90
|
|
94
91
|
@classmethod
|
@@ -110,66 +107,68 @@ class BasePipeline(ABC):
|
|
110
107
|
|
111
108
|
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
112
109
|
|
113
|
-
def _apply_on_pages(
|
110
|
+
def _apply_on_pages(
|
111
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
112
|
+
) -> Iterable[Page]:
|
114
113
|
for model in self.build_pipe:
|
115
|
-
page_batch = model(page_batch)
|
114
|
+
page_batch = model(conv_res, page_batch)
|
116
115
|
|
117
116
|
yield from page_batch
|
118
117
|
|
119
|
-
def _build_document(
|
120
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
121
|
-
) -> ConversionResult:
|
118
|
+
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
122
119
|
|
123
|
-
if not isinstance(
|
120
|
+
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
|
124
121
|
raise RuntimeError(
|
125
|
-
f"The selected backend {type(
|
122
|
+
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
|
126
123
|
f"Can not convert this with a PDF pipeline. "
|
127
124
|
f"Please check your format configuration on DocumentConverter."
|
128
125
|
)
|
129
126
|
# conv_res.status = ConversionStatus.FAILURE
|
130
127
|
# return conv_res
|
131
128
|
|
132
|
-
|
133
|
-
conv_res.pages.append(Page(page_no=i))
|
129
|
+
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
134
130
|
|
135
|
-
|
136
|
-
|
137
|
-
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
138
|
-
start_pb_time = time.time()
|
131
|
+
for i in range(0, conv_res.input.page_count):
|
132
|
+
conv_res.pages.append(Page(page_no=i))
|
139
133
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
134
|
+
try:
|
135
|
+
# Iterate batches of pages (page_batch_size) in the doc
|
136
|
+
for page_batch in chunkify(
|
137
|
+
conv_res.pages, settings.perf.page_batch_size
|
138
|
+
):
|
139
|
+
start_pb_time = time.time()
|
144
140
|
|
145
|
-
|
146
|
-
|
141
|
+
# 1. Initialise the page resources
|
142
|
+
init_pages = map(
|
143
|
+
functools.partial(self.initialize_page, conv_res), page_batch
|
144
|
+
)
|
147
145
|
|
148
|
-
|
149
|
-
|
146
|
+
# 2. Run pipeline stages
|
147
|
+
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
150
148
|
|
151
|
-
|
152
|
-
|
149
|
+
for p in pipeline_pages: # Must exhaust!
|
150
|
+
pass
|
153
151
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
152
|
+
end_pb_time = time.time() - start_pb_time
|
153
|
+
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
154
|
+
|
155
|
+
except Exception as e:
|
156
|
+
conv_res.status = ConversionStatus.FAILURE
|
157
|
+
trace = "\n".join(traceback.format_exception(e))
|
158
|
+
_log.warning(
|
159
|
+
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
|
160
|
+
f"{trace}"
|
161
|
+
)
|
162
|
+
raise e
|
162
163
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
164
|
+
finally:
|
165
|
+
# Always unload the PDF backend, even in case of failure
|
166
|
+
if conv_res.input._backend:
|
167
|
+
conv_res.input._backend.unload()
|
167
168
|
|
168
169
|
return conv_res
|
169
170
|
|
170
|
-
def _determine_status(
|
171
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
172
|
-
) -> ConversionStatus:
|
171
|
+
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
173
172
|
status = ConversionStatus.SUCCESS
|
174
173
|
for page in conv_res.pages:
|
175
174
|
if page._backend is None or not page._backend.is_valid():
|
@@ -186,5 +185,5 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
186
185
|
|
187
186
|
# Initialise and load resources for a page
|
188
187
|
@abstractmethod
|
189
|
-
def initialize_page(self,
|
188
|
+
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
190
189
|
pass
|
@@ -5,9 +5,10 @@ from docling.backend.abstract_backend import (
|
|
5
5
|
DeclarativeDocumentBackend,
|
6
6
|
)
|
7
7
|
from docling.datamodel.base_models import ConversionStatus
|
8
|
-
from docling.datamodel.document import ConversionResult
|
8
|
+
from docling.datamodel.document import ConversionResult
|
9
9
|
from docling.datamodel.pipeline_options import PipelineOptions
|
10
10
|
from docling.pipeline.base_pipeline import BasePipeline
|
11
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
11
12
|
|
12
13
|
_log = logging.getLogger(__name__)
|
13
14
|
|
@@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline):
|
|
22
23
|
def __init__(self, pipeline_options: PipelineOptions):
|
23
24
|
super().__init__(pipeline_options)
|
24
25
|
|
25
|
-
def _build_document(
|
26
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
27
|
-
) -> ConversionResult:
|
26
|
+
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
28
27
|
|
29
|
-
if not isinstance(
|
28
|
+
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
|
30
29
|
raise RuntimeError(
|
31
|
-
f"The selected backend {type(
|
30
|
+
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
|
32
31
|
f"Can not convert this with simple pipeline. "
|
33
32
|
f"Please check your format configuration on DocumentConverter."
|
34
33
|
)
|
@@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline):
|
|
38
37
|
# Instead of running a page-level pipeline to build up the document structure,
|
39
38
|
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
40
39
|
# a DoclingDocument straight.
|
41
|
-
|
42
|
-
|
40
|
+
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
41
|
+
conv_res.document = conv_res.input._backend.convert()
|
43
42
|
return conv_res
|
44
43
|
|
45
|
-
def _determine_status(
|
46
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
47
|
-
) -> ConversionStatus:
|
44
|
+
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
48
45
|
# This is called only if the previous steps didn't raise.
|
49
46
|
# Since we don't have anything else to evaluate, we can
|
50
47
|
# safely return SUCCESS.
|
@@ -7,7 +7,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
|
7
7
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
8
8
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
9
9
|
from docling.datamodel.base_models import AssembledUnit, Page
|
10
|
-
from docling.datamodel.document import ConversionResult
|
10
|
+
from docling.datamodel.document import ConversionResult
|
11
11
|
from docling.datamodel.pipeline_options import (
|
12
12
|
EasyOcrOptions,
|
13
13
|
PdfPipelineOptions,
|
@@ -27,6 +27,7 @@ from docling.models.table_structure_model import TableStructureModel
|
|
27
27
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
28
28
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
29
29
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
30
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
30
31
|
|
31
32
|
_log = logging.getLogger(__name__)
|
32
33
|
|
@@ -119,73 +120,75 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
119
120
|
)
|
120
121
|
return None
|
121
122
|
|
122
|
-
def initialize_page(self,
|
123
|
-
|
124
|
-
|
125
|
-
page.
|
123
|
+
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
124
|
+
with TimeRecorder(conv_res, "page_init"):
|
125
|
+
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
126
|
+
if page._backend is not None and page._backend.is_valid():
|
127
|
+
page.size = page._backend.get_size()
|
126
128
|
|
127
129
|
return page
|
128
130
|
|
129
|
-
def _assemble_document(
|
130
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
131
|
-
) -> ConversionResult:
|
131
|
+
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
132
132
|
all_elements = []
|
133
133
|
all_headers = []
|
134
134
|
all_body = []
|
135
135
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
conv_res.document = self.glm_model(conv_res)
|
136
|
+
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
137
|
+
for p in conv_res.pages:
|
138
|
+
if p.assembled is not None:
|
139
|
+
for el in p.assembled.body:
|
140
|
+
all_body.append(el)
|
141
|
+
for el in p.assembled.headers:
|
142
|
+
all_headers.append(el)
|
143
|
+
for el in p.assembled.elements:
|
144
|
+
all_elements.append(el)
|
145
|
+
|
146
|
+
conv_res.assembled = AssembledUnit(
|
147
|
+
elements=all_elements, headers=all_headers, body=all_body
|
148
|
+
)
|
150
149
|
|
151
|
-
|
152
|
-
if self.pipeline_options.generate_page_images:
|
153
|
-
for page in conv_res.pages:
|
154
|
-
assert page.image is not None
|
155
|
-
page_no = page.page_no + 1
|
156
|
-
conv_res.document.pages[page_no].image = ImageRef.from_pil(
|
157
|
-
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
158
|
-
)
|
150
|
+
conv_res.document = self.glm_model(conv_res)
|
159
151
|
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
or self.pipeline_options.generate_table_images
|
164
|
-
):
|
165
|
-
scale = self.pipeline_options.images_scale
|
166
|
-
for element, _level in conv_res.document.iterate_items():
|
167
|
-
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
168
|
-
continue
|
169
|
-
if (
|
170
|
-
isinstance(element, PictureItem)
|
171
|
-
and self.pipeline_options.generate_picture_images
|
172
|
-
) or (
|
173
|
-
isinstance(element, TableItem)
|
174
|
-
and self.pipeline_options.generate_table_images
|
175
|
-
):
|
176
|
-
page_ix = element.prov[0].page_no - 1
|
177
|
-
page = conv_res.pages[page_ix]
|
178
|
-
assert page.size is not None
|
152
|
+
# Generate page images in the output
|
153
|
+
if self.pipeline_options.generate_page_images:
|
154
|
+
for page in conv_res.pages:
|
179
155
|
assert page.image is not None
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
.bbox.scaled(scale=scale)
|
184
|
-
.to_top_left_origin(page_height=page.size.height * scale)
|
156
|
+
page_no = page.page_no + 1
|
157
|
+
conv_res.document.pages[page_no].image = ImageRef.from_pil(
|
158
|
+
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
185
159
|
)
|
186
160
|
|
187
|
-
|
188
|
-
|
161
|
+
# Generate images of the requested element types
|
162
|
+
if (
|
163
|
+
self.pipeline_options.generate_picture_images
|
164
|
+
or self.pipeline_options.generate_table_images
|
165
|
+
):
|
166
|
+
scale = self.pipeline_options.images_scale
|
167
|
+
for element, _level in conv_res.document.iterate_items():
|
168
|
+
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
169
|
+
continue
|
170
|
+
if (
|
171
|
+
isinstance(element, PictureItem)
|
172
|
+
and self.pipeline_options.generate_picture_images
|
173
|
+
) or (
|
174
|
+
isinstance(element, TableItem)
|
175
|
+
and self.pipeline_options.generate_table_images
|
176
|
+
):
|
177
|
+
page_ix = element.prov[0].page_no - 1
|
178
|
+
page = conv_res.pages[page_ix]
|
179
|
+
assert page.size is not None
|
180
|
+
assert page.image is not None
|
181
|
+
|
182
|
+
crop_bbox = (
|
183
|
+
element.prov[0]
|
184
|
+
.bbox.scaled(scale=scale)
|
185
|
+
.to_top_left_origin(page_height=page.size.height * scale)
|
186
|
+
)
|
187
|
+
|
188
|
+
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
189
|
+
element.image = ImageRef.from_pil(
|
190
|
+
cropped_im, dpi=int(72 * scale)
|
191
|
+
)
|
189
192
|
|
190
193
|
return conv_res
|
191
194
|
|
@@ -0,0 +1,62 @@
|
|
1
|
+
import time
|
2
|
+
from datetime import datetime
|
3
|
+
from enum import Enum
|
4
|
+
from typing import TYPE_CHECKING, List
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
from pydantic import BaseModel
|
8
|
+
|
9
|
+
from docling.datamodel.settings import settings
|
10
|
+
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
from docling.datamodel.document import ConversionResult
|
13
|
+
|
14
|
+
|
15
|
+
class ProfilingScope(str, Enum):
|
16
|
+
PAGE = "page"
|
17
|
+
DOCUMENT = "document"
|
18
|
+
|
19
|
+
|
20
|
+
class ProfilingItem(BaseModel):
|
21
|
+
scope: ProfilingScope
|
22
|
+
count: int = 0
|
23
|
+
times: List[float] = []
|
24
|
+
start_timestamps: List[datetime] = []
|
25
|
+
|
26
|
+
def avg(self) -> float:
|
27
|
+
return np.average(self.times) # type: ignore
|
28
|
+
|
29
|
+
def std(self) -> float:
|
30
|
+
return np.std(self.times) # type: ignore
|
31
|
+
|
32
|
+
def mean(self) -> float:
|
33
|
+
return np.mean(self.times) # type: ignore
|
34
|
+
|
35
|
+
def percentile(self, perc: float) -> float:
|
36
|
+
return np.percentile(self.times, perc) # type: ignore
|
37
|
+
|
38
|
+
|
39
|
+
class TimeRecorder:
|
40
|
+
def __init__(
|
41
|
+
self,
|
42
|
+
conv_res: "ConversionResult",
|
43
|
+
key: str,
|
44
|
+
scope: ProfilingScope = ProfilingScope.PAGE,
|
45
|
+
):
|
46
|
+
if settings.debug.profile_pipeline_timings:
|
47
|
+
if key not in conv_res.timings.keys():
|
48
|
+
conv_res.timings[key] = ProfilingItem(scope=scope)
|
49
|
+
self.conv_res = conv_res
|
50
|
+
self.key = key
|
51
|
+
|
52
|
+
def __enter__(self):
|
53
|
+
if settings.debug.profile_pipeline_timings:
|
54
|
+
self.start = time.monotonic()
|
55
|
+
self.conv_res.timings[self.key].start_timestamps.append(datetime.utcnow())
|
56
|
+
return self
|
57
|
+
|
58
|
+
def __exit__(self, *args):
|
59
|
+
if settings.debug.profile_pipeline_timings:
|
60
|
+
elapsed = time.monotonic() - self.start
|
61
|
+
self.conv_res.timings[self.key].times.append(elapsed)
|
62
|
+
self.conv_res.timings[self.key].count += 1
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.3.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -23,7 +23,7 @@ Provides-Extra: tesserocr
|
|
23
23
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
24
24
|
Requires-Dist: certifi (>=2024.7.4)
|
25
25
|
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
26
|
-
Requires-Dist: docling-core (>=2.
|
26
|
+
Requires-Dist: docling-core (>=2.2.3,<3.0.0)
|
27
27
|
Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
|
28
28
|
Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
|
29
29
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -73,8 +73,9 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
73
73
|
|
74
74
|
## Features
|
75
75
|
|
76
|
-
* 🗂️
|
77
|
-
* 📑 Advanced PDF document understanding
|
76
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
|
77
|
+
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
78
|
+
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
78
79
|
* 📝 Metadata extraction, including title, authors, references & language
|
79
80
|
* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
|
80
81
|
* 🔍 OCR support for scanned PDFs
|
@@ -0,0 +1,45 @@
|
|
1
|
+
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
|
4
|
+
docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
|
5
|
+
docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
|
6
|
+
docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
|
7
|
+
docling/backend/html_backend.py,sha256=p3WlYta1f3e4osmvVR12KIUYLJimveTX8UwEkyPt7_g,15161
|
8
|
+
docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
|
9
|
+
docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
|
10
|
+
docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpRwhxI,17230
|
11
|
+
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
12
|
+
docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
|
13
|
+
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
|
15
|
+
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
docling/datamodel/base_models.py,sha256=fmkS6iTxGZCTtNCo2zsgMmBC11Ogf2Ht-mNIlZ9GP-o,5375
|
17
|
+
docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
|
18
|
+
docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
|
19
|
+
docling/datamodel/settings.py,sha256=2-sYEnKLV_giGygUlBtiBd4CJYN5T9-3BdL6NpWkUYw,1155
|
20
|
+
docling/document_converter.py,sha256=Y0Tngh-seNSty7Ov71DDAJzbBgruoEdwYPunVn7DT00,10413
|
21
|
+
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
+
docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
|
23
|
+
docling/models/base_ocr_model.py,sha256=Ti0glL-_DVRfmP3MpywYVmkNf5RP6qhRg_UKzJuV1Dc,5663
|
24
|
+
docling/models/ds_glm_model.py,sha256=2OpWW8MMzCIshrtP36gDSRPYOCjv1ex34FqxD2nYjP4,11986
|
25
|
+
docling/models/easyocr_model.py,sha256=23hWq484qVS3nkch6nRRWowfQamN-McFZgfbHfp5Vuo,3818
|
26
|
+
docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
|
27
|
+
docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
|
28
|
+
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
29
|
+
docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
|
30
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=ZflwQcD7YjhPqEB8bbgNgP14OBD4NNEJefUS8Lbr5X0,6511
|
31
|
+
docling/models/tesseract_ocr_model.py,sha256=AccCgaYNzGryiJnkwR4sv2FeOdlSgO3uspdQOmo1sNY,5569
|
32
|
+
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
33
|
+
docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
|
34
|
+
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
35
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=h59eA0CLMYuuJoH-0SyCRkYEregNs6i0pa46Ioqf8kU,7947
|
36
|
+
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
|
+
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
38
|
+
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
39
|
+
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
40
|
+
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
41
|
+
docling-2.3.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
42
|
+
docling-2.3.0.dist-info/METADATA,sha256=e3LTQgbktuUHzQlI4qXDhIDMGOX0duC1EJWws6j6_y8,6373
|
43
|
+
docling-2.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
44
|
+
docling-2.3.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
45
|
+
docling-2.3.0.dist-info/RECORD,,
|
docling-2.2.0.dist-info/RECORD
DELETED
@@ -1,44 +0,0 @@
|
|
1
|
-
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
|
4
|
-
docling/backend/asciidoc_backend.py,sha256=WW0eIanPIObcg5ci9YcnqFxwipmqRFsRY8zjZDdKvJA,14116
|
5
|
-
docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
|
6
|
-
docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
|
7
|
-
docling/backend/html_backend.py,sha256=wfh5PWEwoqsCXxFCQbFBdJvEtlqZhXgqfPfTYETWHfE,14974
|
8
|
-
docling/backend/md_backend.py,sha256=osYiNLnep9UgLq8mUH9bmwG3kP9RXxt69I8LlyeJN6g,11505
|
9
|
-
docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
|
10
|
-
docling/backend/msword_backend.py,sha256=6bY0ebOaeSbpskUJY5t5pOf4a2VclWzeHeSo-vzsaO0,17470
|
11
|
-
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
12
|
-
docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
|
13
|
-
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
|
15
|
-
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
-
docling/datamodel/base_models.py,sha256=Mx0xR6YmRP8thu8CjOxjbGHLUJctqIvFwRZQ-8tQowY,5380
|
17
|
-
docling/datamodel/document.py,sha256=mkPXDms9jtPFY1pfBSicNaVRZwbbfzYFUj0dJDbMgG8,20612
|
18
|
-
docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
|
19
|
-
docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
|
20
|
-
docling/document_converter.py,sha256=T-Y2pWwbCIofW209XJ3wlc5TiGeQqMbDqgzcVWyZ_0Y,10227
|
21
|
-
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
-
docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
|
23
|
-
docling/models/base_ocr_model.py,sha256=SYelQRValiUo6M_p_9-J7CqNIOFO-EkK58j90SMsKQY,5028
|
24
|
-
docling/models/ds_glm_model.py,sha256=vJLngchZonqFzGWbUr2izFSXk9DloPDhAfN2c3nkzNU,11254
|
25
|
-
docling/models/easyocr_model.py,sha256=YfvdodjZ20WuOfouQXJmDyQL78QDOqWYsWSs2zSxWFc,3327
|
26
|
-
docling/models/layout_model.py,sha256=zd2ULW3U6v9OJl4TnjWFEY6Q2O-lBfrIqtvrnDzF7HU,12596
|
27
|
-
docling/models/page_assemble_model.py,sha256=LOKHho-r-RpeIVh8CpJ9tid_QIp5um3ukcrucZsyUlY,6645
|
28
|
-
docling/models/page_preprocessing_model.py,sha256=cfhUIlGAGaX1RxILi69ZEV9Kmhhd3Y0XaSlQnGo18o4,1964
|
29
|
-
docling/models/table_structure_model.py,sha256=YWSZKOz56gvicjTzVgSE-8Z_hI3NcRD5EN0yOUoM-_g,6979
|
30
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=fKc05V73ibMvAeuA4PForhYNtunpT5rR0k_xHZsew-E,5980
|
31
|
-
docling/models/tesseract_ocr_model.py,sha256=v6td0vq8NogePuRTJRZhKF0DtZXITj70r9rKJKO5u9k,4984
|
32
|
-
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
33
|
-
docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
|
34
|
-
docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
|
35
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=AVNSxGc6kPmBPDLWDc9eI8fryc25eOtiIVrOyVhZMZM,7527
|
36
|
-
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
|
-
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
38
|
-
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
39
|
-
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
40
|
-
docling-2.2.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
41
|
-
docling-2.2.0.dist-info/METADATA,sha256=TkaywA2l2ImdMc9WpUYWUQy3n50zG9Y9eC7ziElBlU0,6205
|
42
|
-
docling-2.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
43
|
-
docling-2.2.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
44
|
-
docling-2.2.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|