docling 2.10.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/cli/main.py +10 -16
- docling/datamodel/base_models.py +12 -12
- docling/datamodel/pipeline_options.py +22 -1
- docling/models/ds_glm_model.py +2 -5
- docling/models/rapid_ocr_model.py +18 -17
- docling/pipeline/base_pipeline.py +17 -3
- {docling-2.10.0.dist-info → docling-2.11.0.dist-info}/METADATA +1 -1
- {docling-2.10.0.dist-info → docling-2.11.0.dist-info}/RECORD +11 -11
- {docling-2.10.0.dist-info → docling-2.11.0.dist-info}/LICENSE +0 -0
- {docling-2.10.0.dist-info → docling-2.11.0.dist-info}/WHEEL +0 -0
- {docling-2.10.0.dist-info → docling-2.11.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -27,8 +27,10 @@ from docling.datamodel.base_models import (
|
|
27
27
|
from docling.datamodel.document import ConversionResult
|
28
28
|
from docling.datamodel.pipeline_options import (
|
29
29
|
EasyOcrOptions,
|
30
|
+
OcrEngine,
|
30
31
|
OcrMacOptions,
|
31
32
|
OcrOptions,
|
33
|
+
PdfBackend,
|
32
34
|
PdfPipelineOptions,
|
33
35
|
RapidOcrOptions,
|
34
36
|
TableFormerMode,
|
@@ -68,22 +70,6 @@ def version_callback(value: bool):
|
|
68
70
|
raise typer.Exit()
|
69
71
|
|
70
72
|
|
71
|
-
# Define an enum for the backend options
|
72
|
-
class PdfBackend(str, Enum):
|
73
|
-
PYPDFIUM2 = "pypdfium2"
|
74
|
-
DLPARSE_V1 = "dlparse_v1"
|
75
|
-
DLPARSE_V2 = "dlparse_v2"
|
76
|
-
|
77
|
-
|
78
|
-
# Define an enum for the ocr engines
|
79
|
-
class OcrEngine(str, Enum):
|
80
|
-
EASYOCR = "easyocr"
|
81
|
-
TESSERACT_CLI = "tesseract_cli"
|
82
|
-
TESSERACT = "tesseract"
|
83
|
-
OCRMAC = "ocrmac"
|
84
|
-
RAPIDOCR = "rapidocr"
|
85
|
-
|
86
|
-
|
87
73
|
def export_documents(
|
88
74
|
conv_results: Iterable[ConversionResult],
|
89
75
|
output_dir: Path,
|
@@ -264,6 +250,13 @@ def convert(
|
|
264
250
|
help="Show version information.",
|
265
251
|
),
|
266
252
|
] = None,
|
253
|
+
document_timeout: Annotated[
|
254
|
+
Optional[float],
|
255
|
+
typer.Option(
|
256
|
+
...,
|
257
|
+
help="The timeout for processing each document, in seconds.",
|
258
|
+
),
|
259
|
+
] = None,
|
267
260
|
):
|
268
261
|
if verbose == 0:
|
269
262
|
logging.basicConfig(level=logging.WARNING)
|
@@ -347,6 +340,7 @@ def convert(
|
|
347
340
|
do_ocr=ocr,
|
348
341
|
ocr_options=ocr_options,
|
349
342
|
do_table_structure=True,
|
343
|
+
document_timeout=document_timeout,
|
350
344
|
)
|
351
345
|
pipeline_options.table_structure_options.do_cell_matching = (
|
352
346
|
True # do_cell_matching
|
docling/datamodel/base_models.py
CHANGED
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
|
|
19
19
|
|
20
20
|
|
21
21
|
class ConversionStatus(str, Enum):
|
22
|
-
PENDING =
|
23
|
-
STARTED =
|
24
|
-
FAILURE =
|
25
|
-
SUCCESS =
|
26
|
-
PARTIAL_SUCCESS =
|
27
|
-
SKIPPED =
|
22
|
+
PENDING = "pending"
|
23
|
+
STARTED = "started"
|
24
|
+
FAILURE = "failure"
|
25
|
+
SUCCESS = "success"
|
26
|
+
PARTIAL_SUCCESS = "partial_success"
|
27
|
+
SKIPPED = "skipped"
|
28
28
|
|
29
29
|
|
30
30
|
class InputFormat(str, Enum):
|
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
|
|
89
89
|
|
90
90
|
|
91
91
|
class DocInputType(str, Enum):
|
92
|
-
PATH =
|
93
|
-
STREAM =
|
92
|
+
PATH = "path"
|
93
|
+
STREAM = "stream"
|
94
94
|
|
95
95
|
|
96
96
|
class DoclingComponentType(str, Enum):
|
97
|
-
DOCUMENT_BACKEND =
|
98
|
-
MODEL =
|
99
|
-
DOC_ASSEMBLER =
|
100
|
-
USER_INPUT =
|
97
|
+
DOCUMENT_BACKEND = "document_backend"
|
98
|
+
MODEL = "model"
|
99
|
+
DOC_ASSEMBLER = "doc_assembler"
|
100
|
+
USER_INPUT = "user_input"
|
101
101
|
|
102
102
|
|
103
103
|
class ErrorItem(BaseModel):
|
docling/datamodel/pipeline_options.py
CHANGED
@@ -126,12 +126,33 @@ class OcrMacOptions(OcrOptions):
|
|
126
126
|
)
|
127
127
|
|
128
128
|
|
129
|
+
# Define an enum for the backend options
|
130
|
+
class PdfBackend(str, Enum):
|
131
|
+
"""Enum of valid PDF backends."""
|
132
|
+
|
133
|
+
PYPDFIUM2 = "pypdfium2"
|
134
|
+
DLPARSE_V1 = "dlparse_v1"
|
135
|
+
DLPARSE_V2 = "dlparse_v2"
|
136
|
+
|
137
|
+
|
138
|
+
# Define an enum for the ocr engines
|
139
|
+
class OcrEngine(str, Enum):
|
140
|
+
"""Enum of valid OCR engines."""
|
141
|
+
|
142
|
+
EASYOCR = "easyocr"
|
143
|
+
TESSERACT_CLI = "tesseract_cli"
|
144
|
+
TESSERACT = "tesseract"
|
145
|
+
OCRMAC = "ocrmac"
|
146
|
+
RAPIDOCR = "rapidocr"
|
147
|
+
|
148
|
+
|
129
149
|
class PipelineOptions(BaseModel):
|
130
150
|
"""Base pipeline options."""
|
131
151
|
|
132
152
|
create_legacy_output: bool = (
|
133
|
-
True # This
|
153
|
+
True # This default will be set to False on a future version of docling
|
134
154
|
)
|
155
|
+
document_timeout: Optional[float] = None
|
135
156
|
|
136
157
|
|
137
158
|
class PdfPipelineOptions(PipelineOptions):
|
docling/models/ds_glm_model.py
CHANGED
@@ -3,8 +3,7 @@ import random
|
|
3
3
|
from pathlib import Path
|
4
4
|
from typing import List, Union
|
5
5
|
|
6
|
-
from deepsearch_glm.
|
7
|
-
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
6
|
+
from deepsearch_glm.andromeda_nlp import nlp_model
|
8
7
|
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
|
9
8
|
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
10
9
|
from docling_core.types.legacy_doc.base import (
|
@@ -43,9 +42,7 @@ class GlmModel:
|
|
43
42
|
def __init__(self, options: GlmOptions):
|
44
43
|
self.options = options
|
45
44
|
|
46
|
-
|
47
|
-
load_pretrained_nlp_models()
|
48
|
-
self.model = init_nlp_model(model_names=self.options.model_names)
|
45
|
+
self.model = nlp_model(loglevel="error", text_ordering=True)
|
49
46
|
|
50
47
|
def _to_legacy_document(self, conv_res) -> DsDocument:
|
51
48
|
title = ""
|
docling/models/rapid_ocr_model.py
CHANGED
@@ -118,24 +118,25 @@ class RapidOcrModel(BaseOcrModel):
|
|
118
118
|
del high_res_image
|
119
119
|
del im
|
120
120
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
(
|
129
|
-
|
130
|
-
|
131
|
-
|
121
|
+
if result is not None:
|
122
|
+
cells = [
|
123
|
+
OcrCell(
|
124
|
+
id=ix,
|
125
|
+
text=line[1],
|
126
|
+
confidence=line[2],
|
127
|
+
bbox=BoundingBox.from_tuple(
|
128
|
+
coord=(
|
129
|
+
(line[0][0][0] / self.scale) + ocr_rect.l,
|
130
|
+
(line[0][0][1] / self.scale) + ocr_rect.t,
|
131
|
+
(line[0][2][0] / self.scale) + ocr_rect.l,
|
132
|
+
(line[0][2][1] / self.scale) + ocr_rect.t,
|
133
|
+
),
|
134
|
+
origin=CoordOrigin.TOPLEFT,
|
132
135
|
),
|
133
|
-
|
134
|
-
)
|
135
|
-
|
136
|
-
|
137
|
-
]
|
138
|
-
all_ocr_cells.extend(cells)
|
136
|
+
)
|
137
|
+
for ix, line in enumerate(result)
|
138
|
+
]
|
139
|
+
all_ocr_cells.extend(cells)
|
139
140
|
|
140
141
|
# Post-process the cells
|
141
142
|
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
docling/pipeline/base_pipeline.py
CHANGED
@@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
126
126
|
# conv_res.status = ConversionStatus.FAILURE
|
127
127
|
# return conv_res
|
128
128
|
|
129
|
+
total_elapsed_time = 0.0
|
129
130
|
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
130
131
|
|
131
132
|
for i in range(0, conv_res.input.page_count):
|
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
136
137
|
for page_batch in chunkify(
|
137
138
|
conv_res.pages, settings.perf.page_batch_size
|
138
139
|
):
|
139
|
-
|
140
|
+
start_batch_time = time.monotonic()
|
140
141
|
|
141
142
|
# 1. Initialise the page resources
|
142
143
|
init_pages = map(
|
@@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
149
150
|
for p in pipeline_pages: # Must exhaust!
|
150
151
|
pass
|
151
152
|
|
152
|
-
|
153
|
-
|
153
|
+
end_batch_time = time.monotonic()
|
154
|
+
total_elapsed_time += end_batch_time - start_batch_time
|
155
|
+
if (
|
156
|
+
self.pipeline_options.document_timeout is not None
|
157
|
+
and total_elapsed_time > self.pipeline_options.document_timeout
|
158
|
+
):
|
159
|
+
_log.warning(
|
160
|
+
f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
|
161
|
+
)
|
162
|
+
conv_res.status = ConversionStatus.PARTIAL_SUCCESS
|
163
|
+
break
|
164
|
+
|
165
|
+
_log.debug(
|
166
|
+
f"Finished converting page batch time={end_batch_time:.3f}"
|
167
|
+
)
|
154
168
|
|
155
169
|
except Exception as e:
|
156
170
|
conv_res.status = ConversionStatus.FAILURE
|
{docling-2.10.0.dist-info → docling-2.11.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.10.0
|
3
|
+
Version: 2.11.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
{docling-2.10.0.dist-info → docling-2.11.0.dist-info}/RECORD
CHANGED
@@ -13,29 +13,29 @@ docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6
|
|
13
13
|
docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
|
14
14
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
15
15
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
-
docling/cli/main.py,sha256=
|
16
|
+
docling/cli/main.py,sha256=FFDUDADvK7QNW7xCs6dlsC7Bt_BMyrKdbZewKTEjm54,14624
|
17
17
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
docling/datamodel/base_models.py,sha256=
|
18
|
+
docling/datamodel/base_models.py,sha256=vwy59eDrkzCSaay24RlUvx4zEyuaUukOdOhw3622u2I,5616
|
19
19
|
docling/datamodel/document.py,sha256=GNlTsgKgDqdqv2dfhpYmnqymxDQWWWC8HgE8uAta8V4,10265
|
20
|
-
docling/datamodel/pipeline_options.py,sha256=
|
20
|
+
docling/datamodel/pipeline_options.py,sha256=1ouWNE5VhZolrWMb4RE6s_AxgNFr3_3PMtxB_YQ391A,5495
|
21
21
|
docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
|
22
22
|
docling/document_converter.py,sha256=Iz5eerBWFPVJoXAMlXEivRQX2VLBiUkA07BL4NNbaEs,11583
|
23
23
|
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
24
24
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
25
|
docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
|
26
26
|
docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
|
27
|
-
docling/models/ds_glm_model.py,sha256=
|
27
|
+
docling/models/ds_glm_model.py,sha256=YJkGxV46wh7G2Wr4vVzt9b8oewkUDPWpvI6AEaZDrs0,11872
|
28
28
|
docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
|
29
29
|
docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
|
30
30
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
31
31
|
docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
|
32
32
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
33
|
-
docling/models/rapid_ocr_model.py,sha256=
|
33
|
+
docling/models/rapid_ocr_model.py,sha256=ui152cerv9b9OeWyyyefs8qMLwYn0qsE2DFE_gHmaCM,6124
|
34
34
|
docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
|
35
35
|
docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
|
36
36
|
docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
|
37
37
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
|
-
docling/pipeline/base_pipeline.py,sha256=
|
38
|
+
docling/pipeline/base_pipeline.py,sha256=hVvtk5E4DVZdl_SyNs_pYRUjN9C8PABhpVaeN5Z_fAY,7885
|
39
39
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
40
40
|
docling/pipeline/standard_pdf_pipeline.py,sha256=B1q8xt3Dfecpi8s8DrcfPzdATh8TYgL43FDzzcS4vEA,8885
|
41
41
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
@@ -45,8 +45,8 @@ docling/utils/glm_utils.py,sha256=H1O_tDiRksMgw45rY9LhK6GjcZSOq5IyoGurGjoo-Ac,11
|
|
45
45
|
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
46
46
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
47
47
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
48
|
-
docling-2.
|
49
|
-
docling-2.
|
50
|
-
docling-2.
|
51
|
-
docling-2.
|
52
|
-
docling-2.
|
48
|
+
docling-2.11.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
49
|
+
docling-2.11.0.dist-info/METADATA,sha256=ajUVy5CuNDUp0x9tMCqO2px2M-ia-Vs7frIyb0_HxMo,7731
|
50
|
+
docling-2.11.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
51
|
+
docling-2.11.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
52
|
+
docling-2.11.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|