docling 2.10.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
docling/cli/main.py CHANGED
@@ -27,8 +27,10 @@ from docling.datamodel.base_models import (
27
27
  from docling.datamodel.document import ConversionResult
28
28
  from docling.datamodel.pipeline_options import (
29
29
  EasyOcrOptions,
30
+ OcrEngine,
30
31
  OcrMacOptions,
31
32
  OcrOptions,
33
+ PdfBackend,
32
34
  PdfPipelineOptions,
33
35
  RapidOcrOptions,
34
36
  TableFormerMode,
@@ -68,22 +70,6 @@ def version_callback(value: bool):
68
70
  raise typer.Exit()
69
71
 
70
72
 
71
- # Define an enum for the backend options
72
- class PdfBackend(str, Enum):
73
- PYPDFIUM2 = "pypdfium2"
74
- DLPARSE_V1 = "dlparse_v1"
75
- DLPARSE_V2 = "dlparse_v2"
76
-
77
-
78
- # Define an enum for the ocr engines
79
- class OcrEngine(str, Enum):
80
- EASYOCR = "easyocr"
81
- TESSERACT_CLI = "tesseract_cli"
82
- TESSERACT = "tesseract"
83
- OCRMAC = "ocrmac"
84
- RAPIDOCR = "rapidocr"
85
-
86
-
87
73
  def export_documents(
88
74
  conv_results: Iterable[ConversionResult],
89
75
  output_dir: Path,
@@ -264,6 +250,13 @@ def convert(
264
250
  help="Show version information.",
265
251
  ),
266
252
  ] = None,
253
+ document_timeout: Annotated[
254
+ Optional[float],
255
+ typer.Option(
256
+ ...,
257
+ help="The timeout for processing each document, in seconds.",
258
+ ),
259
+ ] = None,
267
260
  ):
268
261
  if verbose == 0:
269
262
  logging.basicConfig(level=logging.WARNING)
@@ -347,6 +340,7 @@ def convert(
347
340
  do_ocr=ocr,
348
341
  ocr_options=ocr_options,
349
342
  do_table_structure=True,
343
+ document_timeout=document_timeout,
350
344
  )
351
345
  pipeline_options.table_structure_options.do_cell_matching = (
352
346
  True # do_cell_matching
docling/datamodel/base_models.py CHANGED
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
19
19
 
20
20
 
21
21
  class ConversionStatus(str, Enum):
22
- PENDING = auto()
23
- STARTED = auto()
24
- FAILURE = auto()
25
- SUCCESS = auto()
26
- PARTIAL_SUCCESS = auto()
27
- SKIPPED = auto()
22
+ PENDING = "pending"
23
+ STARTED = "started"
24
+ FAILURE = "failure"
25
+ SUCCESS = "success"
26
+ PARTIAL_SUCCESS = "partial_success"
27
+ SKIPPED = "skipped"
28
28
 
29
29
 
30
30
  class InputFormat(str, Enum):
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
89
89
 
90
90
 
91
91
  class DocInputType(str, Enum):
92
- PATH = auto()
93
- STREAM = auto()
92
+ PATH = "path"
93
+ STREAM = "stream"
94
94
 
95
95
 
96
96
  class DoclingComponentType(str, Enum):
97
- DOCUMENT_BACKEND = auto()
98
- MODEL = auto()
99
- DOC_ASSEMBLER = auto()
100
- USER_INPUT = auto()
97
+ DOCUMENT_BACKEND = "document_backend"
98
+ MODEL = "model"
99
+ DOC_ASSEMBLER = "doc_assembler"
100
+ USER_INPUT = "user_input"
101
101
 
102
102
 
103
103
  class ErrorItem(BaseModel):
docling/datamodel/pipeline_options.py CHANGED
@@ -126,12 +126,33 @@ class OcrMacOptions(OcrOptions):
126
126
  )
127
127
 
128
128
 
129
+ # Define an enum for the backend options
130
+ class PdfBackend(str, Enum):
131
+ """Enum of valid PDF backends."""
132
+
133
+ PYPDFIUM2 = "pypdfium2"
134
+ DLPARSE_V1 = "dlparse_v1"
135
+ DLPARSE_V2 = "dlparse_v2"
136
+
137
+
138
+ # Define an enum for the ocr engines
139
+ class OcrEngine(str, Enum):
140
+ """Enum of valid OCR engines."""
141
+
142
+ EASYOCR = "easyocr"
143
+ TESSERACT_CLI = "tesseract_cli"
144
+ TESSERACT = "tesseract"
145
+ OCRMAC = "ocrmac"
146
+ RAPIDOCR = "rapidocr"
147
+
148
+
129
149
  class PipelineOptions(BaseModel):
130
150
  """Base pipeline options."""
131
151
 
132
152
  create_legacy_output: bool = (
133
- True # This defautl will be set to False on a future version of docling
153
+ True # This default will be set to False on a future version of docling
134
154
  )
155
+ document_timeout: Optional[float] = None
135
156
 
136
157
 
137
158
  class PdfPipelineOptions(PipelineOptions):
docling/models/ds_glm_model.py CHANGED
@@ -3,8 +3,7 @@ import random
3
3
  from pathlib import Path
4
4
  from typing import List, Union
5
5
 
6
- from deepsearch_glm.nlp_utils import init_nlp_model
7
- from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
6
+ from deepsearch_glm.andromeda_nlp import nlp_model
8
7
  from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
9
8
  from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
10
9
  from docling_core.types.legacy_doc.base import (
@@ -43,9 +42,7 @@ class GlmModel:
43
42
  def __init__(self, options: GlmOptions):
44
43
  self.options = options
45
44
 
46
- if self.options.model_names != "":
47
- load_pretrained_nlp_models()
48
- self.model = init_nlp_model(model_names=self.options.model_names)
45
+ self.model = nlp_model(loglevel="error", text_ordering=True)
49
46
 
50
47
  def _to_legacy_document(self, conv_res) -> DsDocument:
51
48
  title = ""
docling/models/rapid_ocr_model.py CHANGED
@@ -118,24 +118,25 @@ class RapidOcrModel(BaseOcrModel):
118
118
  del high_res_image
119
119
  del im
120
120
 
121
- cells = [
122
- OcrCell(
123
- id=ix,
124
- text=line[1],
125
- confidence=line[2],
126
- bbox=BoundingBox.from_tuple(
127
- coord=(
128
- (line[0][0][0] / self.scale) + ocr_rect.l,
129
- (line[0][0][1] / self.scale) + ocr_rect.t,
130
- (line[0][2][0] / self.scale) + ocr_rect.l,
131
- (line[0][2][1] / self.scale) + ocr_rect.t,
121
+ if result is not None:
122
+ cells = [
123
+ OcrCell(
124
+ id=ix,
125
+ text=line[1],
126
+ confidence=line[2],
127
+ bbox=BoundingBox.from_tuple(
128
+ coord=(
129
+ (line[0][0][0] / self.scale) + ocr_rect.l,
130
+ (line[0][0][1] / self.scale) + ocr_rect.t,
131
+ (line[0][2][0] / self.scale) + ocr_rect.l,
132
+ (line[0][2][1] / self.scale) + ocr_rect.t,
133
+ ),
134
+ origin=CoordOrigin.TOPLEFT,
132
135
  ),
133
- origin=CoordOrigin.TOPLEFT,
134
- ),
135
- )
136
- for ix, line in enumerate(result)
137
- ]
138
- all_ocr_cells.extend(cells)
136
+ )
137
+ for ix, line in enumerate(result)
138
+ ]
139
+ all_ocr_cells.extend(cells)
139
140
 
140
141
  # Post-process the cells
141
142
  page.cells = self.post_process_cells(all_ocr_cells, page.cells)
docling/pipeline/base_pipeline.py CHANGED
@@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
126
126
  # conv_res.status = ConversionStatus.FAILURE
127
127
  # return conv_res
128
128
 
129
+ total_elapsed_time = 0.0
129
130
  with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
130
131
 
131
132
  for i in range(0, conv_res.input.page_count):
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
136
137
  for page_batch in chunkify(
137
138
  conv_res.pages, settings.perf.page_batch_size
138
139
  ):
139
- start_pb_time = time.time()
140
+ start_batch_time = time.monotonic()
140
141
 
141
142
  # 1. Initialise the page resources
142
143
  init_pages = map(
@@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
149
150
  for p in pipeline_pages: # Must exhaust!
150
151
  pass
151
152
 
152
- end_pb_time = time.time() - start_pb_time
153
- _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
153
+ end_batch_time = time.monotonic()
154
+ total_elapsed_time += end_batch_time - start_batch_time
155
+ if (
156
+ self.pipeline_options.document_timeout is not None
157
+ and total_elapsed_time > self.pipeline_options.document_timeout
158
+ ):
159
+ _log.warning(
160
+ f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
161
+ )
162
+ conv_res.status = ConversionStatus.PARTIAL_SUCCESS
163
+ break
164
+
165
+ _log.debug(
166
+ f"Finished converting page batch time={end_batch_time:.3f}"
167
+ )
154
168
 
155
169
  except Exception as e:
156
170
  conv_res.status = ConversionStatus.FAILURE
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.10.0
3
+ Version: 2.11.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -13,29 +13,29 @@ docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6
13
13
  docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
14
14
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
15
15
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- docling/cli/main.py,sha256=bLk1RG0jwM4dn6G5qa5Q-S4_N3agKnoE28pTfbpV4-k,14713
16
+ docling/cli/main.py,sha256=FFDUDADvK7QNW7xCs6dlsC7Bt_BMyrKdbZewKTEjm54,14624
17
17
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- docling/datamodel/base_models.py,sha256=627IB8HZdXGmHNfsX4Qhf7kKSxx2btPjS7z8hitvhyE,5560
18
+ docling/datamodel/base_models.py,sha256=vwy59eDrkzCSaay24RlUvx4zEyuaUukOdOhw3622u2I,5616
19
19
  docling/datamodel/document.py,sha256=GNlTsgKgDqdqv2dfhpYmnqymxDQWWWC8HgE8uAta8V4,10265
20
- docling/datamodel/pipeline_options.py,sha256=zQxLVioyBrldI4V9phQma1kTTgjmFQ6d3gVj2xq51gw,5010
20
+ docling/datamodel/pipeline_options.py,sha256=1ouWNE5VhZolrWMb4RE6s_AxgNFr3_3PMtxB_YQ391A,5495
21
21
  docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
22
22
  docling/document_converter.py,sha256=Iz5eerBWFPVJoXAMlXEivRQX2VLBiUkA07BL4NNbaEs,11583
23
23
  docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
24
24
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
26
26
  docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
27
- docling/models/ds_glm_model.py,sha256=3UpFu3Oavw9p0GItx2S9R7bPDdjY2NvpUQQDSVMctys,12045
27
+ docling/models/ds_glm_model.py,sha256=YJkGxV46wh7G2Wr4vVzt9b8oewkUDPWpvI6AEaZDrs0,11872
28
28
  docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
29
29
  docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
30
30
  docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
31
31
  docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
32
32
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
33
- docling/models/rapid_ocr_model.py,sha256=VQ0jaFmOzB9f-1JaqZ6d0o_El55Lr-nsFHfTNubMAuc,6005
33
+ docling/models/rapid_ocr_model.py,sha256=ui152cerv9b9OeWyyyefs8qMLwYn0qsE2DFE_gHmaCM,6124
34
34
  docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
35
35
  docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
36
36
  docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
37
37
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
- docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
38
+ docling/pipeline/base_pipeline.py,sha256=hVvtk5E4DVZdl_SyNs_pYRUjN9C8PABhpVaeN5Z_fAY,7885
39
39
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
40
40
  docling/pipeline/standard_pdf_pipeline.py,sha256=B1q8xt3Dfecpi8s8DrcfPzdATh8TYgL43FDzzcS4vEA,8885
41
41
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
@@ -45,8 +45,8 @@ docling/utils/glm_utils.py,sha256=H1O_tDiRksMgw45rY9LhK6GjcZSOq5IyoGurGjoo-Ac,11
45
45
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
46
46
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
47
47
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
48
- docling-2.10.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
49
- docling-2.10.0.dist-info/METADATA,sha256=YVI-dBKxqAxrLATigzeXPZvwDZUhLSl_doltc-HenQ4,7731
50
- docling-2.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
51
- docling-2.10.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
52
- docling-2.10.0.dist-info/RECORD,,
48
+ docling-2.11.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
49
+ docling-2.11.0.dist-info/METADATA,sha256=ajUVy5CuNDUp0x9tMCqO2px2M-ia-Vs7frIyb0_HxMo,7731
50
+ docling-2.11.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
51
+ docling-2.11.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
52
+ docling-2.11.0.dist-info/RECORD,,