docling 2.10.0__tar.gz → 2.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {docling-2.10.0 → docling-2.11.0}/PKG-INFO +1 -1
  2. {docling-2.10.0 → docling-2.11.0}/docling/cli/main.py +10 -16
  3. {docling-2.10.0 → docling-2.11.0}/docling/datamodel/base_models.py +12 -12
  4. {docling-2.10.0 → docling-2.11.0}/docling/datamodel/pipeline_options.py +22 -1
  5. {docling-2.10.0 → docling-2.11.0}/docling/models/ds_glm_model.py +2 -5
  6. {docling-2.10.0 → docling-2.11.0}/docling/models/rapid_ocr_model.py +18 -17
  7. {docling-2.10.0 → docling-2.11.0}/docling/pipeline/base_pipeline.py +17 -3
  8. {docling-2.10.0 → docling-2.11.0}/pyproject.toml +1 -1
  9. {docling-2.10.0 → docling-2.11.0}/LICENSE +0 -0
  10. {docling-2.10.0 → docling-2.11.0}/README.md +0 -0
  11. {docling-2.10.0 → docling-2.11.0}/docling/__init__.py +0 -0
  12. {docling-2.10.0 → docling-2.11.0}/docling/backend/__init__.py +0 -0
  13. {docling-2.10.0 → docling-2.11.0}/docling/backend/abstract_backend.py +0 -0
  14. {docling-2.10.0 → docling-2.11.0}/docling/backend/asciidoc_backend.py +0 -0
  15. {docling-2.10.0 → docling-2.11.0}/docling/backend/docling_parse_backend.py +0 -0
  16. {docling-2.10.0 → docling-2.11.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  17. {docling-2.10.0 → docling-2.11.0}/docling/backend/html_backend.py +0 -0
  18. {docling-2.10.0 → docling-2.11.0}/docling/backend/md_backend.py +0 -0
  19. {docling-2.10.0 → docling-2.11.0}/docling/backend/msexcel_backend.py +0 -0
  20. {docling-2.10.0 → docling-2.11.0}/docling/backend/mspowerpoint_backend.py +0 -0
  21. {docling-2.10.0 → docling-2.11.0}/docling/backend/msword_backend.py +0 -0
  22. {docling-2.10.0 → docling-2.11.0}/docling/backend/pdf_backend.py +0 -0
  23. {docling-2.10.0 → docling-2.11.0}/docling/backend/pypdfium2_backend.py +0 -0
  24. {docling-2.10.0 → docling-2.11.0}/docling/chunking/__init__.py +0 -0
  25. {docling-2.10.0 → docling-2.11.0}/docling/cli/__init__.py +0 -0
  26. {docling-2.10.0 → docling-2.11.0}/docling/datamodel/__init__.py +0 -0
  27. {docling-2.10.0 → docling-2.11.0}/docling/datamodel/document.py +0 -0
  28. {docling-2.10.0 → docling-2.11.0}/docling/datamodel/settings.py +0 -0
  29. {docling-2.10.0 → docling-2.11.0}/docling/document_converter.py +0 -0
  30. {docling-2.10.0 → docling-2.11.0}/docling/exceptions.py +0 -0
  31. {docling-2.10.0 → docling-2.11.0}/docling/models/__init__.py +0 -0
  32. {docling-2.10.0 → docling-2.11.0}/docling/models/base_model.py +0 -0
  33. {docling-2.10.0 → docling-2.11.0}/docling/models/base_ocr_model.py +0 -0
  34. {docling-2.10.0 → docling-2.11.0}/docling/models/easyocr_model.py +0 -0
  35. {docling-2.10.0 → docling-2.11.0}/docling/models/layout_model.py +0 -0
  36. {docling-2.10.0 → docling-2.11.0}/docling/models/ocr_mac_model.py +0 -0
  37. {docling-2.10.0 → docling-2.11.0}/docling/models/page_assemble_model.py +0 -0
  38. {docling-2.10.0 → docling-2.11.0}/docling/models/page_preprocessing_model.py +0 -0
  39. {docling-2.10.0 → docling-2.11.0}/docling/models/table_structure_model.py +0 -0
  40. {docling-2.10.0 → docling-2.11.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  41. {docling-2.10.0 → docling-2.11.0}/docling/models/tesseract_ocr_model.py +0 -0
  42. {docling-2.10.0 → docling-2.11.0}/docling/pipeline/__init__.py +0 -0
  43. {docling-2.10.0 → docling-2.11.0}/docling/pipeline/simple_pipeline.py +0 -0
  44. {docling-2.10.0 → docling-2.11.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  45. {docling-2.10.0 → docling-2.11.0}/docling/py.typed +0 -0
  46. {docling-2.10.0 → docling-2.11.0}/docling/utils/__init__.py +0 -0
  47. {docling-2.10.0 → docling-2.11.0}/docling/utils/export.py +0 -0
  48. {docling-2.10.0 → docling-2.11.0}/docling/utils/glm_utils.py +0 -0
  49. {docling-2.10.0 → docling-2.11.0}/docling/utils/layout_utils.py +0 -0
  50. {docling-2.10.0 → docling-2.11.0}/docling/utils/profiling.py +0 -0
  51. {docling-2.10.0 → docling-2.11.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.10.0
3
+ Version: 2.11.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -27,8 +27,10 @@ from docling.datamodel.base_models import (
27
27
  from docling.datamodel.document import ConversionResult
28
28
  from docling.datamodel.pipeline_options import (
29
29
  EasyOcrOptions,
30
+ OcrEngine,
30
31
  OcrMacOptions,
31
32
  OcrOptions,
33
+ PdfBackend,
32
34
  PdfPipelineOptions,
33
35
  RapidOcrOptions,
34
36
  TableFormerMode,
@@ -68,22 +70,6 @@ def version_callback(value: bool):
68
70
  raise typer.Exit()
69
71
 
70
72
 
71
- # Define an enum for the backend options
72
- class PdfBackend(str, Enum):
73
- PYPDFIUM2 = "pypdfium2"
74
- DLPARSE_V1 = "dlparse_v1"
75
- DLPARSE_V2 = "dlparse_v2"
76
-
77
-
78
- # Define an enum for the ocr engines
79
- class OcrEngine(str, Enum):
80
- EASYOCR = "easyocr"
81
- TESSERACT_CLI = "tesseract_cli"
82
- TESSERACT = "tesseract"
83
- OCRMAC = "ocrmac"
84
- RAPIDOCR = "rapidocr"
85
-
86
-
87
73
  def export_documents(
88
74
  conv_results: Iterable[ConversionResult],
89
75
  output_dir: Path,
@@ -264,6 +250,13 @@ def convert(
264
250
  help="Show version information.",
265
251
  ),
266
252
  ] = None,
253
+ document_timeout: Annotated[
254
+ Optional[float],
255
+ typer.Option(
256
+ ...,
257
+ help="The timeout for processing each document, in seconds.",
258
+ ),
259
+ ] = None,
267
260
  ):
268
261
  if verbose == 0:
269
262
  logging.basicConfig(level=logging.WARNING)
@@ -347,6 +340,7 @@ def convert(
347
340
  do_ocr=ocr,
348
341
  ocr_options=ocr_options,
349
342
  do_table_structure=True,
343
+ document_timeout=document_timeout,
350
344
  )
351
345
  pipeline_options.table_structure_options.do_cell_matching = (
352
346
  True # do_cell_matching
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
19
19
 
20
20
 
21
21
  class ConversionStatus(str, Enum):
22
- PENDING = auto()
23
- STARTED = auto()
24
- FAILURE = auto()
25
- SUCCESS = auto()
26
- PARTIAL_SUCCESS = auto()
27
- SKIPPED = auto()
22
+ PENDING = "pending"
23
+ STARTED = "started"
24
+ FAILURE = "failure"
25
+ SUCCESS = "success"
26
+ PARTIAL_SUCCESS = "partial_success"
27
+ SKIPPED = "skipped"
28
28
 
29
29
 
30
30
  class InputFormat(str, Enum):
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
89
89
 
90
90
 
91
91
  class DocInputType(str, Enum):
92
- PATH = auto()
93
- STREAM = auto()
92
+ PATH = "path"
93
+ STREAM = "stream"
94
94
 
95
95
 
96
96
  class DoclingComponentType(str, Enum):
97
- DOCUMENT_BACKEND = auto()
98
- MODEL = auto()
99
- DOC_ASSEMBLER = auto()
100
- USER_INPUT = auto()
97
+ DOCUMENT_BACKEND = "document_backend"
98
+ MODEL = "model"
99
+ DOC_ASSEMBLER = "doc_assembler"
100
+ USER_INPUT = "user_input"
101
101
 
102
102
 
103
103
  class ErrorItem(BaseModel):
@@ -126,12 +126,33 @@ class OcrMacOptions(OcrOptions):
126
126
  )
127
127
 
128
128
 
129
+ # Define an enum for the backend options
130
+ class PdfBackend(str, Enum):
131
+ """Enum of valid PDF backends."""
132
+
133
+ PYPDFIUM2 = "pypdfium2"
134
+ DLPARSE_V1 = "dlparse_v1"
135
+ DLPARSE_V2 = "dlparse_v2"
136
+
137
+
138
+ # Define an enum for the ocr engines
139
+ class OcrEngine(str, Enum):
140
+ """Enum of valid OCR engines."""
141
+
142
+ EASYOCR = "easyocr"
143
+ TESSERACT_CLI = "tesseract_cli"
144
+ TESSERACT = "tesseract"
145
+ OCRMAC = "ocrmac"
146
+ RAPIDOCR = "rapidocr"
147
+
148
+
129
149
  class PipelineOptions(BaseModel):
130
150
  """Base pipeline options."""
131
151
 
132
152
  create_legacy_output: bool = (
133
- True # This defautl will be set to False on a future version of docling
153
+ True # This default will be set to False on a future version of docling
134
154
  )
155
+ document_timeout: Optional[float] = None
135
156
 
136
157
 
137
158
  class PdfPipelineOptions(PipelineOptions):
@@ -3,8 +3,7 @@ import random
3
3
  from pathlib import Path
4
4
  from typing import List, Union
5
5
 
6
- from deepsearch_glm.nlp_utils import init_nlp_model
7
- from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
6
+ from deepsearch_glm.andromeda_nlp import nlp_model
8
7
  from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
9
8
  from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
10
9
  from docling_core.types.legacy_doc.base import (
@@ -43,9 +42,7 @@ class GlmModel:
43
42
  def __init__(self, options: GlmOptions):
44
43
  self.options = options
45
44
 
46
- if self.options.model_names != "":
47
- load_pretrained_nlp_models()
48
- self.model = init_nlp_model(model_names=self.options.model_names)
45
+ self.model = nlp_model(loglevel="error", text_ordering=True)
49
46
 
50
47
  def _to_legacy_document(self, conv_res) -> DsDocument:
51
48
  title = ""
@@ -118,24 +118,25 @@ class RapidOcrModel(BaseOcrModel):
118
118
  del high_res_image
119
119
  del im
120
120
 
121
- cells = [
122
- OcrCell(
123
- id=ix,
124
- text=line[1],
125
- confidence=line[2],
126
- bbox=BoundingBox.from_tuple(
127
- coord=(
128
- (line[0][0][0] / self.scale) + ocr_rect.l,
129
- (line[0][0][1] / self.scale) + ocr_rect.t,
130
- (line[0][2][0] / self.scale) + ocr_rect.l,
131
- (line[0][2][1] / self.scale) + ocr_rect.t,
121
+ if result is not None:
122
+ cells = [
123
+ OcrCell(
124
+ id=ix,
125
+ text=line[1],
126
+ confidence=line[2],
127
+ bbox=BoundingBox.from_tuple(
128
+ coord=(
129
+ (line[0][0][0] / self.scale) + ocr_rect.l,
130
+ (line[0][0][1] / self.scale) + ocr_rect.t,
131
+ (line[0][2][0] / self.scale) + ocr_rect.l,
132
+ (line[0][2][1] / self.scale) + ocr_rect.t,
133
+ ),
134
+ origin=CoordOrigin.TOPLEFT,
132
135
  ),
133
- origin=CoordOrigin.TOPLEFT,
134
- ),
135
- )
136
- for ix, line in enumerate(result)
137
- ]
138
- all_ocr_cells.extend(cells)
136
+ )
137
+ for ix, line in enumerate(result)
138
+ ]
139
+ all_ocr_cells.extend(cells)
139
140
 
140
141
  # Post-process the cells
141
142
  page.cells = self.post_process_cells(all_ocr_cells, page.cells)
@@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
126
126
  # conv_res.status = ConversionStatus.FAILURE
127
127
  # return conv_res
128
128
 
129
+ total_elapsed_time = 0.0
129
130
  with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
130
131
 
131
132
  for i in range(0, conv_res.input.page_count):
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
136
137
  for page_batch in chunkify(
137
138
  conv_res.pages, settings.perf.page_batch_size
138
139
  ):
139
- start_pb_time = time.time()
140
+ start_batch_time = time.monotonic()
140
141
 
141
142
  # 1. Initialise the page resources
142
143
  init_pages = map(
@@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
149
150
  for p in pipeline_pages: # Must exhaust!
150
151
  pass
151
152
 
152
- end_pb_time = time.time() - start_pb_time
153
- _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
153
+ end_batch_time = time.monotonic()
154
+ total_elapsed_time += end_batch_time - start_batch_time
155
+ if (
156
+ self.pipeline_options.document_timeout is not None
157
+ and total_elapsed_time > self.pipeline_options.document_timeout
158
+ ):
159
+ _log.warning(
160
+ f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
161
+ )
162
+ conv_res.status = ConversionStatus.PARTIAL_SUCCESS
163
+ break
164
+
165
+ _log.debug(
166
+ f"Finished converting page batch time={end_batch_time:.3f}"
167
+ )
154
168
 
155
169
  except Exception as e:
156
170
  conv_res.status = ConversionStatus.FAILURE
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.10.0" # DO NOT EDIT, updated automatically
3
+ version = "2.11.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes