docling 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,24 +1,20 @@
1
1
  import logging
2
- import os
3
2
  import re
4
3
  from io import BytesIO
5
4
  from pathlib import Path
6
5
  from typing import Set, Union
7
6
 
8
7
  from docling_core.types.doc import (
9
- DocItem,
10
8
  DocItemLabel,
11
9
  DoclingDocument,
12
10
  DocumentOrigin,
13
11
  GroupItem,
14
12
  GroupLabel,
15
13
  ImageRef,
16
- NodeItem,
17
14
  Size,
18
15
  TableCell,
19
16
  TableData,
20
17
  )
21
- from pydantic import AnyUrl
22
18
 
23
19
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
24
20
  from docling.datamodel.base_models import InputFormat
@@ -136,7 +136,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
136
136
  def get_direct_text(self, item):
137
137
  """Get the direct text of the <li> element (ignoring nested lists)."""
138
138
  text = item.find(string=True, recursive=False)
139
-
140
139
  if isinstance(text, str):
141
140
  return text.strip()
142
141
 
@@ -149,21 +148,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
149
148
  if isinstance(item, str):
150
149
  return [item]
151
150
 
152
- result.append(self.get_direct_text(item))
153
-
154
- try:
155
- # Iterate over the children (and their text and tails)
156
- for child in item:
157
- try:
158
- # Recursively get the child's text content
159
- result.extend(self.extract_text_recursively(child))
160
- except:
161
- pass
162
- except:
163
- _log.warn("item has no children")
164
- pass
165
-
166
- return " ".join(result)
151
+ if item.name not in ["ul", "ol"]:
152
+ try:
153
+ # Iterate over the children (and their text and tails)
154
+ for child in item:
155
+ try:
156
+ # Recursively get the child's text content
157
+ result.extend(self.extract_text_recursively(child))
158
+ except:
159
+ pass
160
+ except:
161
+ _log.warn("item has no children")
162
+ pass
163
+
164
+ return "".join(result) + " "
167
165
 
168
166
  def handle_header(self, element, idx, doc):
169
167
  """Handles header tags (h1, h2, etc.)."""
@@ -181,38 +179,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
181
179
  self.parents[self.level] = doc.add_text(
182
180
  parent=self.parents[0], label=DocItemLabel.TITLE, text=text
183
181
  )
184
-
185
- elif hlevel == self.level:
186
- self.parents[hlevel] = doc.add_text(
187
- parent=self.parents[hlevel - 1], label=label, text=text
188
- )
189
-
190
- elif hlevel > self.level:
191
-
192
- # add invisible group
193
- for i in range(self.level + 1, hlevel):
194
- self.parents[i] = doc.add_group(
195
- name=f"header-{i}",
196
- label=GroupLabel.SECTION,
197
- parent=self.parents[i - 1],
198
- )
199
-
200
- self.parents[hlevel] = doc.add_text(
201
- parent=self.parents[hlevel - 1], label=label, text=text
202
- )
203
- self.level = hlevel
204
-
205
- elif hlevel < self.level:
206
-
207
- # remove the tail
208
- for key, val in self.parents.items():
209
- if key > hlevel:
210
- self.parents[key] = None
211
-
212
- self.parents[hlevel] = doc.add_text(
213
- parent=self.parents[hlevel - 1], label=label, text=text
182
+ else:
183
+ if hlevel > self.level:
184
+
185
+ # add invisible group
186
+ for i in range(self.level + 1, hlevel):
187
+ self.parents[i] = doc.add_group(
188
+ name=f"header-{i}",
189
+ label=GroupLabel.SECTION,
190
+ parent=self.parents[i - 1],
191
+ )
192
+ self.level = hlevel
193
+
194
+ elif hlevel < self.level:
195
+
196
+ # remove the tail
197
+ for key, val in self.parents.items():
198
+ if key > hlevel:
199
+ self.parents[key] = None
200
+ self.level = hlevel
201
+
202
+ self.parents[hlevel] = doc.add_heading(
203
+ parent=self.parents[hlevel - 1],
204
+ text=text,
205
+ level=hlevel,
214
206
  )
215
- self.level = hlevel
216
207
 
217
208
  def handle_paragraph(self, element, idx, doc):
218
209
  """Handles paragraph tags (p)."""
@@ -255,7 +246,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
255
246
 
256
247
  if nested_lists:
257
248
  name = element.name
258
- text = self.get_direct_text(element)
249
+ # Text in list item can be hidden within hierarchy, hence
250
+ # we need to extract it recursively
251
+ text = self.extract_text_recursively(element)
252
+ # Flatten text, remove break lines:
253
+ text = text.replace("\n", "").replace("\r", "")
254
+ text = " ".join(text.split()).strip()
259
255
 
260
256
  marker = ""
261
257
  enumerated = False
@@ -263,14 +259,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
263
259
  marker = str(index_in_list)
264
260
  enumerated = True
265
261
 
266
- # create a list-item
267
- self.parents[self.level + 1] = doc.add_list_item(
268
- text=text,
269
- enumerated=enumerated,
270
- marker=marker,
271
- parent=self.parents[self.level],
272
- )
273
- self.level += 1
262
+ if len(text) > 0:
263
+ # create a list-item
264
+ self.parents[self.level + 1] = doc.add_list_item(
265
+ text=text,
266
+ enumerated=enumerated,
267
+ marker=marker,
268
+ parent=self.parents[self.level],
269
+ )
270
+ self.level += 1
274
271
 
275
272
  self.walk(element, doc)
276
273
 
@@ -1,4 +1,6 @@
1
1
  import logging
2
+ import re
3
+ import warnings
2
4
  from io import BytesIO
3
5
  from pathlib import Path
4
6
  from typing import Set, Union
@@ -25,6 +27,30 @@ _log = logging.getLogger(__name__)
25
27
 
26
28
 
27
29
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
30
+
31
+ def shorten_underscore_sequences(self, markdown_text, max_length=10):
32
+ # This regex will match any sequence of underscores
33
+ pattern = r"_+"
34
+
35
+ def replace_match(match):
36
+ underscore_sequence = match.group(
37
+ 0
38
+ ) # Get the full match (sequence of underscores)
39
+
40
+ # Shorten the sequence if it exceeds max_length
41
+ if len(underscore_sequence) > max_length:
42
+ return "_" * max_length
43
+ else:
44
+ return underscore_sequence # Leave it unchanged if it is shorter or equal to max_length
45
+
46
+ # Use re.sub to replace long underscore sequences
47
+ shortened_text = re.sub(pattern, replace_match, markdown_text)
48
+
49
+ if len(shortened_text) != len(markdown_text):
50
+ warnings.warn("Detected potentially incorrect Markdown, correcting...")
51
+
52
+ return shortened_text
53
+
28
54
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
29
55
  super().__init__(in_doc, path_or_stream)
30
56
 
@@ -42,11 +68,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
42
68
  try:
43
69
  if isinstance(self.path_or_stream, BytesIO):
44
70
  text_stream = self.path_or_stream.getvalue().decode("utf-8")
45
- self.markdown = text_stream
71
+ # remove invalid sequences
72
+ # very long sequences of underscores will lead to unnecessary long processing times.
73
+ # In any proper Markdown files, underscores have to be escaped,
74
+ # otherwise they represent emphasis (bold or italic)
75
+ self.markdown = self.shorten_underscore_sequences(text_stream)
46
76
  if isinstance(self.path_or_stream, Path):
47
77
  with open(self.path_or_stream, "r", encoding="utf-8") as f:
48
78
  md_content = f.read()
49
- self.markdown = md_content
79
+ # remove invalid sequences
80
+ # very long sequences of underscores will lead to unnecessary long processing times.
81
+ # In any proper Markdown files, underscores have to be escaped,
82
+ # otherwise they represent emphasis (bold or italic)
83
+ self.markdown = self.shorten_underscore_sequences(md_content)
50
84
  self.valid = True
51
85
 
52
86
  _log.debug(self.markdown)
@@ -135,11 +169,29 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
135
169
  doc_label = DocItemLabel.TITLE
136
170
  else:
137
171
  doc_label = DocItemLabel.SECTION_HEADER
138
- snippet_text = element.children[0].children.strip()
139
172
 
140
- parent_element = doc.add_text(
141
- label=doc_label, parent=parent_element, text=snippet_text
142
- )
173
+ # Header could have arbitrary inclusion of bold, italic or emphasis,
174
+ # hence we need to traverse the tree to get full text of a header
175
+ strings = []
176
+
177
+ # Define a recursive function to traverse the tree
178
+ def traverse(node):
179
+ # Check if the node has a "children" attribute
180
+ if hasattr(node, "children"):
181
+ # If "children" is a list, continue traversal
182
+ if isinstance(node.children, list):
183
+ for child in node.children:
184
+ traverse(child)
185
+ # If "children" is text, add it to header text
186
+ elif isinstance(node.children, str):
187
+ strings.append(node.children)
188
+
189
+ traverse(element)
190
+ snippet_text = "".join(strings)
191
+ if len(snippet_text) > 0:
192
+ parent_element = doc.add_text(
193
+ label=doc_label, parent=parent_element, text=snippet_text
194
+ )
143
195
 
144
196
  elif isinstance(element, marko.block.List):
145
197
  self.close_table(doc)
@@ -286,6 +338,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
286
338
  parsed_ast = marko_parser.parse(self.markdown)
287
339
  # Start iterating from the root of the AST
288
340
  self.iterate_elements(parsed_ast, 0, doc, None)
341
+ self.process_inline_text(None, doc) # handle last hanging inline text
289
342
  else:
290
343
  raise RuntimeError(
291
344
  f"Cannot convert md with {self.document_hash} because the backend failed to init."
@@ -294,13 +294,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
294
294
  level = self.get_level()
295
295
  if isinstance(curr_level, int):
296
296
 
297
- if curr_level == level:
298
-
299
- self.parents[level] = doc.add_heading(
300
- parent=self.parents[level - 1], text=text
301
- )
302
-
303
- elif curr_level > level:
297
+ if curr_level > level:
304
298
 
305
299
  # add invisible group
306
300
  for i in range(level, curr_level):
@@ -310,10 +304,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
310
304
  name=f"header-{i}",
311
305
  )
312
306
 
313
- self.parents[curr_level] = doc.add_heading(
314
- parent=self.parents[curr_level - 1], text=text
315
- )
316
-
317
307
  elif curr_level < level:
318
308
 
319
309
  # remove the tail
@@ -321,13 +311,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
321
311
  if key >= curr_level:
322
312
  self.parents[key] = None
323
313
 
324
- self.parents[curr_level] = doc.add_heading(
325
- parent=self.parents[curr_level - 1], text=text
326
- )
314
+ self.parents[curr_level] = doc.add_heading(
315
+ parent=self.parents[curr_level - 1],
316
+ text=text,
317
+ level=curr_level,
318
+ )
327
319
 
328
320
  else:
329
321
  self.parents[self.level] = doc.add_heading(
330
- parent=self.parents[self.level - 1], text=text
322
+ parent=self.parents[self.level - 1],
323
+ text=text,
324
+ level=1,
331
325
  )
332
326
  return
333
327
 
@@ -1,6 +1,6 @@
1
1
  from enum import Enum, auto
2
2
  from io import BytesIO
3
- from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
3
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
4
 
5
5
  from docling_core.types.doc import (
6
6
  BoundingBox,
@@ -3,7 +3,7 @@ import re
3
3
  from enum import Enum
4
4
  from io import BytesIO
5
5
  from pathlib import Path, PurePath
6
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
7
7
 
8
8
  import filetype
9
9
  from docling_core.types.doc import (
@@ -52,6 +52,7 @@ from docling.datamodel.base_models import (
52
52
  Page,
53
53
  )
54
54
  from docling.datamodel.settings import DocumentLimits
55
+ from docling.utils.profiling import ProfilingItem
55
56
  from docling.utils.utils import create_file_hash, create_hash
56
57
 
57
58
  if TYPE_CHECKING:
@@ -187,6 +188,7 @@ class ConversionResult(BaseModel):
187
188
 
188
189
  pages: List[Page] = []
189
190
  assembled: AssembledUnit = AssembledUnit()
191
+ timings: Dict[str, ProfilingItem] = {}
190
192
 
191
193
  document: DoclingDocument = _EMPTY_DOCLING_DOC
192
194
 
@@ -1,4 +1,5 @@
1
1
  import sys
2
+ from pathlib import Path
2
3
 
3
4
  from pydantic import BaseModel
4
5
  from pydantic_settings import BaseSettings
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
26
27
  # To force models into single core: export OMP_NUM_THREADS=1
27
28
 
28
29
 
30
+ class DebugSettings(BaseModel):
31
+ visualize_cells: bool = False
32
+ visualize_ocr: bool = False
33
+ visualize_layout: bool = False
34
+ visualize_tables: bool = False
35
+
36
+ profile_pipeline_timings: bool = False
37
+
38
+ # Path used to output debug information.
39
+ debug_output_path: str = str(Path.cwd() / "debug")
40
+
41
+
29
42
  class AppSettings(BaseSettings):
30
43
  perf: BatchConcurrencySettings
44
+ debug: DebugSettings
31
45
 
32
46
 
33
- settings = AppSettings(perf=BatchConcurrencySettings())
47
+ settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
@@ -189,24 +189,35 @@ class DocumentConverter:
189
189
  ) -> Iterator[ConversionResult]:
190
190
  assert self.format_to_options is not None
191
191
 
192
+ start_time = time.monotonic()
193
+
192
194
  for input_batch in chunkify(
193
195
  conv_input.docs(self.format_to_options),
194
196
  settings.perf.doc_batch_size, # pass format_options
195
197
  ):
196
198
  _log.info(f"Going to convert document batch...")
199
+
197
200
  # parallel processing only within input_batch
198
201
  # with ThreadPoolExecutor(
199
202
  # max_workers=settings.perf.doc_batch_concurrency
200
203
  # ) as pool:
201
204
  # yield from pool.map(self.process_document, input_batch)
202
-
203
205
  # Note: PDF backends are not thread-safe, thread pool usage was disabled.
206
+
204
207
  for item in map(
205
208
  partial(self._process_document, raises_on_error=raises_on_error),
206
209
  input_batch,
207
210
  ):
211
+ elapsed = time.monotonic() - start_time
212
+ start_time = time.monotonic()
213
+
208
214
  if item is not None:
215
+ _log.info(
216
+ f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
217
+ )
209
218
  yield item
219
+ else:
220
+ _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
210
221
 
211
222
  def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
212
223
  assert self.format_to_options is not None
@@ -237,15 +248,8 @@ class DocumentConverter:
237
248
  assert self.allowed_formats is not None
238
249
  assert in_doc.format in self.allowed_formats
239
250
 
240
- start_doc_time = time.time()
241
-
242
251
  conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
243
252
 
244
- end_doc_time = time.time() - start_doc_time
245
- _log.info(
246
- f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
247
- )
248
-
249
253
  return conv_res
250
254
 
251
255
  def _execute_pipeline(
@@ -4,11 +4,14 @@ from typing import Any, Iterable
4
4
  from docling_core.types.doc import DoclingDocument, NodeItem
5
5
 
6
6
  from docling.datamodel.base_models import Page
7
+ from docling.datamodel.document import ConversionResult
7
8
 
8
9
 
9
10
  class BasePageModel(ABC):
10
11
  @abstractmethod
11
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
12
+ def __call__(
13
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
14
+ ) -> Iterable[Page]:
12
15
  pass
13
16
 
14
17
 
@@ -1,6 +1,7 @@
1
1
  import copy
2
2
  import logging
3
3
  from abc import abstractmethod
4
+ from pathlib import Path
4
5
  from typing import Iterable, List
5
6
 
6
7
  import numpy as np
@@ -10,12 +11,15 @@ from rtree import index
10
11
  from scipy.ndimage import find_objects, label
11
12
 
12
13
  from docling.datamodel.base_models import OcrCell, Page
14
+ from docling.datamodel.document import ConversionResult
13
15
  from docling.datamodel.pipeline_options import OcrOptions
16
+ from docling.datamodel.settings import settings
17
+ from docling.models.base_model import BasePageModel
14
18
 
15
19
  _log = logging.getLogger(__name__)
16
20
 
17
21
 
18
- class BaseOcrModel:
22
+ class BaseOcrModel(BasePageModel):
19
23
  def __init__(self, enabled: bool, options: OcrOptions):
20
24
  self.enabled = enabled
21
25
  self.options = options
@@ -113,7 +117,7 @@ class BaseOcrModel:
113
117
  ]
114
118
  return filtered_ocr_cells
115
119
 
116
- def draw_ocr_rects_and_cells(self, page, ocr_rects):
120
+ def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
117
121
  image = copy.deepcopy(page.image)
118
122
  draw = ImageDraw.Draw(image, "RGBA")
119
123
 
@@ -130,8 +134,21 @@ class BaseOcrModel:
130
134
  if isinstance(tc, OcrCell):
131
135
  color = "magenta"
132
136
  draw.rectangle([(x0, y0), (x1, y1)], outline=color)
133
- image.show()
137
+
138
+ if show:
139
+ image.show()
140
+ else:
141
+ out_path: Path = (
142
+ Path(settings.debug.debug_output_path)
143
+ / f"debug_{conv_res.input.file.stem}"
144
+ )
145
+ out_path.mkdir(parents=True, exist_ok=True)
146
+
147
+ out_file = out_path / f"ocr_page_{page.page_no:05}.png"
148
+ image.save(str(out_file), format="png")
134
149
 
135
150
  @abstractmethod
136
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
151
+ def __call__(
152
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
153
+ ) -> Iterable[Page]:
137
154
  pass
@@ -1,5 +1,6 @@
1
1
  import copy
2
2
  import random
3
+ from pathlib import Path
3
4
  from typing import List, Union
4
5
 
5
6
  from deepsearch_glm.nlp_utils import init_nlp_model
@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict
27
28
 
28
29
  from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
29
30
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
31
+ from docling.datamodel.settings import settings
32
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
30
33
  from docling.utils.utils import create_hash
31
34
 
32
35
 
@@ -226,23 +229,24 @@ class GlmModel:
226
229
  return ds_doc
227
230
 
228
231
  def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
229
- ds_doc = self._to_legacy_document(conv_res)
230
- ds_doc_dict = ds_doc.model_dump(by_alias=True)
232
+ with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
233
+ ds_doc = self._to_legacy_document(conv_res)
234
+ ds_doc_dict = ds_doc.model_dump(by_alias=True)
231
235
 
232
- glm_doc = self.model.apply_on_doc(ds_doc_dict)
236
+ glm_doc = self.model.apply_on_doc(ds_doc_dict)
233
237
 
234
- docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
238
+ docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
235
239
 
236
240
  # DEBUG code:
237
- def draw_clusters_and_cells(ds_document, page_no):
241
+ def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
238
242
  clusters_to_draw = []
239
243
  image = copy.deepcopy(conv_res.pages[page_no].image)
240
244
  for ix, elem in enumerate(ds_document.main_text):
241
245
  if isinstance(elem, BaseText):
242
- prov = elem.prov[0]
246
+ prov = elem.prov[0] # type: ignore
243
247
  elif isinstance(elem, Ref):
244
248
  _, arr, index = elem.ref.split("/")
245
- index = int(index)
249
+ index = int(index) # type: ignore
246
250
  if arr == "tables":
247
251
  prov = ds_document.tables[index].prov[0]
248
252
  elif arr == "figures":
@@ -256,7 +260,7 @@ class GlmModel:
256
260
  id=ix,
257
261
  label=elem.name,
258
262
  bbox=BoundingBox.from_tuple(
259
- coord=prov.bbox,
263
+ coord=prov.bbox, # type: ignore
260
264
  origin=CoordOrigin.BOTTOMLEFT,
261
265
  ).to_top_left_origin(conv_res.pages[page_no].size.height),
262
266
  )
@@ -276,9 +280,21 @@ class GlmModel:
276
280
  for tc in c.cells: # [:1]:
277
281
  x0, y0, x1, y1 = tc.bbox.as_tuple()
278
282
  draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
279
- image.show()
280
283
 
281
- # draw_clusters_and_cells(ds_doc, 0)
282
- # draw_clusters_and_cells(exported_doc, 0)
284
+ if show:
285
+ image.show()
286
+ else:
287
+ out_path: Path = (
288
+ Path(settings.debug.debug_output_path)
289
+ / f"debug_{conv_res.input.file.stem}"
290
+ )
291
+ out_path.mkdir(parents=True, exist_ok=True)
292
+
293
+ out_file = out_path / f"doc_page_{page_no:05}.png"
294
+ image.save(str(out_file), format="png")
295
+
296
+ # for item in ds_doc.page_dimensions:
297
+ # page_no = item.page
298
+ # draw_clusters_and_cells(ds_doc, page_no)
283
299
 
284
300
  return docling_doc