docling 2.30.0__py3-none-any.whl → 2.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. docling/backend/asciidoc_backend.py +7 -15
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +2 -2
  4. docling/backend/docling_parse_v2_backend.py +2 -2
  5. docling/backend/docling_parse_v4_backend.py +3 -4
  6. docling/backend/docx/latex/latex_dict.py +0 -5
  7. docling/backend/docx/latex/omml.py +4 -7
  8. docling/backend/html_backend.py +26 -9
  9. docling/backend/md_backend.py +5 -7
  10. docling/backend/msexcel_backend.py +1 -7
  11. docling/backend/mspowerpoint_backend.py +4 -7
  12. docling/backend/msword_backend.py +4 -4
  13. docling/backend/pdf_backend.py +2 -1
  14. docling/backend/pypdfium2_backend.py +3 -3
  15. docling/backend/xml/jats_backend.py +10 -13
  16. docling/backend/xml/uspto_backend.py +15 -19
  17. docling/cli/main.py +7 -7
  18. docling/cli/models.py +2 -3
  19. docling/datamodel/base_models.py +7 -5
  20. docling/datamodel/document.py +11 -10
  21. docling/datamodel/pipeline_options.py +0 -1
  22. docling/document_converter.py +5 -5
  23. docling/models/api_vlm_model.py +1 -2
  24. docling/models/base_model.py +2 -4
  25. docling/models/base_ocr_model.py +2 -2
  26. docling/models/code_formula_model.py +2 -1
  27. docling/models/document_picture_classifier.py +2 -1
  28. docling/models/easyocr_model.py +10 -11
  29. docling/models/factories/__init__.py +2 -2
  30. docling/models/factories/base_factory.py +1 -1
  31. docling/models/hf_mlx_model.py +4 -6
  32. docling/models/hf_vlm_model.py +7 -5
  33. docling/models/layout_model.py +2 -2
  34. docling/models/ocr_mac_model.py +3 -4
  35. docling/models/page_assemble_model.py +7 -12
  36. docling/models/page_preprocessing_model.py +2 -1
  37. docling/models/picture_description_api_model.py +2 -1
  38. docling/models/picture_description_base_model.py +2 -3
  39. docling/models/picture_description_vlm_model.py +2 -3
  40. docling/models/rapid_ocr_model.py +2 -3
  41. docling/models/readingorder_model.py +8 -23
  42. docling/models/table_structure_model.py +2 -6
  43. docling/models/tesseract_ocr_cli_model.py +17 -16
  44. docling/models/tesseract_ocr_model.py +8 -6
  45. docling/pipeline/base_pipeline.py +4 -8
  46. docling/pipeline/simple_pipeline.py +0 -1
  47. docling/pipeline/standard_pdf_pipeline.py +0 -1
  48. docling/pipeline/vlm_pipeline.py +0 -3
  49. docling/utils/export.py +2 -4
  50. docling/utils/glm_utils.py +2 -2
  51. docling/utils/layout_postprocessor.py +4 -2
  52. docling/utils/model_downloader.py +7 -7
  53. docling/utils/utils.py +1 -1
  54. {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/METADATA +2 -1
  55. docling-2.31.0.dist-info/RECORD +86 -0
  56. docling-2.30.0.dist-info/RECORD +0 -86
  57. {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
  58. {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
  59. {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
@@ -1,25 +1,22 @@
1
1
  import logging
2
2
  import time
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, List, Optional
5
+ from typing import Optional
5
6
 
6
7
  from docling.datamodel.base_models import Page, VlmPrediction
7
8
  from docling.datamodel.document import ConversionResult
8
9
  from docling.datamodel.pipeline_options import (
9
- AcceleratorDevice,
10
10
  AcceleratorOptions,
11
11
  HuggingFaceVlmOptions,
12
12
  )
13
- from docling.datamodel.settings import settings
14
13
  from docling.models.base_model import BasePageModel
15
- from docling.utils.accelerator_utils import decide_device
16
14
  from docling.utils.profiling import TimeRecorder
17
15
 
18
16
  _log = logging.getLogger(__name__)
19
17
 
20
18
 
21
19
  class HuggingFaceMlxModel(BasePageModel):
22
-
23
20
  def __init__(
24
21
  self,
25
22
  enabled: bool,
@@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
32
29
  self.vlm_options = vlm_options
33
30
 
34
31
  if self.enabled:
35
-
36
32
  try:
37
33
  from mlx_vlm import generate, load # type: ignore
38
34
  from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
@@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
125
121
  generation_time = time.time() - start_time
126
122
  page_tags = output
127
123
 
124
+ _log.debug(f"Generation time {generation_time:.2f} seconds.")
125
+
128
126
  # inference_time = time.time() - start_time
129
127
  # tokens_per_second = num_tokens / generation_time
130
128
  # print("")
@@ -1,16 +1,15 @@
1
1
  import logging
2
2
  import time
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, List, Optional
5
+ from typing import Optional
5
6
 
6
7
  from docling.datamodel.base_models import Page, VlmPrediction
7
8
  from docling.datamodel.document import ConversionResult
8
9
  from docling.datamodel.pipeline_options import (
9
- AcceleratorDevice,
10
10
  AcceleratorOptions,
11
11
  HuggingFaceVlmOptions,
12
12
  )
13
- from docling.datamodel.settings import settings
14
13
  from docling.models.base_model import BasePageModel
15
14
  from docling.utils.accelerator_utils import decide_device
16
15
  from docling.utils.profiling import TimeRecorder
@@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
19
18
 
20
19
 
21
20
  class HuggingFaceVlmModel(BasePageModel):
22
-
23
21
  def __init__(
24
22
  self,
25
23
  enabled: bool,
@@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
42
40
  device = decide_device(accelerator_options.device)
43
41
  self.device = device
44
42
 
45
- _log.debug("Available device for HuggingFace VLM: {}".format(device))
43
+ _log.debug(f"Available device for HuggingFace VLM: {device}")
46
44
 
47
45
  repo_cache_folder = vlm_options.repo_id.replace("/", "--")
48
46
 
@@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
168
166
  num_tokens = len(generated_ids[0])
169
167
  page_tags = generated_texts
170
168
 
169
+ _log.debug(
170
+ f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
171
+ )
172
+
171
173
  # inference_time = time.time() - start_time
172
174
  # tokens_per_second = num_tokens / generation_time
173
175
  # print("")
@@ -1,8 +1,9 @@
1
1
  import copy
2
2
  import logging
3
3
  import warnings
4
+ from collections.abc import Iterable
4
5
  from pathlib import Path
5
- from typing import Iterable, Optional, Union
6
+ from typing import Optional
6
7
 
7
8
  from docling_core.types.doc import DocItemLabel
8
9
  from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
142
143
  def __call__(
143
144
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
144
145
  ) -> Iterable[Page]:
145
-
146
146
  for page in page_batch:
147
147
  assert page._backend is not None
148
148
  if not page._backend.is_valid():
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import sys
3
3
  import tempfile
4
+ from collections.abc import Iterable
4
5
  from pathlib import Path
5
- from typing import Iterable, Optional, Tuple, Type
6
+ from typing import Optional, Type
6
7
 
7
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
8
9
  from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
41
42
 
42
43
  if self.enabled:
43
44
  if "darwin" != sys.platform:
44
- raise RuntimeError(f"OcrMac is only supported on Mac.")
45
+ raise RuntimeError("OcrMac is only supported on Mac.")
45
46
  install_errmsg = (
46
47
  "ocrmac is not correctly installed. "
47
48
  "Please install it via `pip install ocrmac` to use this OCR engine. "
@@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
58
59
  def __call__(
59
60
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
60
61
  ) -> Iterable[Page]:
61
-
62
62
  if not self.enabled:
63
63
  yield from page_batch
64
64
  return
@@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
69
69
  yield page
70
70
  else:
71
71
  with TimeRecorder(conv_res, "ocr"):
72
-
73
72
  ocr_rects = self.get_ocr_rects(page)
74
73
 
75
74
  all_ocr_cells = []
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import re
3
- from typing import Iterable, List
3
+ from collections.abc import Iterable
4
+ from typing import List
4
5
 
5
6
  from pydantic import BaseModel
6
7
 
@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
53
54
  sanitized_text = "".join(lines)
54
55
 
55
56
  # Text normalization
56
- sanitized_text = sanitized_text.replace("⁄", "/")
57
- sanitized_text = sanitized_text.replace("’", "'")
58
- sanitized_text = sanitized_text.replace("‘", "'")
57
+ sanitized_text = sanitized_text.replace("⁄", "/") # noqa: RUF001
58
+ sanitized_text = sanitized_text.replace("’", "'") # noqa: RUF001
59
+ sanitized_text = sanitized_text.replace("‘", "'") # noqa: RUF001
59
60
  sanitized_text = sanitized_text.replace("“", '"')
60
61
  sanitized_text = sanitized_text.replace("”", '"')
61
62
  sanitized_text = sanitized_text.replace("•", "·")
@@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
71
72
  yield page
72
73
  else:
73
74
  with TimeRecorder(conv_res, "page_assemble"):
74
-
75
75
  assert page.predictions.layout is not None
76
76
 
77
77
  # assembles some JSON output page by page.
@@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
83
83
  for cluster in page.predictions.layout.clusters:
84
84
  # _log.info("Cluster label seen:", cluster.label)
85
85
  if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
86
-
87
86
  textlines = [
88
87
  cell.text.replace("\x02", "-").strip()
89
88
  for cell in cluster.cells
@@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
109
108
  tbl = page.predictions.tablestructure.table_map.get(
110
109
  cluster.id, None
111
110
  )
112
- if (
113
- not tbl
114
- ): # fallback: add table without structure, if it isn't present
111
+ if not tbl: # fallback: add table without structure, if it isn't present
115
112
  tbl = Table(
116
113
  label=cluster.label,
117
114
  id=cluster.id,
@@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
130
127
  fig = page.predictions.figures_classification.figure_map.get(
131
128
  cluster.id, None
132
129
  )
133
- if (
134
- not fig
135
- ): # fallback: add figure without classification, if it isn't present
130
+ if not fig: # fallback: add figure without classification, if it isn't present
136
131
  fig = FigureElement(
137
132
  label=cluster.label,
138
133
  id=cluster.id,
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from pathlib import Path
2
- from typing import Iterable, Optional
3
+ from typing import Optional
3
4
 
4
5
  from PIL import ImageDraw
5
6
  from pydantic import BaseModel
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from pathlib import Path
2
- from typing import Iterable, Optional, Type, Union
3
+ from typing import Optional, Type, Union
3
4
 
4
5
  from PIL import Image
5
6
 
@@ -1,12 +1,11 @@
1
- import logging
2
1
  from abc import abstractmethod
2
+ from collections.abc import Iterable
3
3
  from pathlib import Path
4
- from typing import Any, Iterable, List, Optional, Type, Union
4
+ from typing import List, Optional, Type, Union
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  DoclingDocument,
8
8
  NodeItem,
9
- PictureClassificationClass,
10
9
  PictureItem,
11
10
  )
12
11
  from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from pathlib import Path
2
- from typing import Iterable, Optional, Type, Union
3
+ from typing import Optional, Type, Union
3
4
 
4
5
  from PIL import Image
5
6
 
@@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
13
14
 
14
15
 
15
16
  class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
16
-
17
17
  @classmethod
18
18
  def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
19
19
  return PictureDescriptionVlmOptions
@@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
36
36
  self.options: PictureDescriptionVlmOptions
37
37
 
38
38
  if self.enabled:
39
-
40
39
  if artifacts_path is None:
41
40
  artifacts_path = self.download_models(repo_id=self.options.repo_id)
42
41
  else:
@@ -1,6 +1,7 @@
1
1
  import logging
2
+ from collections.abc import Iterable
2
3
  from pathlib import Path
3
- from typing import Iterable, Optional, Type
4
+ from typing import Optional, Type
4
5
 
5
6
  import numpy
6
7
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
74
75
  def __call__(
75
76
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
76
77
  ) -> Iterable[Page]:
77
-
78
78
  if not self.enabled:
79
79
  yield from page_batch
80
80
  return
81
81
 
82
82
  for page in page_batch:
83
-
84
83
  assert page._backend is not None
85
84
  if not page._backend.is_valid():
86
85
  yield page
@@ -1,12 +1,7 @@
1
- import copy
2
- import random
3
1
  from pathlib import Path
4
2
  from typing import Dict, List
5
3
 
6
4
  from docling_core.types.doc import (
7
- BoundingBox,
8
- CoordOrigin,
9
- DocItem,
10
5
  DocItemLabel,
11
6
  DoclingDocument,
12
7
  DocumentOrigin,
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
17
12
  TableData,
18
13
  )
19
14
  from docling_core.types.doc.document import ContentLayer
20
- from docling_core.types.legacy_doc.base import Ref
21
- from docling_core.types.legacy_doc.document import BaseText
22
15
  from docling_ibm_models.reading_order.reading_order_rb import (
23
16
  PageElement as ReadingOrderPageElement,
17
+ ReadingOrderPredictor,
24
18
  )
25
- from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
26
- from PIL import ImageDraw
27
19
  from pydantic import BaseModel, ConfigDict
28
20
 
29
21
  from docling.datamodel.base_models import (
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
35
27
  TextElement,
36
28
  )
37
29
  from docling.datamodel.document import ConversionResult
38
- from docling.datamodel.settings import settings
39
30
  from docling.utils.profiling import ProfilingScope, TimeRecorder
40
31
 
41
32
 
@@ -53,12 +44,10 @@ class ReadingOrderModel:
53
44
  def _assembled_to_readingorder_elements(
54
45
  self, conv_res: ConversionResult
55
46
  ) -> List[ReadingOrderPageElement]:
56
-
57
47
  elements: List[ReadingOrderPageElement] = []
58
48
  page_no_to_pages = {p.page_no: p for p in conv_res.pages}
59
49
 
60
50
  for element in conv_res.assembled.elements:
61
-
62
51
  page_height = page_no_to_pages[element.page_no].size.height # type: ignore
63
52
  bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
64
53
  text = element.text or ""
@@ -84,7 +73,6 @@ class ReadingOrderModel:
84
73
  def _add_child_elements(
85
74
  self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
86
75
  ):
87
-
88
76
  child: Cluster
89
77
  for child in element.cluster.children:
90
78
  c_label = child.label
@@ -110,7 +98,7 @@ class ReadingOrderModel:
110
98
  else:
111
99
  doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
112
100
 
113
- def _readingorder_elements_to_docling_doc(
101
+ def _readingorder_elements_to_docling_doc( # noqa: C901
114
102
  self,
115
103
  conv_res: ConversionResult,
116
104
  ro_elements: List[ReadingOrderPageElement],
@@ -118,7 +106,6 @@ class ReadingOrderModel:
118
106
  el_to_footnotes_mapping: Dict[int, List[int]],
119
107
  el_merges_mapping: Dict[int, List[int]],
120
108
  ) -> DoclingDocument:
121
-
122
109
  id_to_elem = {
123
110
  RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
124
111
  for elem in conv_res.assembled.elements
@@ -192,7 +179,6 @@ class ReadingOrderModel:
192
179
 
193
180
  code_item.footnotes.append(new_footnote_item.get_ref())
194
181
  else:
195
-
196
182
  new_item, current_list = self._handle_text_element(
197
183
  element, out_doc, current_list, page_height
198
184
  )
@@ -206,7 +192,6 @@ class ReadingOrderModel:
206
192
  )
207
193
 
208
194
  elif isinstance(element, Table):
209
-
210
195
  tbl_data = TableData(
211
196
  num_rows=element.num_rows,
212
197
  num_cols=element.num_cols,
@@ -342,12 +327,12 @@ class ReadingOrderModel:
342
327
  return new_item, current_list
343
328
 
344
329
  def _merge_elements(self, element, merged_elem, new_item, page_height):
345
- assert isinstance(
346
- merged_elem, type(element)
347
- ), "Merged element must be of same type as element."
348
- assert (
349
- merged_elem.label == new_item.label
350
- ), "Labels of merged elements must match."
330
+ assert isinstance(merged_elem, type(element)), (
331
+ "Merged element must be of same type as element."
332
+ )
333
+ assert merged_elem.label == new_item.label, (
334
+ "Labels of merged elements must match."
335
+ )
351
336
  prov = ProvenanceItem(
352
337
  page_no=element.page_no + 1,
353
338
  charspan=(
@@ -1,13 +1,13 @@
1
1
  import copy
2
2
  import warnings
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, Optional, Union
5
+ from typing import Optional
5
6
 
6
7
  import numpy
7
8
  from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
8
9
  from docling_core.types.doc.page import (
9
10
  BoundingRectangle,
10
- SegmentedPdfPage,
11
11
  TextCellUnit,
12
12
  )
13
13
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
@@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
44
44
 
45
45
  self.enabled = enabled
46
46
  if self.enabled:
47
-
48
47
  if artifacts_path is None:
49
48
  artifacts_path = self.download_models() / self._model_path
50
49
  else:
@@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
175
174
  def __call__(
176
175
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
177
176
  ) -> Iterable[Page]:
178
-
179
177
  if not self.enabled:
180
178
  yield from page_batch
181
179
  return
@@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
186
184
  yield page
187
185
  else:
188
186
  with TimeRecorder(conv_res, "table_structure"):
189
-
190
187
  assert page.predictions.layout is not None
191
188
  assert page.size is not None
192
189
 
@@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
260
257
  table_out = tf_output[0]
261
258
  table_cells = []
262
259
  for element in table_out["tf_responses"]:
263
-
264
260
  if not self.do_cell_matching:
265
261
  the_bbox = BoundingBox.model_validate(
266
262
  element["bbox"]
@@ -3,9 +3,10 @@ import io
3
3
  import logging
4
4
  import os
5
5
  import tempfile
6
+ from collections.abc import Iterable
6
7
  from pathlib import Path
7
8
  from subprocess import DEVNULL, PIPE, Popen
8
- from typing import Iterable, List, Optional, Tuple, Type
9
+ from typing import List, Optional, Tuple, Type
9
10
 
10
11
  import pandas as pd
11
12
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
63
64
  )
64
65
 
65
66
  def _get_name_and_version(self) -> Tuple[str, str]:
66
-
67
- if self._name != None and self._version != None:
67
+ if self._name is not None and self._version is not None:
68
68
  return self._name, self._version # type: ignore
69
69
 
70
70
  cmd = [self.options.tesseract_cmd, "--version"]
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
125
125
  # _log.info(decoded_data)
126
126
 
127
127
  # Read the TSV file generated by Tesseract
128
- df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
128
+ df_result = pd.read_csv(
129
+ io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
130
+ )
129
131
 
130
132
  # Display the dataframe (optional)
131
133
  # _log.info("df: ", df.head())
132
134
 
133
135
  # Filter rows that contain actual text (ignore header or empty rows)
134
- df_filtered = df[
135
- df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
136
+ df_filtered = df_result[
137
+ df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
136
138
  ]
137
139
 
138
140
  return df_filtered
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
149
151
  proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
150
152
  output, _ = proc.communicate()
151
153
  decoded_data = output.decode("utf-8")
152
- df = pd.read_csv(
154
+ df_detected = pd.read_csv(
153
155
  io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
154
156
  )
155
- scripts = df.loc[df["key"] == "Script"].value.tolist()
157
+ scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
156
158
  if len(scripts) == 0:
157
159
  _log.warning("Tesseract cannot detect the script of the page")
158
160
  return None
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
183
185
  proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
184
186
  output, _ = proc.communicate()
185
187
  decoded_data = output.decode("utf-8")
186
- df = pd.read_csv(io.StringIO(decoded_data), header=None)
187
- self._tesseract_languages = df[0].tolist()[1:]
188
+ df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
189
+ self._tesseract_languages = df_list[0].tolist()[1:]
188
190
 
189
191
  # Decide the script prefix
190
- if any([l.startswith("script/") for l in self._tesseract_languages]):
192
+ if any(lang.startswith("script/") for lang in self._tesseract_languages):
191
193
  script_prefix = "script/"
192
194
  else:
193
195
  script_prefix = ""
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
197
199
  def __call__(
198
200
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
199
201
  ) -> Iterable[Page]:
200
-
201
202
  if not self.enabled:
202
203
  yield from page_batch
203
204
  return
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
225
226
  fname = image_file.name
226
227
  high_res_image.save(image_file)
227
228
 
228
- df = self._run_tesseract(fname)
229
+ df_result = self._run_tesseract(fname)
229
230
  finally:
230
231
  if os.path.exists(fname):
231
232
  os.remove(fname)
232
233
 
233
- # _log.info(df)
234
+ # _log.info(df_result)
234
235
 
235
236
  # Print relevant columns (bounding box and text)
236
- for ix, row in df.iterrows():
237
+ for ix, row in df_result.iterrows():
237
238
  text = row["text"]
238
239
  conf = row["conf"]
239
240
 
240
- l = float(row["left"])
241
+ l = float(row["left"]) # noqa: E741
241
242
  b = float(row["top"])
242
243
  w = float(row["width"])
243
244
  h = float(row["height"])
@@ -1,6 +1,7 @@
1
1
  import logging
2
+ from collections.abc import Iterable
2
3
  from pathlib import Path
3
- from typing import Iterable, Optional, Type
4
+ from typing import Optional, Type
4
5
 
5
6
  from docling_core.types.doc import BoundingBox, CoordOrigin
6
7
  from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
37
38
  self.options: TesseractOcrOptions
38
39
 
39
40
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
40
- self.reader = None
41
- self.osd_reader = None
42
- self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
43
41
 
44
42
  if self.enabled:
45
43
  install_errmsg = (
@@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
64
62
  raise ImportError(install_errmsg)
65
63
  try:
66
64
  tesseract_version = tesserocr.tesseract_version()
67
- except:
65
+ except Exception:
68
66
  raise ImportError(install_errmsg)
69
67
 
70
68
  _, self._tesserocr_languages = tesserocr.get_languages()
@@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
75
73
  _log.debug("Initializing TesserOCR: %s", tesseract_version)
76
74
  lang = "+".join(self.options.lang)
77
75
 
78
- if any([l.startswith("script/") for l in self._tesserocr_languages]):
76
+ if any(lang.startswith("script/") for lang in self._tesserocr_languages):
79
77
  self.script_prefix = "script/"
80
78
  else:
81
79
  self.script_prefix = ""
@@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
86
84
  "oem": tesserocr.OEM.DEFAULT,
87
85
  }
88
86
 
87
+ self.reader = None
88
+ self.osd_reader = None
89
+ self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
90
+
89
91
  if self.options.path is not None:
90
92
  tesserocr_kwargs["path"] = self.options.path
91
93
 
@@ -3,9 +3,10 @@ import logging
3
3
  import time
4
4
  import traceback
5
5
  from abc import ABC, abstractmethod
6
- from typing import Any, Callable, Iterable, List
6
+ from collections.abc import Iterable
7
+ from typing import Any, Callable, List
7
8
 
8
- from docling_core.types.doc import DoclingDocument, NodeItem
9
+ from docling_core.types.doc import NodeItem
9
10
 
10
11
  from docling.backend.abstract_backend import AbstractDocumentBackend
11
12
  from docling.backend.pdf_backend import PdfDocumentBackend
@@ -64,7 +65,6 @@ class BasePipeline(ABC):
64
65
  return conv_res
65
66
 
66
67
  def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
67
-
68
68
  def _prepare_elements(
69
69
  conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
70
70
  ) -> Iterable[NodeItem]:
@@ -113,7 +113,6 @@ class BasePipeline(ABC):
113
113
 
114
114
 
115
115
  class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
116
-
117
116
  def __init__(self, pipeline_options: PipelineOptions):
118
117
  super().__init__(pipeline_options)
119
118
  self.keep_backend = False
@@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
127
126
  yield from page_batch
128
127
 
129
128
  def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
130
-
131
129
  if not isinstance(conv_res.input._backend, PdfDocumentBackend):
132
130
  raise RuntimeError(
133
131
  f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
@@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
139
137
 
140
138
  total_elapsed_time = 0.0
141
139
  with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
142
-
143
- for i in range(0, conv_res.input.page_count):
140
+ for i in range(conv_res.input.page_count):
144
141
  start_page, end_page = conv_res.input.limits.page_range
145
142
  if (start_page - 1) <= i <= (end_page - 1):
146
143
  conv_res.pages.append(Page(page_no=i))
@@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
161
158
  pipeline_pages = self._apply_on_pages(conv_res, init_pages)
162
159
 
163
160
  for p in pipeline_pages: # Must exhaust!
164
-
165
161
  # Cleanup cached images
166
162
  if not self.keep_images:
167
163
  p._image_cache = {}
@@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
24
24
  super().__init__(pipeline_options)
25
25
 
26
26
  def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
27
-
28
27
  if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
29
28
  raise RuntimeError(
30
29
  f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
@@ -1,5 +1,4 @@
1
1
  import logging
2
- import sys
3
2
  import warnings
4
3
  from pathlib import Path
5
4
  from typing import Optional, cast