docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. docling/backend/asciidoc_backend.py +7 -15
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +2 -2
  4. docling/backend/docling_parse_v2_backend.py +2 -2
  5. docling/backend/docling_parse_v4_backend.py +3 -4
  6. docling/backend/docx/latex/latex_dict.py +0 -5
  7. docling/backend/docx/latex/omml.py +4 -7
  8. docling/backend/html_backend.py +66 -25
  9. docling/backend/md_backend.py +6 -8
  10. docling/backend/msexcel_backend.py +1 -7
  11. docling/backend/mspowerpoint_backend.py +4 -7
  12. docling/backend/msword_backend.py +5 -5
  13. docling/backend/pdf_backend.py +2 -1
  14. docling/backend/pypdfium2_backend.py +3 -3
  15. docling/backend/xml/jats_backend.py +11 -14
  16. docling/backend/xml/uspto_backend.py +19 -23
  17. docling/cli/main.py +8 -8
  18. docling/cli/models.py +6 -3
  19. docling/datamodel/base_models.py +7 -5
  20. docling/datamodel/document.py +19 -10
  21. docling/datamodel/pipeline_options.py +0 -1
  22. docling/document_converter.py +8 -6
  23. docling/models/api_vlm_model.py +1 -2
  24. docling/models/base_model.py +2 -4
  25. docling/models/base_ocr_model.py +2 -2
  26. docling/models/code_formula_model.py +2 -1
  27. docling/models/document_picture_classifier.py +2 -1
  28. docling/models/easyocr_model.py +10 -11
  29. docling/models/factories/__init__.py +2 -2
  30. docling/models/factories/base_factory.py +1 -1
  31. docling/models/hf_mlx_model.py +4 -6
  32. docling/models/hf_vlm_model.py +7 -5
  33. docling/models/layout_model.py +2 -2
  34. docling/models/ocr_mac_model.py +3 -4
  35. docling/models/page_assemble_model.py +7 -12
  36. docling/models/page_preprocessing_model.py +2 -1
  37. docling/models/picture_description_api_model.py +2 -1
  38. docling/models/picture_description_base_model.py +2 -3
  39. docling/models/picture_description_vlm_model.py +6 -4
  40. docling/models/rapid_ocr_model.py +2 -3
  41. docling/models/readingorder_model.py +9 -24
  42. docling/models/table_structure_model.py +4 -8
  43. docling/models/tesseract_ocr_cli_model.py +17 -16
  44. docling/models/tesseract_ocr_model.py +9 -5
  45. docling/pipeline/base_pipeline.py +4 -8
  46. docling/pipeline/simple_pipeline.py +0 -1
  47. docling/pipeline/standard_pdf_pipeline.py +0 -1
  48. docling/pipeline/vlm_pipeline.py +0 -3
  49. docling/utils/export.py +2 -4
  50. docling/utils/glm_utils.py +2 -2
  51. docling/utils/layout_postprocessor.py +4 -2
  52. docling/utils/model_downloader.py +31 -7
  53. docling/utils/utils.py +3 -3
  54. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/METADATA +2 -1
  55. docling-2.31.1.dist-info/RECORD +86 -0
  56. docling-2.30.0.dist-info/RECORD +0 -86
  57. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/LICENSE +0 -0
  58. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/WHEEL +0 -0
  59. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/entry_points.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import Any, Generic, Iterable, Optional, Protocol, Type
2
+ from collections.abc import Iterable
3
+ from typing import Generic, Optional, Protocol, Type
3
4
 
4
5
  from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
5
6
  from typing_extensions import TypeVar
@@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
29
30
 
30
31
 
31
32
  class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
32
-
33
33
  elements_batch_size: int = settings.perf.elements_batch_size
34
34
 
35
35
  @abstractmethod
@@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
50
50
 
51
51
 
52
52
  class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
53
-
54
53
  def prepare_element(
55
54
  self, conv_res: ConversionResult, element: NodeItem
56
55
  ) -> Optional[NodeItem]:
@@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
62
61
  class BaseItemAndImageEnrichmentModel(
63
62
  GenericEnrichmentModel[ItemAndImageEnrichmentElement]
64
63
  ):
65
-
66
64
  images_scale: float
67
65
  expansion_factor: float = 0.0
68
66
 
@@ -1,12 +1,12 @@
1
1
  import copy
2
2
  import logging
3
3
  from abc import abstractmethod
4
+ from collections.abc import Iterable
4
5
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Type
6
+ from typing import List, Optional, Type
6
7
 
7
8
  import numpy as np
8
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
10
10
  from PIL import Image, ImageDraw
11
11
  from rtree import index
12
12
  from scipy.ndimage import binary_dilation, find_objects, label
@@ -1,7 +1,8 @@
1
1
  import re
2
2
  from collections import Counter
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, List, Literal, Optional, Tuple, Union
5
+ from typing import List, Literal, Optional, Tuple, Union
5
6
 
6
7
  import numpy as np
7
8
  from docling_core.types.doc import (
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from pathlib import Path
2
- from typing import Iterable, List, Literal, Optional, Tuple, Union
3
+ from typing import List, Literal, Optional, Union
3
4
 
4
5
  import numpy as np
5
6
  from docling_core.types.doc import (
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import warnings
3
3
  import zipfile
4
+ from collections.abc import Iterable
4
5
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Type
6
+ from typing import List, Optional, Type
6
7
 
7
8
  import numpy
8
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
58
59
  device = decide_device(accelerator_options.device)
59
60
  # Enable easyocr GPU if running on CUDA, MPS
60
61
  use_gpu = any(
61
- [
62
- device.startswith(x)
63
- for x in [
64
- AcceleratorDevice.CUDA.value,
65
- AcceleratorDevice.MPS.value,
66
- ]
62
+ device.startswith(x)
63
+ for x in [
64
+ AcceleratorDevice.CUDA.value,
65
+ AcceleratorDevice.MPS.value,
67
66
  ]
68
67
  )
69
68
  else:
@@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel):
98
97
  progress: bool = False,
99
98
  ) -> Path:
100
99
  # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
101
- from easyocr.config import detection_models as det_models_dict
102
- from easyocr.config import recognition_models as rec_models_dict
100
+ from easyocr.config import (
101
+ detection_models as det_models_dict,
102
+ recognition_models as rec_models_dict,
103
+ )
103
104
 
104
105
  if local_dir is None:
105
106
  local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
@@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel):
126
127
  def __call__(
127
128
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
128
129
  ) -> Iterable[Page]:
129
-
130
130
  if not self.enabled:
131
131
  yield from page_batch
132
132
  return
133
133
 
134
134
  for page in page_batch:
135
-
136
135
  assert page._backend is not None
137
136
  if not page._backend.is_valid():
138
137
  yield page
@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
9
9
  logger = logging.getLogger(__name__)
10
10
 
11
11
 
12
- @lru_cache()
12
+ @lru_cache
13
13
  def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
14
14
  factory = OcrFactory()
15
15
  factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
17
17
  return factory
18
18
 
19
19
 
20
- @lru_cache()
20
+ @lru_cache
21
21
  def get_picture_description_factory(
22
22
  allow_external_plugins: bool = False,
23
23
  ) -> PictureDescriptionFactory:
@@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
33
33
 
34
34
  @property
35
35
  def registered_kind(self) -> list[str]:
36
- return list(opt.kind for opt in self._classes.keys())
36
+ return [opt.kind for opt in self._classes.keys()]
37
37
 
38
38
  def get_enum(self) -> enum.Enum:
39
39
  return enum.Enum(
@@ -1,25 +1,22 @@
1
1
  import logging
2
2
  import time
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, List, Optional
5
+ from typing import Optional
5
6
 
6
7
  from docling.datamodel.base_models import Page, VlmPrediction
7
8
  from docling.datamodel.document import ConversionResult
8
9
  from docling.datamodel.pipeline_options import (
9
- AcceleratorDevice,
10
10
  AcceleratorOptions,
11
11
  HuggingFaceVlmOptions,
12
12
  )
13
- from docling.datamodel.settings import settings
14
13
  from docling.models.base_model import BasePageModel
15
- from docling.utils.accelerator_utils import decide_device
16
14
  from docling.utils.profiling import TimeRecorder
17
15
 
18
16
  _log = logging.getLogger(__name__)
19
17
 
20
18
 
21
19
  class HuggingFaceMlxModel(BasePageModel):
22
-
23
20
  def __init__(
24
21
  self,
25
22
  enabled: bool,
@@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
32
29
  self.vlm_options = vlm_options
33
30
 
34
31
  if self.enabled:
35
-
36
32
  try:
37
33
  from mlx_vlm import generate, load # type: ignore
38
34
  from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
@@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
125
121
  generation_time = time.time() - start_time
126
122
  page_tags = output
127
123
 
124
+ _log.debug(f"Generation time {generation_time:.2f} seconds.")
125
+
128
126
  # inference_time = time.time() - start_time
129
127
  # tokens_per_second = num_tokens / generation_time
130
128
  # print("")
@@ -1,16 +1,15 @@
1
1
  import logging
2
2
  import time
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, List, Optional
5
+ from typing import Optional
5
6
 
6
7
  from docling.datamodel.base_models import Page, VlmPrediction
7
8
  from docling.datamodel.document import ConversionResult
8
9
  from docling.datamodel.pipeline_options import (
9
- AcceleratorDevice,
10
10
  AcceleratorOptions,
11
11
  HuggingFaceVlmOptions,
12
12
  )
13
- from docling.datamodel.settings import settings
14
13
  from docling.models.base_model import BasePageModel
15
14
  from docling.utils.accelerator_utils import decide_device
16
15
  from docling.utils.profiling import TimeRecorder
@@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
19
18
 
20
19
 
21
20
  class HuggingFaceVlmModel(BasePageModel):
22
-
23
21
  def __init__(
24
22
  self,
25
23
  enabled: bool,
@@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
42
40
  device = decide_device(accelerator_options.device)
43
41
  self.device = device
44
42
 
45
- _log.debug("Available device for HuggingFace VLM: {}".format(device))
43
+ _log.debug(f"Available device for HuggingFace VLM: {device}")
46
44
 
47
45
  repo_cache_folder = vlm_options.repo_id.replace("/", "--")
48
46
 
@@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
168
166
  num_tokens = len(generated_ids[0])
169
167
  page_tags = generated_texts
170
168
 
169
+ _log.debug(
170
+ f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
171
+ )
172
+
171
173
  # inference_time = time.time() - start_time
172
174
  # tokens_per_second = num_tokens / generation_time
173
175
  # print("")
@@ -1,8 +1,9 @@
1
1
  import copy
2
2
  import logging
3
3
  import warnings
4
+ from collections.abc import Iterable
4
5
  from pathlib import Path
5
- from typing import Iterable, Optional, Union
6
+ from typing import Optional
6
7
 
7
8
  from docling_core.types.doc import DocItemLabel
8
9
  from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
142
143
  def __call__(
143
144
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
144
145
  ) -> Iterable[Page]:
145
-
146
146
  for page in page_batch:
147
147
  assert page._backend is not None
148
148
  if not page._backend.is_valid():
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import sys
3
3
  import tempfile
4
+ from collections.abc import Iterable
4
5
  from pathlib import Path
5
- from typing import Iterable, Optional, Tuple, Type
6
+ from typing import Optional, Type
6
7
 
7
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
8
9
  from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
41
42
 
42
43
  if self.enabled:
43
44
  if "darwin" != sys.platform:
44
- raise RuntimeError(f"OcrMac is only supported on Mac.")
45
+ raise RuntimeError("OcrMac is only supported on Mac.")
45
46
  install_errmsg = (
46
47
  "ocrmac is not correctly installed. "
47
48
  "Please install it via `pip install ocrmac` to use this OCR engine. "
@@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
58
59
  def __call__(
59
60
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
60
61
  ) -> Iterable[Page]:
61
-
62
62
  if not self.enabled:
63
63
  yield from page_batch
64
64
  return
@@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
69
69
  yield page
70
70
  else:
71
71
  with TimeRecorder(conv_res, "ocr"):
72
-
73
72
  ocr_rects = self.get_ocr_rects(page)
74
73
 
75
74
  all_ocr_cells = []
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import re
3
- from typing import Iterable, List
3
+ from collections.abc import Iterable
4
+ from typing import List
4
5
 
5
6
  from pydantic import BaseModel
6
7
 
@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
53
54
  sanitized_text = "".join(lines)
54
55
 
55
56
  # Text normalization
56
- sanitized_text = sanitized_text.replace("⁄", "/")
57
- sanitized_text = sanitized_text.replace("’", "'")
58
- sanitized_text = sanitized_text.replace("‘", "'")
57
+ sanitized_text = sanitized_text.replace("⁄", "/") # noqa: RUF001
58
+ sanitized_text = sanitized_text.replace("’", "'") # noqa: RUF001
59
+ sanitized_text = sanitized_text.replace("‘", "'") # noqa: RUF001
59
60
  sanitized_text = sanitized_text.replace("“", '"')
60
61
  sanitized_text = sanitized_text.replace("”", '"')
61
62
  sanitized_text = sanitized_text.replace("•", "·")
@@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
71
72
  yield page
72
73
  else:
73
74
  with TimeRecorder(conv_res, "page_assemble"):
74
-
75
75
  assert page.predictions.layout is not None
76
76
 
77
77
  # assembles some JSON output page by page.
@@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
83
83
  for cluster in page.predictions.layout.clusters:
84
84
  # _log.info("Cluster label seen:", cluster.label)
85
85
  if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
86
-
87
86
  textlines = [
88
87
  cell.text.replace("\x02", "-").strip()
89
88
  for cell in cluster.cells
@@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
109
108
  tbl = page.predictions.tablestructure.table_map.get(
110
109
  cluster.id, None
111
110
  )
112
- if (
113
- not tbl
114
- ): # fallback: add table without structure, if it isn't present
111
+ if not tbl: # fallback: add table without structure, if it isn't present
115
112
  tbl = Table(
116
113
  label=cluster.label,
117
114
  id=cluster.id,
@@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
130
127
  fig = page.predictions.figures_classification.figure_map.get(
131
128
  cluster.id, None
132
129
  )
133
- if (
134
- not fig
135
- ): # fallback: add figure without classification, if it isn't present
130
+ if not fig: # fallback: add figure without classification, if it isn't present
136
131
  fig = FigureElement(
137
132
  label=cluster.label,
138
133
  id=cluster.id,
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from pathlib import Path
2
- from typing import Iterable, Optional
3
+ from typing import Optional
3
4
 
4
5
  from PIL import ImageDraw
5
6
  from pydantic import BaseModel
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from pathlib import Path
2
- from typing import Iterable, Optional, Type, Union
3
+ from typing import Optional, Type, Union
3
4
 
4
5
  from PIL import Image
5
6
 
@@ -1,12 +1,11 @@
1
- import logging
2
1
  from abc import abstractmethod
2
+ from collections.abc import Iterable
3
3
  from pathlib import Path
4
- from typing import Any, Iterable, List, Optional, Type, Union
4
+ from typing import List, Optional, Type, Union
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  DoclingDocument,
8
8
  NodeItem,
9
- PictureClassificationClass,
10
9
  PictureItem,
11
10
  )
12
11
  from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from pathlib import Path
2
- from typing import Iterable, Optional, Type, Union
3
+ from typing import Optional, Type, Union
3
4
 
4
5
  from PIL import Image
5
6
 
@@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
13
14
 
14
15
 
15
16
  class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
16
-
17
17
  @classmethod
18
18
  def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
19
19
  return PictureDescriptionVlmOptions
@@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
36
36
  self.options: PictureDescriptionVlmOptions
37
37
 
38
38
  if self.enabled:
39
-
40
39
  if artifacts_path is None:
41
40
  artifacts_path = self.download_models(repo_id=self.options.repo_id)
42
41
  else:
@@ -58,7 +57,10 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
58
57
  artifacts_path,
59
58
  torch_dtype=torch.bfloat16,
60
59
  _attn_implementation=(
61
- "flash_attention_2" if self.device.startswith("cuda") else "eager"
60
+ "flash_attention_2"
61
+ if self.device.startswith("cuda")
62
+ and accelerator_options.cuda_use_flash_attention2
63
+ else "eager"
62
64
  ),
63
65
  ).to(self.device)
64
66
 
@@ -1,6 +1,7 @@
1
1
  import logging
2
+ from collections.abc import Iterable
2
3
  from pathlib import Path
3
- from typing import Iterable, Optional, Type
4
+ from typing import Optional, Type
4
5
 
5
6
  import numpy
6
7
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
74
75
  def __call__(
75
76
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
76
77
  ) -> Iterable[Page]:
77
-
78
78
  if not self.enabled:
79
79
  yield from page_batch
80
80
  return
81
81
 
82
82
  for page in page_batch:
83
-
84
83
  assert page._backend is not None
85
84
  if not page._backend.is_valid():
86
85
  yield page
@@ -1,12 +1,7 @@
1
- import copy
2
- import random
3
1
  from pathlib import Path
4
2
  from typing import Dict, List
5
3
 
6
4
  from docling_core.types.doc import (
7
- BoundingBox,
8
- CoordOrigin,
9
- DocItem,
10
5
  DocItemLabel,
11
6
  DoclingDocument,
12
7
  DocumentOrigin,
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
17
12
  TableData,
18
13
  )
19
14
  from docling_core.types.doc.document import ContentLayer
20
- from docling_core.types.legacy_doc.base import Ref
21
- from docling_core.types.legacy_doc.document import BaseText
22
15
  from docling_ibm_models.reading_order.reading_order_rb import (
23
16
  PageElement as ReadingOrderPageElement,
17
+ ReadingOrderPredictor,
24
18
  )
25
- from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
26
- from PIL import ImageDraw
27
19
  from pydantic import BaseModel, ConfigDict
28
20
 
29
21
  from docling.datamodel.base_models import (
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
35
27
  TextElement,
36
28
  )
37
29
  from docling.datamodel.document import ConversionResult
38
- from docling.datamodel.settings import settings
39
30
  from docling.utils.profiling import ProfilingScope, TimeRecorder
40
31
 
41
32
 
@@ -53,12 +44,10 @@ class ReadingOrderModel:
53
44
  def _assembled_to_readingorder_elements(
54
45
  self, conv_res: ConversionResult
55
46
  ) -> List[ReadingOrderPageElement]:
56
-
57
47
  elements: List[ReadingOrderPageElement] = []
58
48
  page_no_to_pages = {p.page_no: p for p in conv_res.pages}
59
49
 
60
50
  for element in conv_res.assembled.elements:
61
-
62
51
  page_height = page_no_to_pages[element.page_no].size.height # type: ignore
63
52
  bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
64
53
  text = element.text or ""
@@ -84,7 +73,6 @@ class ReadingOrderModel:
84
73
  def _add_child_elements(
85
74
  self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
86
75
  ):
87
-
88
76
  child: Cluster
89
77
  for child in element.cluster.children:
90
78
  c_label = child.label
@@ -110,7 +98,7 @@ class ReadingOrderModel:
110
98
  else:
111
99
  doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
112
100
 
113
- def _readingorder_elements_to_docling_doc(
101
+ def _readingorder_elements_to_docling_doc( # noqa: C901
114
102
  self,
115
103
  conv_res: ConversionResult,
116
104
  ro_elements: List[ReadingOrderPageElement],
@@ -118,7 +106,6 @@ class ReadingOrderModel:
118
106
  el_to_footnotes_mapping: Dict[int, List[int]],
119
107
  el_merges_mapping: Dict[int, List[int]],
120
108
  ) -> DoclingDocument:
121
-
122
109
  id_to_elem = {
123
110
  RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
124
111
  for elem in conv_res.assembled.elements
@@ -192,7 +179,6 @@ class ReadingOrderModel:
192
179
 
193
180
  code_item.footnotes.append(new_footnote_item.get_ref())
194
181
  else:
195
-
196
182
  new_item, current_list = self._handle_text_element(
197
183
  element, out_doc, current_list, page_height
198
184
  )
@@ -206,7 +192,6 @@ class ReadingOrderModel:
206
192
  )
207
193
 
208
194
  elif isinstance(element, Table):
209
-
210
195
  tbl_data = TableData(
211
196
  num_rows=element.num_rows,
212
197
  num_cols=element.num_cols,
@@ -342,12 +327,12 @@ class ReadingOrderModel:
342
327
  return new_item, current_list
343
328
 
344
329
  def _merge_elements(self, element, merged_elem, new_item, page_height):
345
- assert isinstance(
346
- merged_elem, type(element)
347
- ), "Merged element must be of same type as element."
348
- assert (
349
- merged_elem.label == new_item.label
350
- ), "Labels of merged elements must match."
330
+ assert isinstance(merged_elem, type(element)), (
331
+ "Merged element must be of same type as element."
332
+ )
333
+ assert merged_elem.label == new_item.label, (
334
+ "Labels of merged elements must match."
335
+ )
351
336
  prov = ProvenanceItem(
352
337
  page_no=element.page_no + 1,
353
338
  charspan=(
@@ -361,7 +346,7 @@ class ReadingOrderModel:
361
346
  new_item.prov.append(prov)
362
347
 
363
348
  def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
364
- with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
349
+ with TimeRecorder(conv_res, "reading_order", scope=ProfilingScope.DOCUMENT):
365
350
  page_elements = self._assembled_to_readingorder_elements(conv_res)
366
351
 
367
352
  # Apply reading order
@@ -1,13 +1,13 @@
1
1
  import copy
2
2
  import warnings
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, Optional, Union
5
+ from typing import Optional
5
6
 
6
7
  import numpy
7
8
  from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
8
9
  from docling_core.types.doc.page import (
9
10
  BoundingRectangle,
10
- SegmentedPdfPage,
11
11
  TextCellUnit,
12
12
  )
13
13
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
@@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
44
44
 
45
45
  self.enabled = enabled
46
46
  if self.enabled:
47
-
48
47
  if artifacts_path is None:
49
48
  artifacts_path = self.download_models() / self._model_path
50
49
  else:
@@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
175
174
  def __call__(
176
175
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
177
176
  ) -> Iterable[Page]:
178
-
179
177
  if not self.enabled:
180
178
  yield from page_batch
181
179
  return
@@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
186
184
  yield page
187
185
  else:
188
186
  with TimeRecorder(conv_res, "table_structure"):
189
-
190
187
  assert page.predictions.layout is not None
191
188
  assert page.size is not None
192
189
 
@@ -237,7 +234,7 @@ class TableStructureModel(BasePageModel):
237
234
  tcells = table_cluster.cells
238
235
  tokens = []
239
236
  for c in tcells:
240
- # Only allow non empty stings (spaces) into the cells of a table
237
+ # Only allow non empty strings (spaces) into the cells of a table
241
238
  if len(c.text.strip()) > 0:
242
239
  new_cell = copy.deepcopy(c)
243
240
  new_cell.rect = BoundingRectangle.from_bounding_box(
@@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
260
257
  table_out = tf_output[0]
261
258
  table_cells = []
262
259
  for element in table_out["tf_responses"]:
263
-
264
260
  if not self.do_cell_matching:
265
261
  the_bbox = BoundingBox.model_validate(
266
262
  element["bbox"]
@@ -271,7 +267,7 @@ class TableStructureModel(BasePageModel):
271
267
  element["bbox"]["token"] = text_piece
272
268
 
273
269
  tc = TableCell.model_validate(element)
274
- if self.do_cell_matching and tc.bbox is not None:
270
+ if tc.bbox is not None:
275
271
  tc.bbox = tc.bbox.scaled(1 / self.scale)
276
272
  table_cells.append(tc)
277
273