docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +7 -15
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +2 -2
- docling/backend/docling_parse_v2_backend.py +2 -2
- docling/backend/docling_parse_v4_backend.py +3 -4
- docling/backend/docx/latex/latex_dict.py +0 -5
- docling/backend/docx/latex/omml.py +4 -7
- docling/backend/html_backend.py +66 -25
- docling/backend/md_backend.py +6 -8
- docling/backend/msexcel_backend.py +1 -7
- docling/backend/mspowerpoint_backend.py +4 -7
- docling/backend/msword_backend.py +5 -5
- docling/backend/pdf_backend.py +2 -1
- docling/backend/pypdfium2_backend.py +3 -3
- docling/backend/xml/jats_backend.py +11 -14
- docling/backend/xml/uspto_backend.py +19 -23
- docling/cli/main.py +8 -8
- docling/cli/models.py +6 -3
- docling/datamodel/base_models.py +7 -5
- docling/datamodel/document.py +19 -10
- docling/datamodel/pipeline_options.py +0 -1
- docling/document_converter.py +8 -6
- docling/models/api_vlm_model.py +1 -2
- docling/models/base_model.py +2 -4
- docling/models/base_ocr_model.py +2 -2
- docling/models/code_formula_model.py +2 -1
- docling/models/document_picture_classifier.py +2 -1
- docling/models/easyocr_model.py +10 -11
- docling/models/factories/__init__.py +2 -2
- docling/models/factories/base_factory.py +1 -1
- docling/models/hf_mlx_model.py +4 -6
- docling/models/hf_vlm_model.py +7 -5
- docling/models/layout_model.py +2 -2
- docling/models/ocr_mac_model.py +3 -4
- docling/models/page_assemble_model.py +7 -12
- docling/models/page_preprocessing_model.py +2 -1
- docling/models/picture_description_api_model.py +2 -1
- docling/models/picture_description_base_model.py +2 -3
- docling/models/picture_description_vlm_model.py +6 -4
- docling/models/rapid_ocr_model.py +2 -3
- docling/models/readingorder_model.py +9 -24
- docling/models/table_structure_model.py +4 -8
- docling/models/tesseract_ocr_cli_model.py +17 -16
- docling/models/tesseract_ocr_model.py +9 -5
- docling/pipeline/base_pipeline.py +4 -8
- docling/pipeline/simple_pipeline.py +0 -1
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/pipeline/vlm_pipeline.py +0 -3
- docling/utils/export.py +2 -4
- docling/utils/glm_utils.py +2 -2
- docling/utils/layout_postprocessor.py +4 -2
- docling/utils/model_downloader.py +31 -7
- docling/utils/utils.py +3 -3
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/METADATA +2 -1
- docling-2.31.1.dist-info/RECORD +86 -0
- docling-2.30.0.dist-info/RECORD +0 -86
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/LICENSE +0 -0
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/WHEEL +0 -0
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/entry_points.txt +0 -0
docling/models/base_model.py
CHANGED
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
-from
+from collections.abc import Iterable
+from typing import Generic, Optional, Protocol, Type

 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
 from typing_extensions import TypeVar
@@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)


 class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
-
     elements_batch_size: int = settings.perf.elements_batch_size

     @abstractmethod
@@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):


 class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
-
     def prepare_element(
         self, conv_res: ConversionResult, element: NodeItem
     ) -> Optional[NodeItem]:
@@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
 class BaseItemAndImageEnrichmentModel(
     GenericEnrichmentModel[ItemAndImageEnrichmentElement]
 ):
-
     images_scale: float
     expansion_factor: float = 0.0

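Most of the changes in this release follow one pattern: `Iterable` now comes from `collections.abc` instead of `typing`, and the remaining `typing` imports are spelled out explicitly. Below is a minimal sketch of the resulting style, with hypothetical class names (not docling's actual `BasePageModel`/`GenericEnrichmentModel` code):

```python
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import Generic, TypeVar

PageT = TypeVar("PageT")  # hypothetical stand-in for docling's Page type


class GenericPageModel(ABC, Generic[PageT]):
    """Page models consume an iterable of pages and lazily yield them back."""

    @abstractmethod
    def __call__(self, page_batch: Iterable[PageT]) -> Iterable[PageT]: ...


class PassThroughModel(GenericPageModel[str]):
    def __call__(self, page_batch: Iterable[str]) -> Iterable[str]:
        for page in page_batch:  # works with generators; nothing is materialized
            yield page


if __name__ == "__main__":
    print(list(PassThroughModel()(iter(["page-1", "page-2"]))))
```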
docling/models/base_ocr_model.py
CHANGED
@@ -1,12 +1,12 @@
 import copy
 import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import List, Optional, Type

 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
docling/models/code_formula_model.py
CHANGED
@@ -1,7 +1,8 @@
 import re
 from collections import Counter
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import List, Literal, Optional, Tuple, Union

 import numpy as np
 from docling_core.types.doc import (
docling/models/easyocr_model.py
CHANGED
@@ -1,8 +1,9 @@
 import logging
 import warnings
 import zipfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import List, Optional, Type

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
             device = decide_device(accelerator_options.device)
             # Enable easyocr GPU if running on CUDA, MPS
             use_gpu = any(
-
-
-
-
-                    AcceleratorDevice.MPS.value,
-                ]
+                device.startswith(x)
+                for x in [
+                    AcceleratorDevice.CUDA.value,
+                    AcceleratorDevice.MPS.value,
                 ]
             )
         else:
@@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel):
        progress: bool = False,
    ) -> Path:
        # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
-        from easyocr.config import
-
+        from easyocr.config import (
+            detection_models as det_models_dict,
+            recognition_models as rec_models_dict,
+        )

        if local_dir is None:
            local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
@@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel):
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
-
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
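The `use_gpu` expression in `EasyOcrModel.__init__` is now a plain generator passed to `any()`, checking whether the resolved device string starts with a CUDA or MPS prefix. A small stand-alone sketch of the same check, assuming the resolved device is a string such as "cpu", "cuda:0", or "mps" (the function name below is hypothetical):

```python
def should_use_gpu(device: str) -> bool:
    """Mirror of the reformatted expression: True for CUDA/MPS device strings.

    Assumes `device` is an already-resolved accelerator string such as
    "cpu", "cuda", "cuda:0", or "mps".
    """
    return any(device.startswith(x) for x in ["cuda", "mps"])


if __name__ == "__main__":
    for dev in ("cpu", "cuda:0", "mps", "xpu"):
        print(dev, should_use_gpu(dev))
```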
docling/models/factories/__init__.py
CHANGED
@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
 logger = logging.getLogger(__name__)


-@lru_cache
+@lru_cache
 def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     factory = OcrFactory()
     factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     return factory


-@lru_cache
+@lru_cache
 def get_picture_description_factory(
     allow_external_plugins: bool = False,
 ) -> PictureDescriptionFactory:
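Both factory getters keep the `@lru_cache` decorator, so repeated calls with the same `allow_external_plugins` value return the same cached factory instance instead of rescanning plugins. A sketch of the pattern; `OcrFactory` here is a simplified stand-in, not docling's real plugin factory:

```python
import logging
from functools import lru_cache

logger = logging.getLogger(__name__)


class OcrFactory:  # hypothetical stand-in for docling's plugin factory
    def __init__(self) -> None:
        self.plugins: list[str] = []

    def load_from_plugins(self, allow_external_plugins: bool = False) -> None:
        # Real code would scan entry points; here we only record the flag.
        self.plugins.append(f"external={allow_external_plugins}")


@lru_cache
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
    factory = OcrFactory()
    factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
    logger.info("Loaded OCR factory (external plugins: %s)", allow_external_plugins)
    return factory


if __name__ == "__main__":
    # Same argument -> same cached instance; different argument -> new instance.
    assert get_ocr_factory() is get_ocr_factory()
    assert get_ocr_factory(True) is not get_ocr_factory(False)
    print("factory caching works as expected")
```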
docling/models/factories/base_factory.py
CHANGED
@@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):

     @property
     def registered_kind(self) -> list[str]:
-        return
+        return [opt.kind for opt in self._classes.keys()]

     def get_enum(self) -> enum.Enum:
         return enum.Enum(
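`registered_kind` now returns the list comprehension directly. A self-contained sketch of what that property computes, assuming `_classes` maps option classes carrying a `kind` attribute to their implementations (the option classes below are hypothetical):

```python
class EasyOcrOptions:  # hypothetical option classes exposing a `kind` marker
    kind = "easyocr"


class TesseractOcrOptions:
    kind = "tesseract"


class BaseOcrFactory:
    def __init__(self) -> None:
        # Maps option classes to their implementations (implementations omitted here).
        self._classes = {EasyOcrOptions: object, TesseractOcrOptions: object}

    @property
    def registered_kind(self) -> list[str]:
        # Same shape as the diff: collect the `kind` of every registered option class.
        return [opt.kind for opt in self._classes.keys()]


if __name__ == "__main__":
    print(BaseOcrFactory().registered_kind)  # ['easyocr', 'tesseract']
```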
docling/models/hf_mlx_model.py
CHANGED
@@ -1,25 +1,22 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import Optional

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
-from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)


 class HuggingFaceMlxModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,
@@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
         self.vlm_options = vlm_options

         if self.enabled:
-
             try:
                 from mlx_vlm import generate, load  # type: ignore
                 from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
@@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
                 generation_time = time.time() - start_time
                 page_tags = output

+                _log.debug(f"Generation time {generation_time:.2f} seconds.")
+
                 # inference_time = time.time() - start_time
                 # tokens_per_second = num_tokens / generation_time
                 # print("")
docling/models/hf_vlm_model.py
CHANGED
@@ -1,16 +1,15 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import Optional

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
@@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)


 class HuggingFaceVlmModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,
@@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
             device = decide_device(accelerator_options.device)
             self.device = device

-            _log.debug("Available device for HuggingFace VLM: {}"
+            _log.debug(f"Available device for HuggingFace VLM: {device}")

             repo_cache_folder = vlm_options.repo_id.replace("/", "--")

@@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
                 num_tokens = len(generated_ids[0])
                 page_tags = generated_texts

+                _log.debug(
+                    f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+                )
+
                 # inference_time = time.time() - start_time
                 # tokens_per_second = num_tokens / generation_time
                 # print("")
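The new debug lines in both VLM models report the token count and wall-clock generation time via f-strings. A generic timing-and-logging sketch of the same idea, with `generate_fn` standing in for the actual model generation call (no transformers or mlx dependency):

```python
import logging
import time

_log = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)


def generate_with_timing(generate_fn, prompt: str) -> str:
    """Run a generation callable and log tokens/seconds, as the diff does."""
    start_time = time.time()
    output = generate_fn(prompt)
    generation_time = time.time() - start_time
    num_tokens = len(output.split())  # stand-in for len(generated_ids[0])
    _log.debug(
        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
    )
    return output


if __name__ == "__main__":
    print(generate_with_timing(lambda p: p + " <doctag>…</doctag>", "Convert this page."))
```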
docling/models/layout_model.py
CHANGED
@@ -1,8 +1,9 @@
 import copy
 import logging
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import Optional

 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
docling/models/ocr_mac_model.py
CHANGED
@@ -1,8 +1,9 @@
 import logging
 import sys
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import Optional, Type

 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):

         if self.enabled:
             if "darwin" != sys.platform:
-                raise RuntimeError(
+                raise RuntimeError("OcrMac is only supported on Mac.")
             install_errmsg = (
                 "ocrmac is not correctly installed. "
                 "Please install it via `pip install ocrmac` to use this OCR engine. "
@@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         if not self.enabled:
             yield from page_batch
             return
@@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
-
                     ocr_rects = self.get_ocr_rects(page)

                     all_ocr_cells = []
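The platform guard now raises on a single line. A sketch of that guard in isolation; `OcrMacLikeModel` is a hypothetical class, not docling's `OcrMacModel`:

```python
import sys


class OcrMacLikeModel:  # hypothetical: mirrors only the platform guard from the diff
    def __init__(self, enabled: bool) -> None:
        self.enabled = enabled
        if self.enabled:
            if "darwin" != sys.platform:
                raise RuntimeError("OcrMac is only supported on Mac.")


if __name__ == "__main__":
    try:
        OcrMacLikeModel(enabled=True)
        print("running on macOS")
    except RuntimeError as err:
        print(f"disabled: {err}")
```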
docling/models/page_assemble_model.py
CHANGED
@@ -1,6 +1,7 @@
 import logging
 import re
-from
+from collections.abc import Iterable
+from typing import List

 from pydantic import BaseModel

@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
        sanitized_text = "".join(lines)

        # Text normalization
-        sanitized_text = sanitized_text.replace("⁄", "/")
-        sanitized_text = sanitized_text.replace("’", "'")
-        sanitized_text = sanitized_text.replace("‘", "'")
+        sanitized_text = sanitized_text.replace("⁄", "/")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("’", "'")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("‘", "'")  # noqa: RUF001
        sanitized_text = sanitized_text.replace("“", '"')
        sanitized_text = sanitized_text.replace("”", '"')
        sanitized_text = sanitized_text.replace("•", "·")
@@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
                yield page
            else:
                with TimeRecorder(conv_res, "page_assemble"):
-
                    assert page.predictions.layout is not None

                    # assembles some JSON output page by page.
@@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
                    for cluster in page.predictions.layout.clusters:
                        # _log.info("Cluster label seen:", cluster.label)
                        if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
-
                            textlines = [
                                cell.text.replace("\x02", "-").strip()
                                for cell in cluster.cells
@@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
                            tbl = page.predictions.tablestructure.table_map.get(
                                cluster.id, None
                            )
-                            if
-                                not tbl
-                            ):  # fallback: add table without structure, if it isn't present
+                            if not tbl:  # fallback: add table without structure, if it isn't present
                                tbl = Table(
                                    label=cluster.label,
                                    id=cluster.id,
@@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
                            fig = page.predictions.figures_classification.figure_map.get(
                                cluster.id, None
                            )
-                            if
-                                not fig
-                            ):  # fallback: add figure without classification, if it isn't present
+                            if not fig:  # fallback: add figure without classification, if it isn't present
                                fig = FigureElement(
                                    label=cluster.label,
                                    id=cluster.id,
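The sanitization chain maps typographic characters to ASCII look-alikes, and the new `# noqa: RUF001` comments silence ruff's ambiguous-unicode-character warning on those lines. The same mapping could also be expressed with a single translation table; the sketch below is an equivalent formulation, not the code docling ships:

```python
# Equivalent of the chained .replace() calls, using one translation table.
_NORMALIZATION_TABLE = str.maketrans(
    {
        "⁄": "/",   # fraction slash (the character flagged by RUF001)
        "’": "'",   # right single quotation mark
        "‘": "'",   # left single quotation mark
        "“": '"',   # left double quotation mark
        "”": '"',   # right double quotation mark
        "•": "·",   # bullet -> middle dot
    }
)


def sanitize_text(text: str) -> str:
    return text.translate(_NORMALIZATION_TABLE)


if __name__ == "__main__":
    print(sanitize_text("“Hello” • it’s 1⁄2"))  # "Hello" · it's 1/2
```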
docling/models/picture_description_base_model.py
CHANGED
@@ -1,12 +1,11 @@
-import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import List, Optional, Type, Union

 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
-    PictureClassificationClass,
     PictureItem,
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
docling/models/picture_description_vlm_model.py
CHANGED
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import Optional, Type, Union

 from PIL import Image

@@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device


 class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
-
     @classmethod
     def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
         return PictureDescriptionVlmOptions
@@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
         self.options: PictureDescriptionVlmOptions

         if self.enabled:
-
             if artifacts_path is None:
                 artifacts_path = self.download_models(repo_id=self.options.repo_id)
             else:
@@ -58,7 +57,10 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
                 artifacts_path,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation=(
-                    "flash_attention_2"
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "eager"
                 ),
             ).to(self.device)

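The `_attn_implementation` argument is now a multi-line conditional: Flash Attention 2 only when the resolved device is CUDA and `cuda_use_flash_attention2` is enabled, otherwise eager attention. A pure-Python sketch of that decision; the `AcceleratorOptions` dataclass below is a stand-in for docling's options model:

```python
from dataclasses import dataclass


@dataclass
class AcceleratorOptions:  # hypothetical stand-in for docling's options object
    cuda_use_flash_attention2: bool = False


def pick_attn_implementation(device: str, accelerator_options: AcceleratorOptions) -> str:
    """Mirror of the reformatted conditional in the diff."""
    return (
        "flash_attention_2"
        if device.startswith("cuda") and accelerator_options.cuda_use_flash_attention2
        else "eager"
    )


if __name__ == "__main__":
    print(pick_attn_implementation("cuda:0", AcceleratorOptions(True)))  # flash_attention_2
    print(pick_attn_implementation("cpu", AcceleratorOptions(True)))     # eager
```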
docling/models/rapid_ocr_model.py
CHANGED
@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import Optional, Type

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         if not self.enabled:
             yield from page_batch
             return

         for page in page_batch:
-
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
docling/models/readingorder_model.py
CHANGED
@@ -1,12 +1,7 @@
-import copy
-import random
 from pathlib import Path
 from typing import Dict, List

 from docling_core.types.doc import (
-    BoundingBox,
-    CoordOrigin,
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
     TableData,
 )
 from docling_core.types.doc.document import ContentLayer
-from docling_core.types.legacy_doc.base import Ref
-from docling_core.types.legacy_doc.document import BaseText
 from docling_ibm_models.reading_order.reading_order_rb import (
     PageElement as ReadingOrderPageElement,
+    ReadingOrderPredictor,
 )
-from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
-from PIL import ImageDraw
 from pydantic import BaseModel, ConfigDict

 from docling.datamodel.base_models import (
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
     TextElement,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.settings import settings
 from docling.utils.profiling import ProfilingScope, TimeRecorder


@@ -53,12 +44,10 @@ class ReadingOrderModel:
     def _assembled_to_readingorder_elements(
         self, conv_res: ConversionResult
     ) -> List[ReadingOrderPageElement]:
-
         elements: List[ReadingOrderPageElement] = []
         page_no_to_pages = {p.page_no: p for p in conv_res.pages}

         for element in conv_res.assembled.elements:
-
             page_height = page_no_to_pages[element.page_no].size.height  # type: ignore
             bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
             text = element.text or ""
@@ -84,7 +73,6 @@ class ReadingOrderModel:
     def _add_child_elements(
         self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
     ):
-
         child: Cluster
         for child in element.cluster.children:
             c_label = child.label
@@ -110,7 +98,7 @@ class ReadingOrderModel:
         else:
             doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)

-    def _readingorder_elements_to_docling_doc(
+    def _readingorder_elements_to_docling_doc(  # noqa: C901
         self,
         conv_res: ConversionResult,
         ro_elements: List[ReadingOrderPageElement],
@@ -118,7 +106,6 @@ class ReadingOrderModel:
         el_to_footnotes_mapping: Dict[int, List[int]],
         el_merges_mapping: Dict[int, List[int]],
     ) -> DoclingDocument:
-
         id_to_elem = {
             RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
             for elem in conv_res.assembled.elements
@@ -192,7 +179,6 @@ class ReadingOrderModel:

                 code_item.footnotes.append(new_footnote_item.get_ref())
             else:
-
                 new_item, current_list = self._handle_text_element(
                     element, out_doc, current_list, page_height
                 )
@@ -206,7 +192,6 @@ class ReadingOrderModel:
                 )

             elif isinstance(element, Table):
-
                 tbl_data = TableData(
                     num_rows=element.num_rows,
                     num_cols=element.num_cols,
@@ -342,12 +327,12 @@ class ReadingOrderModel:
         return new_item, current_list

     def _merge_elements(self, element, merged_elem, new_item, page_height):
-        assert isinstance(
-
-        )
-        assert (
-
-        )
+        assert isinstance(merged_elem, type(element)), (
+            "Merged element must be of same type as element."
+        )
+        assert merged_elem.label == new_item.label, (
+            "Labels of merged elements must match."
+        )
         prov = ProvenanceItem(
             page_no=element.page_no + 1,
             charspan=(
@@ -361,7 +346,7 @@ class ReadingOrderModel:
         new_item.prov.append(prov)

     def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
-        with TimeRecorder(conv_res, "
+        with TimeRecorder(conv_res, "reading_order", scope=ProfilingScope.DOCUMENT):
             page_elements = self._assembled_to_readingorder_elements(conv_res)

             # Apply reading order
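The asserts in `_merge_elements` are reformatted so the failing condition stays on the `assert` line and the message sits in parentheses on its own line. A tiny self-contained example of the same assert style, using a hypothetical `TextElement` rather than docling's element types:

```python
from dataclasses import dataclass


@dataclass
class TextElement:  # hypothetical element with a label, as in the diff
    label: str
    text: str


def merge_elements(element: TextElement, merged_elem: TextElement) -> TextElement:
    # Condition on the assert line, message parenthesized on its own line.
    assert isinstance(merged_elem, type(element)), (
        "Merged element must be of same type as element."
    )
    assert merged_elem.label == element.label, (
        "Labels of merged elements must match."
    )
    return TextElement(label=element.label, text=element.text + " " + merged_elem.text)


if __name__ == "__main__":
    a = TextElement("text", "Reading")
    b = TextElement("text", "order")
    print(merge_elements(a, b).text)  # Reading order
```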
docling/models/table_structure_model.py
CHANGED
@@ -1,13 +1,13 @@
 import copy
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing
+from typing import Optional

 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_core.types.doc.page import (
     BoundingRectangle,
-    SegmentedPdfPage,
     TextCellUnit,
 )
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
@@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):

         self.enabled = enabled
         if self.enabled:
-
             if artifacts_path is None:
                 artifacts_path = self.download_models() / self._model_path
             else:
@@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         if not self.enabled:
             yield from page_batch
             return
@@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "table_structure"):
-
                     assert page.predictions.layout is not None
                     assert page.size is not None

@@ -237,7 +234,7 @@ class TableStructureModel(BasePageModel):
                        tcells = table_cluster.cells
                        tokens = []
                        for c in tcells:
-                            # Only allow non empty
+                            # Only allow non empty strings (spaces) into the cells of a table
                            if len(c.text.strip()) > 0:
                                new_cell = copy.deepcopy(c)
                                new_cell.rect = BoundingRectangle.from_bounding_box(
@@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
                        table_out = tf_output[0]
                        table_cells = []
                        for element in table_out["tf_responses"]:
-
                            if not self.do_cell_matching:
                                the_bbox = BoundingBox.model_validate(
                                    element["bbox"]
@@ -271,7 +267,7 @@ class TableStructureModel(BasePageModel):
                                element["bbox"]["token"] = text_piece

                            tc = TableCell.model_validate(element)
-                            if
+                            if tc.bbox is not None:
                                tc.bbox = tc.bbox.scaled(1 / self.scale)
                            table_cells.append(tc)

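The cell loop now rescales a bounding box only when `tc.bbox` is not `None`. A sketch of that guard with a minimal bounding-box type; `BBox` and `Cell` are illustrative stand-ins, not `docling_core`'s `BoundingBox`/`TableCell`:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class BBox:  # minimal stand-in for a bounding box with a scaled() helper
    left: float
    top: float
    right: float
    bottom: float

    def scaled(self, factor: float) -> "BBox":
        return BBox(
            self.left * factor, self.top * factor, self.right * factor, self.bottom * factor
        )


@dataclass
class Cell:
    text: str
    bbox: Optional[BBox] = None


def rescale_cells(cells: list[Cell], scale: float) -> list[Cell]:
    """Guarded rescaling, as in the diff: skip cells without a bounding box."""
    for tc in cells:
        if tc.bbox is not None:
            tc.bbox = tc.bbox.scaled(1 / scale)
    return cells


if __name__ == "__main__":
    cells = [Cell("A1", BBox(0, 0, 20, 10)), Cell("", None)]
    print(rescale_cells(cells, scale=2.0))
```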