docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +1 -1
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +21 -13
- docling/backend/docling_parse_v2_backend.py +20 -12
- docling/backend/docling_parse_v4_backend.py +192 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +271 -0
- docling/backend/docx/latex/omml.py +453 -0
- docling/backend/html_backend.py +7 -7
- docling/backend/md_backend.py +1 -1
- docling/backend/msexcel_backend.py +2 -45
- docling/backend/mspowerpoint_backend.py +19 -1
- docling/backend/msword_backend.py +68 -3
- docling/backend/pdf_backend.py +7 -2
- docling/backend/pypdfium2_backend.py +52 -30
- docling/backend/xml/uspto_backend.py +1 -1
- docling/cli/main.py +135 -53
- docling/cli/models.py +1 -1
- docling/datamodel/base_models.py +8 -10
- docling/datamodel/pipeline_options.py +54 -32
- docling/document_converter.py +5 -5
- docling/models/base_model.py +9 -1
- docling/models/base_ocr_model.py +27 -16
- docling/models/easyocr_model.py +28 -13
- docling/models/factories/__init__.py +27 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/hf_mlx_model.py +137 -0
- docling/models/ocr_mac_model.py +39 -11
- docling/models/page_preprocessing_model.py +4 -0
- docling/models/picture_description_api_model.py +20 -3
- docling/models/picture_description_base_model.py +19 -3
- docling/models/picture_description_vlm_model.py +14 -2
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +28 -0
- docling/models/rapid_ocr_model.py +34 -13
- docling/models/table_structure_model.py +13 -4
- docling/models/tesseract_ocr_cli_model.py +40 -15
- docling/models/tesseract_ocr_model.py +37 -12
- docling/pipeline/standard_pdf_pipeline.py +25 -78
- docling/pipeline/vlm_pipeline.py +78 -398
- docling/utils/export.py +8 -6
- docling/utils/layout_postprocessor.py +26 -23
- docling/utils/visualization.py +1 -1
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
- docling-2.28.0.dist-info/RECORD +84 -0
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
- docling-2.26.0.dist-info/RECORD +0 -72
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,137 @@
|
|
1
|
+
import logging
|
2
|
+
import time
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Iterable, List, Optional
|
5
|
+
|
6
|
+
from docling.datamodel.base_models import Page, VlmPrediction
|
7
|
+
from docling.datamodel.document import ConversionResult
|
8
|
+
from docling.datamodel.pipeline_options import (
|
9
|
+
AcceleratorDevice,
|
10
|
+
AcceleratorOptions,
|
11
|
+
HuggingFaceVlmOptions,
|
12
|
+
)
|
13
|
+
from docling.datamodel.settings import settings
|
14
|
+
from docling.models.base_model import BasePageModel
|
15
|
+
from docling.utils.accelerator_utils import decide_device
|
16
|
+
from docling.utils.profiling import TimeRecorder
|
17
|
+
|
18
|
+
_log = logging.getLogger(__name__)
|
19
|
+
|
20
|
+
|
21
|
+
class HuggingFaceMlxModel(BasePageModel):
    """Page model that runs a Hugging Face vision-language model through mlx-vlm.

    For each valid page, the page image is rendered, the model is prompted with
    ``vlm_options.prompt`` and the generated text is stored in
    ``page.predictions.vlm_response``.
    """

    # Closing tag after which generation is stopped.
    _END_TAG = "</doctag>"

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        vlm_options: HuggingFaceVlmOptions,
    ):
        """Load the model and processor when the model is enabled.

        Args:
            enabled: When false, nothing is imported or loaded.
            artifacts_path: Directory holding the model files. When ``None``
                the repository is downloaded via :meth:`download_models`. When
                it contains a sub-folder named after the repo id (with ``/``
                replaced by ``--``), that sub-folder is used instead.
            accelerator_options: Accepted for interface compatibility; not
                read by this model.
            vlm_options: Supplies ``repo_id`` and ``prompt``.

        Raises:
            ImportError: If ``mlx-vlm`` cannot be imported.
        """
        self.enabled = enabled

        self.vlm_options = vlm_options

        if self.enabled:
            try:
                from mlx_vlm import load  # type: ignore
                from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
                from mlx_vlm.utils import load_config, stream_generate  # type: ignore
            except ImportError as err:
                # Chain explicitly so the underlying import failure stays visible.
                raise ImportError(
                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
                ) from err

            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
            self.apply_chat_template = apply_chat_template
            self.stream_generate = stream_generate

            # PARAMETERS:
            if artifacts_path is None:
                artifacts_path = self.download_models(self.vlm_options.repo_id)
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder

            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."

            ## Load the model
            self.vlm_model, self.processor = load(artifacts_path)
            self.config = load_config(artifacts_path)

    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        """Download a snapshot of ``repo_id`` with ``huggingface_hub``.

        Args:
            repo_id: Repository identifier passed to ``snapshot_download``.
            local_dir: Forwarded as ``local_dir``.
            force: Forwarded as ``force_download``.
            progress: When false, ``disable_progress_bars()`` is called first.

        Returns:
            The path returned by ``snapshot_download``, as a ``Path``.
        """
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars

        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id=repo_id,
            force_download=force,
            local_dir=local_dir,
            # revision="v0.0.1",
        )

        return Path(download_path)

    def _generate_page_tags(self, image) -> str:
        """Stream the model output for ``image`` and return it as one string.

        Generation stops as soon as the closing tag has been produced, even
        when the tag is split across several streamed segments.
        """
        prompt = self.apply_chat_template(
            self.processor, self.config, self.param_question, num_images=1
        )

        pieces: List[str] = []
        # Rolling window: the unmatched end of the stream plus the newest segment.
        tail = ""
        for token in self.stream_generate(
            self.vlm_model,
            self.processor,
            prompt,
            [image],
            max_tokens=4096,
            verbose=False,
        ):
            pieces.append(token.text)
            tail += token.text
            if self._END_TAG in tail:
                break
            # Keep just enough characters to recognise a tag that straddles
            # two segments.
            tail = tail[-(len(self._END_TAG) - 1) :]

        return "".join(pieces)

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Yield every page of ``page_batch``, adding a VLM prediction to valid ones.

        Pages whose backend is not valid are yielded untouched. For all other
        pages ``page.predictions.vlm_response`` is set; its text is empty when
        no page image could be obtained.
        """
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "vlm"):
                assert page.size is not None

                hi_res_image = page.get_image(scale=2.0)  # 144dpi

                # populate page_tags with predicted doc tags
                page_tags = ""

                # Only run the model when an image is actually available.
                if hi_res_image is not None:
                    if hi_res_image.mode != "RGB":
                        hi_res_image = hi_res_image.convert("RGB")
                    page_tags = self._generate_page_tags(hi_res_image)

                page.predictions.vlm_response = VlmPrediction(text=page_tags)

            yield page
|
docling/models/ocr_mac_model.py
CHANGED
@@ -1,12 +1,19 @@
|
|
1
1
|
import logging
|
2
|
+
import sys
|
2
3
|
import tempfile
|
3
|
-
from
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Iterable, Optional, Tuple, Type
|
4
6
|
|
5
7
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
8
|
+
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
6
9
|
|
7
|
-
from docling.datamodel.base_models import
|
10
|
+
from docling.datamodel.base_models import Page
|
8
11
|
from docling.datamodel.document import ConversionResult
|
9
|
-
from docling.datamodel.pipeline_options import
|
12
|
+
from docling.datamodel.pipeline_options import (
|
13
|
+
AcceleratorOptions,
|
14
|
+
OcrMacOptions,
|
15
|
+
OcrOptions,
|
16
|
+
)
|
10
17
|
from docling.datamodel.settings import settings
|
11
18
|
from docling.models.base_ocr_model import BaseOcrModel
|
12
19
|
from docling.utils.profiling import TimeRecorder
|
@@ -15,18 +22,31 @@ _log = logging.getLogger(__name__)
|
|
15
22
|
|
16
23
|
|
17
24
|
class OcrMacModel(BaseOcrModel):
|
18
|
-
def __init__(
|
19
|
-
|
25
|
+
def __init__(
|
26
|
+
self,
|
27
|
+
enabled: bool,
|
28
|
+
artifacts_path: Optional[Path],
|
29
|
+
options: OcrMacOptions,
|
30
|
+
accelerator_options: AcceleratorOptions,
|
31
|
+
):
|
32
|
+
super().__init__(
|
33
|
+
enabled=enabled,
|
34
|
+
artifacts_path=artifacts_path,
|
35
|
+
options=options,
|
36
|
+
accelerator_options=accelerator_options,
|
37
|
+
)
|
20
38
|
self.options: OcrMacOptions
|
21
39
|
|
22
40
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
23
41
|
|
24
42
|
if self.enabled:
|
43
|
+
if "darwin" != sys.platform:
|
44
|
+
raise RuntimeError(f"OcrMac is only supported on Mac.")
|
25
45
|
install_errmsg = (
|
26
46
|
"ocrmac is not correctly installed. "
|
27
47
|
"Please install it via `pip install ocrmac` to use this OCR engine. "
|
28
48
|
"Alternatively, Docling has support for other OCR engines. See the documentation: "
|
29
|
-
"https://
|
49
|
+
"https://docling-project.github.io/docling/installation/"
|
30
50
|
)
|
31
51
|
try:
|
32
52
|
from ocrmac import ocrmac
|
@@ -94,13 +114,17 @@ class OcrMacModel(BaseOcrModel):
|
|
94
114
|
bottom = y2 / self.scale
|
95
115
|
|
96
116
|
cells.append(
|
97
|
-
|
98
|
-
|
117
|
+
TextCell(
|
118
|
+
index=ix,
|
99
119
|
text=text,
|
120
|
+
orig=text,
|
121
|
+
from_ocr=True,
|
100
122
|
confidence=confidence,
|
101
|
-
|
102
|
-
|
103
|
-
|
123
|
+
rect=BoundingRectangle.from_bounding_box(
|
124
|
+
BoundingBox.from_tuple(
|
125
|
+
coord=(left, top, right, bottom),
|
126
|
+
origin=CoordOrigin.TOPLEFT,
|
127
|
+
)
|
104
128
|
),
|
105
129
|
)
|
106
130
|
)
|
@@ -116,3 +140,7 @@ class OcrMacModel(BaseOcrModel):
|
|
116
140
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
117
141
|
|
118
142
|
yield page
|
143
|
+
|
144
|
+
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
    """Return the options class this OCR model is configured with."""
    return OcrMacOptions
|
@@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder
|
|
13
13
|
|
14
14
|
class PagePreprocessingOptions(BaseModel):
    # Options read by the page preprocessing model.
    images_scale: Optional[float]
    # When true, page.parsed_page is filled from the backend's
    # get_segmented_page().
    create_parsed_page: bool
|
16
17
|
|
17
18
|
|
18
19
|
class PagePreprocessingModel(BasePageModel):
|
@@ -55,6 +56,9 @@ class PagePreprocessingModel(BasePageModel):
|
|
55
56
|
|
56
57
|
page.cells = list(page._backend.get_text_cells())
|
57
58
|
|
59
|
+
if self.options.create_parsed_page:
|
60
|
+
page.parsed_page = page._backend.get_segmented_page()
|
61
|
+
|
58
62
|
# DEBUG code:
|
59
63
|
def draw_text_boxes(image, cells, show: bool = False):
|
60
64
|
draw = ImageDraw.Draw(image)
|
@@ -1,13 +1,18 @@
|
|
1
1
|
import base64
|
2
2
|
import io
|
3
3
|
import logging
|
4
|
-
from
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Iterable, List, Optional, Type, Union
|
5
6
|
|
6
7
|
import requests
|
7
8
|
from PIL import Image
|
8
9
|
from pydantic import BaseModel, ConfigDict
|
9
10
|
|
10
|
-
from docling.datamodel.pipeline_options import
|
11
|
+
from docling.datamodel.pipeline_options import (
|
12
|
+
AcceleratorOptions,
|
13
|
+
PictureDescriptionApiOptions,
|
14
|
+
PictureDescriptionBaseOptions,
|
15
|
+
)
|
11
16
|
from docling.exceptions import OperationNotAllowed
|
12
17
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
13
18
|
|
@@ -46,13 +51,25 @@ class ApiResponse(BaseModel):
|
|
46
51
|
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
47
52
|
# elements_batch_size = 4
|
48
53
|
|
54
|
+
@classmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
    """Return the options class this picture description model is configured with."""
    return PictureDescriptionApiOptions
|
57
|
+
|
49
58
|
def __init__(
|
50
59
|
self,
|
51
60
|
enabled: bool,
|
52
61
|
enable_remote_services: bool,
|
62
|
+
artifacts_path: Optional[Union[Path, str]],
|
53
63
|
options: PictureDescriptionApiOptions,
|
64
|
+
accelerator_options: AcceleratorOptions,
|
54
65
|
):
|
55
|
-
super().__init__(
|
66
|
+
super().__init__(
|
67
|
+
enabled=enabled,
|
68
|
+
enable_remote_services=enable_remote_services,
|
69
|
+
artifacts_path=artifacts_path,
|
70
|
+
options=options,
|
71
|
+
accelerator_options=accelerator_options,
|
72
|
+
)
|
56
73
|
self.options: PictureDescriptionApiOptions
|
57
74
|
|
58
75
|
if self.enabled:
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import logging
|
2
|
+
from abc import abstractmethod
|
2
3
|
from pathlib import Path
|
3
|
-
from typing import Any, Iterable, List, Optional, Union
|
4
|
+
from typing import Any, Iterable, List, Optional, Type, Union
|
4
5
|
|
5
6
|
from docling_core.types.doc import (
|
6
7
|
DoclingDocument,
|
@@ -13,20 +14,30 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
|
|
13
14
|
)
|
14
15
|
from PIL import Image
|
15
16
|
|
16
|
-
from docling.datamodel.pipeline_options import
|
17
|
+
from docling.datamodel.pipeline_options import (
|
18
|
+
AcceleratorOptions,
|
19
|
+
PictureDescriptionBaseOptions,
|
20
|
+
)
|
17
21
|
from docling.models.base_model import (
|
18
22
|
BaseItemAndImageEnrichmentModel,
|
23
|
+
BaseModelWithOptions,
|
19
24
|
ItemAndImageEnrichmentElement,
|
20
25
|
)
|
21
26
|
|
22
27
|
|
23
|
-
class PictureDescriptionBaseModel(
|
28
|
+
class PictureDescriptionBaseModel(
|
29
|
+
BaseItemAndImageEnrichmentModel, BaseModelWithOptions
|
30
|
+
):
|
24
31
|
images_scale: float = 2.0
|
25
32
|
|
26
33
|
def __init__(
|
27
34
|
self,
|
35
|
+
*,
|
28
36
|
enabled: bool,
|
37
|
+
enable_remote_services: bool,
|
38
|
+
artifacts_path: Optional[Union[Path, str]],
|
29
39
|
options: PictureDescriptionBaseOptions,
|
40
|
+
accelerator_options: AcceleratorOptions,
|
30
41
|
):
|
31
42
|
self.enabled = enabled
|
32
43
|
self.options = options
|
@@ -62,3 +73,8 @@ class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
|
|
62
73
|
PictureDescriptionData(text=output, provenance=self.provenance)
|
63
74
|
)
|
64
75
|
yield item
|
76
|
+
|
77
|
+
@classmethod
@abstractmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
    """Return the options class of the concrete model; subclasses must override."""
|
@@ -1,10 +1,11 @@
|
|
1
1
|
from pathlib import Path
|
2
|
-
from typing import Iterable, Optional, Union
|
2
|
+
from typing import Iterable, Optional, Type, Union
|
3
3
|
|
4
4
|
from PIL import Image
|
5
5
|
|
6
6
|
from docling.datamodel.pipeline_options import (
|
7
7
|
AcceleratorOptions,
|
8
|
+
PictureDescriptionBaseOptions,
|
8
9
|
PictureDescriptionVlmOptions,
|
9
10
|
)
|
10
11
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
@@ -13,14 +14,25 @@ from docling.utils.accelerator_utils import decide_device
|
|
13
14
|
|
14
15
|
class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
15
16
|
|
17
|
+
@classmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
    """Return the options class this picture description model is configured with."""
    return PictureDescriptionVlmOptions
|
20
|
+
|
16
21
|
def __init__(
|
17
22
|
self,
|
18
23
|
enabled: bool,
|
24
|
+
enable_remote_services: bool,
|
19
25
|
artifacts_path: Optional[Union[Path, str]],
|
20
26
|
options: PictureDescriptionVlmOptions,
|
21
27
|
accelerator_options: AcceleratorOptions,
|
22
28
|
):
|
23
|
-
super().__init__(
|
29
|
+
super().__init__(
|
30
|
+
enabled=enabled,
|
31
|
+
enable_remote_services=enable_remote_services,
|
32
|
+
artifacts_path=artifacts_path,
|
33
|
+
options=options,
|
34
|
+
accelerator_options=accelerator_options,
|
35
|
+
)
|
24
36
|
self.options: PictureDescriptionVlmOptions
|
25
37
|
|
26
38
|
if self.enabled:
|
File without changes
|
@@ -0,0 +1,28 @@
|
|
1
|
+
from docling.models.easyocr_model import EasyOcrModel
|
2
|
+
from docling.models.ocr_mac_model import OcrMacModel
|
3
|
+
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
4
|
+
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
5
|
+
from docling.models.rapid_ocr_model import RapidOcrModel
|
6
|
+
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
7
|
+
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
8
|
+
|
9
|
+
|
10
|
+
def ocr_engines():
    """Return the built-in OCR model classes under the "ocr_engines" key."""
    engine_classes = [
        EasyOcrModel,
        OcrMacModel,
        RapidOcrModel,
        TesseractOcrModel,
        TesseractOcrCliModel,
    ]
    return {"ocr_engines": engine_classes}
|
20
|
+
|
21
|
+
|
22
|
+
def picture_description():
    """Return the built-in picture description model classes under the "picture_description" key."""
    model_classes = [
        PictureDescriptionVlmModel,
        PictureDescriptionApiModel,
    ]
    return {"picture_description": model_classes}
|
@@ -1,14 +1,17 @@
|
|
1
1
|
import logging
|
2
|
-
from
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Iterable, Optional, Type
|
3
4
|
|
4
5
|
import numpy
|
5
6
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
7
|
+
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
6
8
|
|
7
|
-
from docling.datamodel.base_models import
|
9
|
+
from docling.datamodel.base_models import Page
|
8
10
|
from docling.datamodel.document import ConversionResult
|
9
11
|
from docling.datamodel.pipeline_options import (
|
10
12
|
AcceleratorDevice,
|
11
13
|
AcceleratorOptions,
|
14
|
+
OcrOptions,
|
12
15
|
RapidOcrOptions,
|
13
16
|
)
|
14
17
|
from docling.datamodel.settings import settings
|
@@ -23,10 +26,16 @@ class RapidOcrModel(BaseOcrModel):
|
|
23
26
|
def __init__(
|
24
27
|
self,
|
25
28
|
enabled: bool,
|
29
|
+
artifacts_path: Optional[Path],
|
26
30
|
options: RapidOcrOptions,
|
27
31
|
accelerator_options: AcceleratorOptions,
|
28
32
|
):
|
29
|
-
super().__init__(
|
33
|
+
super().__init__(
|
34
|
+
enabled=enabled,
|
35
|
+
artifacts_path=artifacts_path,
|
36
|
+
options=options,
|
37
|
+
accelerator_options=accelerator_options,
|
38
|
+
)
|
30
39
|
self.options: RapidOcrOptions
|
31
40
|
|
32
41
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
@@ -100,18 +109,26 @@ class RapidOcrModel(BaseOcrModel):
|
|
100
109
|
|
101
110
|
if result is not None:
|
102
111
|
cells = [
|
103
|
-
|
104
|
-
|
112
|
+
TextCell(
|
113
|
+
index=ix,
|
105
114
|
text=line[1],
|
115
|
+
orig=line[1],
|
106
116
|
confidence=line[2],
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
(
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
117
|
+
from_ocr=True,
|
118
|
+
rect=BoundingRectangle.from_bounding_box(
|
119
|
+
BoundingBox.from_tuple(
|
120
|
+
coord=(
|
121
|
+
(line[0][0][0] / self.scale)
|
122
|
+
+ ocr_rect.l,
|
123
|
+
(line[0][0][1] / self.scale)
|
124
|
+
+ ocr_rect.t,
|
125
|
+
(line[0][2][0] / self.scale)
|
126
|
+
+ ocr_rect.l,
|
127
|
+
(line[0][2][1] / self.scale)
|
128
|
+
+ ocr_rect.t,
|
129
|
+
),
|
130
|
+
origin=CoordOrigin.TOPLEFT,
|
131
|
+
)
|
115
132
|
),
|
116
133
|
)
|
117
134
|
for ix, line in enumerate(result)
|
@@ -126,3 +143,7 @@ class RapidOcrModel(BaseOcrModel):
|
|
126
143
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
127
144
|
|
128
145
|
yield page
|
146
|
+
|
147
|
+
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
    """Return the options class this OCR model is configured with."""
    return RapidOcrOptions
|
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union
|
|
5
5
|
|
6
6
|
import numpy
|
7
7
|
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
8
|
+
from docling_core.types.doc.page import BoundingRectangle
|
8
9
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
9
10
|
from PIL import ImageDraw
|
10
11
|
|
@@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
|
|
129
130
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
130
131
|
|
131
132
|
for cell in table_element.cluster.cells:
|
132
|
-
x0, y0, x1, y1 = cell.
|
133
|
+
x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
|
133
134
|
x0 *= scale_x
|
134
135
|
x1 *= scale_x
|
135
136
|
y0 *= scale_x
|
@@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
|
|
223
224
|
# Only allow non-empty strings (not just whitespace) into the cells of a table
|
224
225
|
if len(c.text.strip()) > 0:
|
225
226
|
new_cell = copy.deepcopy(c)
|
226
|
-
new_cell.
|
227
|
-
|
227
|
+
new_cell.rect = BoundingRectangle.from_bounding_box(
|
228
|
+
new_cell.rect.to_bounding_box().scaled(
|
229
|
+
scale=self.scale
|
230
|
+
)
|
228
231
|
)
|
229
232
|
|
230
|
-
tokens.append(
|
233
|
+
tokens.append(
|
234
|
+
{
|
235
|
+
"id": new_cell.index,
|
236
|
+
"text": new_cell.text,
|
237
|
+
"bbox": new_cell.rect.to_bounding_box().model_dump(),
|
238
|
+
}
|
239
|
+
)
|
231
240
|
page_input["tokens"] = tokens
|
232
241
|
|
233
242
|
tf_output = self.tf_predictor.multi_table_predict(
|
@@ -3,15 +3,21 @@ import io
|
|
3
3
|
import logging
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
+
from pathlib import Path
|
6
7
|
from subprocess import DEVNULL, PIPE, Popen
|
7
|
-
from typing import Iterable, List, Optional, Tuple
|
8
|
+
from typing import Iterable, List, Optional, Tuple, Type
|
8
9
|
|
9
10
|
import pandas as pd
|
10
11
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
12
|
+
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
11
13
|
|
12
|
-
from docling.datamodel.base_models import
|
14
|
+
from docling.datamodel.base_models import Page
|
13
15
|
from docling.datamodel.document import ConversionResult
|
14
|
-
from docling.datamodel.pipeline_options import
|
16
|
+
from docling.datamodel.pipeline_options import (
|
17
|
+
AcceleratorOptions,
|
18
|
+
OcrOptions,
|
19
|
+
TesseractCliOcrOptions,
|
20
|
+
)
|
15
21
|
from docling.datamodel.settings import settings
|
16
22
|
from docling.models.base_ocr_model import BaseOcrModel
|
17
23
|
from docling.utils.ocr_utils import map_tesseract_script
|
@@ -21,8 +27,19 @@ _log = logging.getLogger(__name__)
|
|
21
27
|
|
22
28
|
|
23
29
|
class TesseractOcrCliModel(BaseOcrModel):
|
24
|
-
def __init__(
|
25
|
-
|
30
|
+
def __init__(
|
31
|
+
self,
|
32
|
+
enabled: bool,
|
33
|
+
artifacts_path: Optional[Path],
|
34
|
+
options: TesseractCliOcrOptions,
|
35
|
+
accelerator_options: AcceleratorOptions,
|
36
|
+
):
|
37
|
+
super().__init__(
|
38
|
+
enabled=enabled,
|
39
|
+
artifacts_path=artifacts_path,
|
40
|
+
options=options,
|
41
|
+
accelerator_options=accelerator_options,
|
42
|
+
)
|
26
43
|
self.options: TesseractCliOcrOptions
|
27
44
|
|
28
45
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
@@ -228,18 +245,22 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
228
245
|
t = b + h
|
229
246
|
r = l + w
|
230
247
|
|
231
|
-
cell =
|
232
|
-
|
248
|
+
cell = TextCell(
|
249
|
+
index=ix,
|
233
250
|
text=text,
|
251
|
+
orig=text,
|
252
|
+
from_ocr=True,
|
234
253
|
confidence=conf / 100.0,
|
235
|
-
|
236
|
-
|
237
|
-
(
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
254
|
+
rect=BoundingRectangle.from_bounding_box(
|
255
|
+
BoundingBox.from_tuple(
|
256
|
+
coord=(
|
257
|
+
(l / self.scale) + ocr_rect.l,
|
258
|
+
(b / self.scale) + ocr_rect.t,
|
259
|
+
(r / self.scale) + ocr_rect.l,
|
260
|
+
(t / self.scale) + ocr_rect.t,
|
261
|
+
),
|
262
|
+
origin=CoordOrigin.TOPLEFT,
|
263
|
+
)
|
243
264
|
),
|
244
265
|
)
|
245
266
|
all_ocr_cells.append(cell)
|
@@ -252,3 +273,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
252
273
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
253
274
|
|
254
275
|
yield page
|
276
|
+
|
277
|
+
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
    """Return the options class this OCR model is configured with."""
    return TesseractCliOcrOptions
|