docling 2.29.0__py3-none-any.whl → 2.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +7 -15
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +2 -2
- docling/backend/docling_parse_v2_backend.py +2 -2
- docling/backend/docling_parse_v4_backend.py +3 -4
- docling/backend/docx/latex/latex_dict.py +0 -5
- docling/backend/docx/latex/omml.py +4 -7
- docling/backend/html_backend.py +26 -9
- docling/backend/md_backend.py +5 -7
- docling/backend/msexcel_backend.py +271 -95
- docling/backend/mspowerpoint_backend.py +4 -7
- docling/backend/msword_backend.py +23 -15
- docling/backend/pdf_backend.py +2 -1
- docling/backend/pypdfium2_backend.py +3 -3
- docling/backend/xml/jats_backend.py +10 -13
- docling/backend/xml/uspto_backend.py +15 -19
- docling/cli/main.py +27 -9
- docling/cli/models.py +2 -3
- docling/datamodel/base_models.py +40 -5
- docling/datamodel/document.py +18 -10
- docling/datamodel/pipeline_options.py +29 -4
- docling/document_converter.py +5 -5
- docling/models/api_vlm_model.py +66 -0
- docling/models/base_model.py +2 -4
- docling/models/base_ocr_model.py +2 -2
- docling/models/code_formula_model.py +2 -1
- docling/models/document_picture_classifier.py +2 -1
- docling/models/easyocr_model.py +10 -11
- docling/models/factories/__init__.py +2 -2
- docling/models/factories/base_factory.py +1 -1
- docling/models/hf_mlx_model.py +4 -6
- docling/models/hf_vlm_model.py +7 -5
- docling/models/layout_model.py +2 -2
- docling/models/ocr_mac_model.py +3 -4
- docling/models/page_assemble_model.py +7 -12
- docling/models/page_preprocessing_model.py +2 -1
- docling/models/picture_description_api_model.py +9 -75
- docling/models/picture_description_base_model.py +16 -5
- docling/models/picture_description_vlm_model.py +2 -3
- docling/models/rapid_ocr_model.py +2 -3
- docling/models/readingorder_model.py +8 -23
- docling/models/table_structure_model.py +2 -6
- docling/models/tesseract_ocr_cli_model.py +17 -16
- docling/models/tesseract_ocr_model.py +8 -6
- docling/pipeline/base_pipeline.py +4 -8
- docling/pipeline/simple_pipeline.py +0 -1
- docling/pipeline/standard_pdf_pipeline.py +6 -3
- docling/pipeline/vlm_pipeline.py +27 -20
- docling/utils/api_image_request.py +61 -0
- docling/utils/export.py +2 -4
- docling/utils/glm_utils.py +2 -2
- docling/utils/layout_postprocessor.py +4 -2
- docling/utils/model_downloader.py +7 -7
- docling/utils/utils.py +1 -1
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/METADATA +4 -3
- docling-2.31.0.dist-info/RECORD +86 -0
- docling-2.29.0.dist-info/RECORD +0 -84
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
docling/models/api_vlm_model.py
ADDED
@@ -0,0 +1,66 @@
+from collections.abc import Iterable
+
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import ApiVlmOptions
+from docling.exceptions import OperationNotAllowed
+from docling.models.base_model import BasePageModel
+from docling.utils.api_image_request import api_image_request
+from docling.utils.profiling import TimeRecorder
+
+
+class ApiVlmModel(BasePageModel):
+    def __init__(
+        self,
+        enabled: bool,
+        enable_remote_services: bool,
+        vlm_options: ApiVlmOptions,
+    ):
+        self.enabled = enabled
+        self.vlm_options = vlm_options
+        if self.enabled:
+            if not enable_remote_services:
+                raise OperationNotAllowed(
+                    "Connections to remote services is only allowed when set explicitly. "
+                    "pipeline_options.enable_remote_services=True, or using the CLI "
+                    "--enable-remote-services."
+                )
+
+            self.timeout = self.vlm_options.timeout
+            self.prompt_content = (
+                f"This is a page from a document.\n{self.vlm_options.prompt}"
+            )
+            self.params = {
+                **self.vlm_options.params,
+                "temperature": 0,
+            }
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    assert hi_res_image is not None
+                    if hi_res_image:
+                        if hi_res_image.mode != "RGB":
+                            hi_res_image = hi_res_image.convert("RGB")
+
+                    page_tags = api_image_request(
+                        image=hi_res_image,
+                        prompt=self.prompt_content,
+                        url=self.vlm_options.url,
+                        timeout=self.timeout,
+                        headers=self.vlm_options.headers,
+                        **self.params,
+                    )
+
+                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+
+                yield page
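For orientation, here is a minimal sketch of how the new remote-VLM page model is meant to be driven from user code. The ApiVlmOptions fields (url, prompt, params, timeout) mirror the attribute accesses visible in the file above; the exact VlmPipelineOptions and PdfFormatOption wiring shown here is an assumption based on docling's existing VLM pipeline, not something this diff confirms.

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ApiVlmOptions, VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Remote calls must be opted into, otherwise ApiVlmModel raises OperationNotAllowed.
pipeline_options = VlmPipelineOptions(enable_remote_services=True)
pipeline_options.vlm_options = ApiVlmOptions(
    url="http://localhost:8000/v1/chat/completions",  # assumed OpenAI-compatible endpoint
    prompt="Convert this page to markdown.",
    params={"model": "my-vision-model"},  # hypothetical; forwarded as request parameters, temperature is forced to 0
    timeout=90,
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
result = converter.convert("report.pdf")  # hypothetical input file
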
docling/models/base_model.py
CHANGED
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
-from
+from collections.abc import Iterable
+from typing import Generic, Optional, Protocol, Type
 
 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
 from typing_extensions import TypeVar
@@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 
 
 class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
-
     elements_batch_size: int = settings.perf.elements_batch_size
 
     @abstractmethod
@@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
 
 
 class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
-
     def prepare_element(
         self, conv_res: ConversionResult, element: NodeItem
     ) -> Optional[NodeItem]:
@@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
 class BaseItemAndImageEnrichmentModel(
     GenericEnrichmentModel[ItemAndImageEnrichmentElement]
 ):
-
     images_scale: float
     expansion_factor: float = 0.0
 
docling/models/base_ocr_model.py
CHANGED
@@ -1,12 +1,12 @@
 import copy
 import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import List, Optional, Type
 
 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label

@@ -1,7 +1,8 @@
 import re
 from collections import Counter
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 from docling_core.types.doc import (
docling/models/easyocr_model.py
CHANGED
@@ -1,8 +1,9 @@
 import logging
 import warnings
 import zipfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import List, Optional, Type
 
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
             device = decide_device(accelerator_options.device)
             # Enable easyocr GPU if running on CUDA, MPS
             use_gpu = any(
-
-
-
-
-                    AcceleratorDevice.MPS.value,
-                ]
+                device.startswith(x)
+                for x in [
+                    AcceleratorDevice.CUDA.value,
+                    AcceleratorDevice.MPS.value,
                 ]
             )
         else:
@@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel):
         progress: bool = False,
     ) -> Path:
         # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
-        from easyocr.config import
-
+        from easyocr.config import (
+            detection_models as det_models_dict,
+            recognition_models as rec_models_dict,
+        )
 
         if local_dir is None:
             local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
@@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return
 
        for page in page_batch:
-
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
docling/models/factories/__init__.py
CHANGED
@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
 logger = logging.getLogger(__name__)
 
 
-@lru_cache
+@lru_cache
 def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     factory = OcrFactory()
     factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     return factory
 
 
-@lru_cache
+@lru_cache
 def get_picture_description_factory(
     allow_external_plugins: bool = False,
 ) -> PictureDescriptionFactory:

docling/models/factories/base_factory.py
CHANGED
@@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
 
     @property
     def registered_kind(self) -> list[str]:
-        return
+        return [opt.kind for opt in self._classes.keys()]
 
     def get_enum(self) -> enum.Enum:
         return enum.Enum(
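A small illustration of the factory API touched here; which kinds are actually listed depends on the OCR plugins installed in the environment, so the printed value below is only indicative.

from docling.models.factories import get_ocr_factory

factory = get_ocr_factory(allow_external_plugins=False)  # result is cached by @lru_cache
print(factory.registered_kind)  # e.g. ['easyocr', 'ocrmac', 'rapidocr', 'tesseract'], depending on installed plugins
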
docling/models/hf_mlx_model.py
CHANGED
@@ -1,25 +1,22 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import Optional
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
-from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
 class HuggingFaceMlxModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,
@@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
         self.vlm_options = vlm_options
 
         if self.enabled:
-
             try:
                 from mlx_vlm import generate, load  # type: ignore
                 from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
@@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
                     generation_time = time.time() - start_time
                     page_tags = output
 
+                    _log.debug(f"Generation time {generation_time:.2f} seconds.")
+
                     # inference_time = time.time() - start_time
                     # tokens_per_second = num_tokens / generation_time
                     # print("")
docling/models/hf_vlm_model.py
CHANGED
@@ -1,16 +1,15 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import Optional
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
@@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
 
 
 class HuggingFaceVlmModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,
@@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
             device = decide_device(accelerator_options.device)
             self.device = device
 
-            _log.debug("Available device for HuggingFace VLM: {}"
+            _log.debug(f"Available device for HuggingFace VLM: {device}")
 
             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
@@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
                     num_tokens = len(generated_ids[0])
                     page_tags = generated_texts
 
+                    _log.debug(
+                        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+                    )
+
                     # inference_time = time.time() - start_time
                     # tokens_per_second = num_tokens / generation_time
                     # print("")
docling/models/layout_model.py
CHANGED
@@ -1,8 +1,9 @@
 import copy
 import logging
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import Optional
 
 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
docling/models/ocr_mac_model.py
CHANGED
@@ -1,8 +1,9 @@
 import logging
 import sys
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import Optional, Type
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
 
         if self.enabled:
             if "darwin" != sys.platform:
-                raise RuntimeError(
+                raise RuntimeError("OcrMac is only supported on Mac.")
             install_errmsg = (
                 "ocrmac is not correctly installed. "
                 "Please install it via `pip install ocrmac` to use this OCR engine. "
@@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         if not self.enabled:
             yield from page_batch
             return
@@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
-
                     ocr_rects = self.get_ocr_rects(page)
 
                     all_ocr_cells = []
docling/models/page_assemble_model.py
CHANGED
@@ -1,6 +1,7 @@
 import logging
 import re
-from
+from collections.abc import Iterable
+from typing import List
 
 from pydantic import BaseModel
 
@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
         sanitized_text = "".join(lines)
 
         # Text normalization
-        sanitized_text = sanitized_text.replace("⁄", "/")
-        sanitized_text = sanitized_text.replace("’", "'")
-        sanitized_text = sanitized_text.replace("‘", "'")
+        sanitized_text = sanitized_text.replace("⁄", "/")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("’", "'")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("‘", "'")  # noqa: RUF001
         sanitized_text = sanitized_text.replace("“", '"')
         sanitized_text = sanitized_text.replace("”", '"')
         sanitized_text = sanitized_text.replace("•", "·")
@@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "page_assemble"):
-
                     assert page.predictions.layout is not None
 
                     # assembles some JSON output page by page.
@@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
                     for cluster in page.predictions.layout.clusters:
                         # _log.info("Cluster label seen:", cluster.label)
                         if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
-
                             textlines = [
                                 cell.text.replace("\x02", "-").strip()
                                 for cell in cluster.cells
@@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
                             tbl = page.predictions.tablestructure.table_map.get(
                                 cluster.id, None
                             )
-                            if
-                                not tbl
-                            ):  # fallback: add table without structure, if it isn't present
+                            if not tbl:  # fallback: add table without structure, if it isn't present
                                 tbl = Table(
                                     label=cluster.label,
                                     id=cluster.id,
@@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
                             fig = page.predictions.figures_classification.figure_map.get(
                                 cluster.id, None
                             )
-                            if
-                                not fig
-                            ):  # fallback: add figure without classification, if it isn't present
+                            if not fig:  # fallback: add figure without classification, if it isn't present
                                 fig = FigureElement(
                                     label=cluster.label,
                                     id=cluster.id,
docling/models/picture_description_api_model.py
CHANGED
@@ -1,12 +1,8 @@
-import
-import io
-import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import Optional, Type, Union
 
-import requests
 from PIL import Image
-from pydantic import BaseModel, ConfigDict
 
 from docling.datamodel.pipeline_options import (
     AcceleratorOptions,
@@ -15,37 +11,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.exceptions import OperationNotAllowed
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
-
-_log = logging.getLogger(__name__)
-
-
-class ChatMessage(BaseModel):
-    role: str
-    content: str
-
-
-class ResponseChoice(BaseModel):
-    index: int
-    message: ChatMessage
-    finish_reason: str
-
-
-class ResponseUsage(BaseModel):
-    prompt_tokens: int
-    completion_tokens: int
-    total_tokens: int
-
-
-class ApiResponse(BaseModel):
-    model_config = ConfigDict(
-        protected_namespaces=(),
-    )
-
-    id: str
-    model: Optional[str] = None  # returned by openai
-    choices: List[ResponseChoice]
-    created: int
-    usage: ResponseUsage
+from docling.utils.api_image_request import api_image_request
 
 
 class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@@ -83,43 +49,11 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
         # Note: technically we could make a batch request here,
         # but not all APIs will allow for it. For example, vllm won't allow more than 1.
         for image in images:
-
-
-
-
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": self.options.prompt,
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/png;base64,{image_base64}"
-                            },
-                        },
-                    ],
-                }
-            ]
-
-            payload = {
-                "messages": messages,
-                **self.options.params,
-            }
-
-            r = requests.post(
-                str(self.options.url),
-                headers=self.options.headers,
-                json=payload,
+            yield api_image_request(
+                image=image,
+                prompt=self.options.prompt,
+                url=self.options.url,
                 timeout=self.options.timeout,
+                headers=self.options.headers,
+                **self.options.params,
             )
-            if not r.ok:
-                _log.error(f"Error calling the API. Reponse was {r.text}")
-                r.raise_for_status()
-
-            api_resp = ApiResponse.model_validate_json(r.text)
-            generated_text = api_resp.choices[0].message.content.strip()
-            yield generated_text
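The shared helper docling/utils/api_image_request.py (+61 lines) that replaces this inlined request code is not itself shown in this diff view. Judging from the call sites above and from the OpenAI-style request and response handling removed here, it presumably looks roughly like the sketch below; treat every detail (signature, defaults, response parsing) as an assumption rather than the actual implementation.

import base64
import io
import logging
from typing import Optional

import requests
from PIL import Image

_log = logging.getLogger(__name__)


def api_image_request(
    image: Image.Image,
    prompt: str,
    url: str,
    timeout: float = 20,
    headers: Optional[dict] = None,
    **params,
) -> str:
    # Encode the page or picture image as a base64 PNG data URI, as the removed code did.
    buf = io.BytesIO()
    image.save(buf, "PNG")
    image_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
            ],
        }
    ]
    payload = {"messages": messages, **params}

    r = requests.post(str(url), headers=headers or {}, json=payload, timeout=timeout)
    if not r.ok:
        _log.error(f"Error calling the API. Response was {r.text}")
        r.raise_for_status()

    # OpenAI-style chat completion: return the first choice's message text.
    return r.json()["choices"][0]["message"]["content"].strip()
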
docling/models/picture_description_base_model.py
CHANGED
@@ -1,12 +1,11 @@
-import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import List, Optional, Type, Union
 
 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
-    PictureClassificationClass,
     PictureItem,
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
@@ -63,8 +62,20 @@ class PictureDescriptionBaseModel(
         elements: List[PictureItem] = []
         for el in element_batch:
             assert isinstance(el.item, PictureItem)
-
-
+            describe_image = True
+            # Don't describe the image if it's smaller than the threshold
+            if len(el.item.prov) > 0:
+                prov = el.item.prov[0]  # PictureItems have at most a single provenance
+                page = doc.pages.get(prov.page_no)
+                if page is not None:
+                    page_area = page.size.width * page.size.height
+                    if page_area > 0:
+                        area_fraction = prov.bbox.area() / page_area
+                        if area_fraction < self.options.picture_area_threshold:
+                            describe_image = False
+            if describe_image:
+                elements.append(el.item)
+                images.append(el.image)
 
         outputs = self._annotate_images(images)
 
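The new filter compares each picture's provenance bounding box against the page area before queueing it for description: on a 612x792 pt US-Letter page (area 484,704), a 100x80 pt figure covers 8,000 / 484,704, roughly 1.7% of the page, so it is skipped under a threshold of 0.05 but still described under a threshold of 0.01. A hedged configuration sketch follows; picture_area_threshold is taken from the diff above, while its default value and its exposure as a constructor field on the concrete options classes are assumptions.

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(do_picture_description=True)
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",  # example local VLM, assumed available
    prompt="Describe this picture in one or two sentences.",
    picture_area_threshold=0.05,  # assumed field; skip pictures covering less than 5% of the page
)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
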
docling/models/picture_description_vlm_model.py
CHANGED
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import Optional, Type, Union
 
 from PIL import Image
 
@@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
 
 
 class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
-
     @classmethod
     def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
         return PictureDescriptionVlmOptions
@@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
         self.options: PictureDescriptionVlmOptions
 
         if self.enabled:
-
             if artifacts_path is None:
                 artifacts_path = self.download_models(repo_id=self.options.repo_id)
             else:
docling/models/rapid_ocr_model.py
CHANGED
@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import
+from typing import Optional, Type
 
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         if not self.enabled:
             yield from page_batch
             return
 
         for page in page_batch:
-
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
85
|
yield page
|