docling 2.30.0__py3-none-any.whl → 2.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +7 -15
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +2 -2
- docling/backend/docling_parse_v2_backend.py +2 -2
- docling/backend/docling_parse_v4_backend.py +3 -4
- docling/backend/docx/latex/latex_dict.py +0 -5
- docling/backend/docx/latex/omml.py +4 -7
- docling/backend/html_backend.py +26 -9
- docling/backend/md_backend.py +5 -7
- docling/backend/msexcel_backend.py +1 -7
- docling/backend/mspowerpoint_backend.py +4 -7
- docling/backend/msword_backend.py +4 -4
- docling/backend/pdf_backend.py +2 -1
- docling/backend/pypdfium2_backend.py +3 -3
- docling/backend/xml/jats_backend.py +10 -13
- docling/backend/xml/uspto_backend.py +15 -19
- docling/cli/main.py +7 -7
- docling/cli/models.py +2 -3
- docling/datamodel/base_models.py +7 -5
- docling/datamodel/document.py +11 -10
- docling/datamodel/pipeline_options.py +0 -1
- docling/document_converter.py +5 -5
- docling/models/api_vlm_model.py +1 -2
- docling/models/base_model.py +2 -4
- docling/models/base_ocr_model.py +2 -2
- docling/models/code_formula_model.py +2 -1
- docling/models/document_picture_classifier.py +2 -1
- docling/models/easyocr_model.py +10 -11
- docling/models/factories/__init__.py +2 -2
- docling/models/factories/base_factory.py +1 -1
- docling/models/hf_mlx_model.py +4 -6
- docling/models/hf_vlm_model.py +7 -5
- docling/models/layout_model.py +2 -2
- docling/models/ocr_mac_model.py +3 -4
- docling/models/page_assemble_model.py +7 -12
- docling/models/page_preprocessing_model.py +2 -1
- docling/models/picture_description_api_model.py +2 -1
- docling/models/picture_description_base_model.py +2 -3
- docling/models/picture_description_vlm_model.py +2 -3
- docling/models/rapid_ocr_model.py +2 -3
- docling/models/readingorder_model.py +8 -23
- docling/models/table_structure_model.py +2 -6
- docling/models/tesseract_ocr_cli_model.py +17 -16
- docling/models/tesseract_ocr_model.py +8 -6
- docling/pipeline/base_pipeline.py +4 -8
- docling/pipeline/simple_pipeline.py +0 -1
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/pipeline/vlm_pipeline.py +0 -3
- docling/utils/export.py +2 -4
- docling/utils/glm_utils.py +2 -2
- docling/utils/layout_postprocessor.py +4 -2
- docling/utils/model_downloader.py +7 -7
- docling/utils/utils.py +1 -1
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/METADATA +2 -1
- docling-2.31.0.dist-info/RECORD +86 -0
- docling-2.30.0.dist-info/RECORD +0 -86
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
docling/models/hf_mlx_model.py
CHANGED
@@ -1,25 +1,22 @@
|
|
1
1
|
import logging
|
2
2
|
import time
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from pathlib import Path
|
4
|
-
from typing import
|
5
|
+
from typing import Optional
|
5
6
|
|
6
7
|
from docling.datamodel.base_models import Page, VlmPrediction
|
7
8
|
from docling.datamodel.document import ConversionResult
|
8
9
|
from docling.datamodel.pipeline_options import (
|
9
|
-
AcceleratorDevice,
|
10
10
|
AcceleratorOptions,
|
11
11
|
HuggingFaceVlmOptions,
|
12
12
|
)
|
13
|
-
from docling.datamodel.settings import settings
|
14
13
|
from docling.models.base_model import BasePageModel
|
15
|
-
from docling.utils.accelerator_utils import decide_device
|
16
14
|
from docling.utils.profiling import TimeRecorder
|
17
15
|
|
18
16
|
_log = logging.getLogger(__name__)
|
19
17
|
|
20
18
|
|
21
19
|
class HuggingFaceMlxModel(BasePageModel):
|
22
|
-
|
23
20
|
def __init__(
|
24
21
|
self,
|
25
22
|
enabled: bool,
|
@@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
|
|
32
29
|
self.vlm_options = vlm_options
|
33
30
|
|
34
31
|
if self.enabled:
|
35
|
-
|
36
32
|
try:
|
37
33
|
from mlx_vlm import generate, load # type: ignore
|
38
34
|
from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
|
@@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
|
|
125
121
|
generation_time = time.time() - start_time
|
126
122
|
page_tags = output
|
127
123
|
|
124
|
+
_log.debug(f"Generation time {generation_time:.2f} seconds.")
|
125
|
+
|
128
126
|
# inference_time = time.time() - start_time
|
129
127
|
# tokens_per_second = num_tokens / generation_time
|
130
128
|
# print("")
|
docling/models/hf_vlm_model.py
CHANGED
@@ -1,16 +1,15 @@
|
|
1
1
|
import logging
|
2
2
|
import time
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from pathlib import Path
|
4
|
-
from typing import
|
5
|
+
from typing import Optional
|
5
6
|
|
6
7
|
from docling.datamodel.base_models import Page, VlmPrediction
|
7
8
|
from docling.datamodel.document import ConversionResult
|
8
9
|
from docling.datamodel.pipeline_options import (
|
9
|
-
AcceleratorDevice,
|
10
10
|
AcceleratorOptions,
|
11
11
|
HuggingFaceVlmOptions,
|
12
12
|
)
|
13
|
-
from docling.datamodel.settings import settings
|
14
13
|
from docling.models.base_model import BasePageModel
|
15
14
|
from docling.utils.accelerator_utils import decide_device
|
16
15
|
from docling.utils.profiling import TimeRecorder
|
@@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
|
|
19
18
|
|
20
19
|
|
21
20
|
class HuggingFaceVlmModel(BasePageModel):
|
22
|
-
|
23
21
|
def __init__(
|
24
22
|
self,
|
25
23
|
enabled: bool,
|
@@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
|
|
42
40
|
device = decide_device(accelerator_options.device)
|
43
41
|
self.device = device
|
44
42
|
|
45
|
-
_log.debug("Available device for HuggingFace VLM: {}"
|
43
|
+
_log.debug(f"Available device for HuggingFace VLM: {device}")
|
46
44
|
|
47
45
|
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
|
48
46
|
|
@@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
|
|
168
166
|
num_tokens = len(generated_ids[0])
|
169
167
|
page_tags = generated_texts
|
170
168
|
|
169
|
+
_log.debug(
|
170
|
+
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
|
171
|
+
)
|
172
|
+
|
171
173
|
# inference_time = time.time() - start_time
|
172
174
|
# tokens_per_second = num_tokens / generation_time
|
173
175
|
# print("")
|
docling/models/layout_model.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
import copy
|
2
2
|
import logging
|
3
3
|
import warnings
|
4
|
+
from collections.abc import Iterable
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
+
from typing import Optional
|
6
7
|
|
7
8
|
from docling_core.types.doc import DocItemLabel
|
8
9
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
@@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
|
|
142
143
|
def __call__(
|
143
144
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
144
145
|
) -> Iterable[Page]:
|
145
|
-
|
146
146
|
for page in page_batch:
|
147
147
|
assert page._backend is not None
|
148
148
|
if not page._backend.is_valid():
|
docling/models/ocr_mac_model.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import sys
|
3
3
|
import tempfile
|
4
|
+
from collections.abc import Iterable
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
+
from typing import Optional, Type
|
6
7
|
|
7
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
8
9
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
|
|
41
42
|
|
42
43
|
if self.enabled:
|
43
44
|
if "darwin" != sys.platform:
|
44
|
-
raise RuntimeError(
|
45
|
+
raise RuntimeError("OcrMac is only supported on Mac.")
|
45
46
|
install_errmsg = (
|
46
47
|
"ocrmac is not correctly installed. "
|
47
48
|
"Please install it via `pip install ocrmac` to use this OCR engine. "
|
@@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
|
|
58
59
|
def __call__(
|
59
60
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
60
61
|
) -> Iterable[Page]:
|
61
|
-
|
62
62
|
if not self.enabled:
|
63
63
|
yield from page_batch
|
64
64
|
return
|
@@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
|
|
69
69
|
yield page
|
70
70
|
else:
|
71
71
|
with TimeRecorder(conv_res, "ocr"):
|
72
|
-
|
73
72
|
ocr_rects = self.get_ocr_rects(page)
|
74
73
|
|
75
74
|
all_ocr_cells = []
|
docling/models/page_assemble_model.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
|
-
from
|
3
|
+
from collections.abc import Iterable
|
4
|
+
from typing import List
|
4
5
|
|
5
6
|
from pydantic import BaseModel
|
6
7
|
|
@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
|
|
53
54
|
sanitized_text = "".join(lines)
|
54
55
|
|
55
56
|
# Text normalization
|
56
|
-
sanitized_text = sanitized_text.replace("⁄", "/")
|
57
|
-
sanitized_text = sanitized_text.replace("’", "'")
|
58
|
-
sanitized_text = sanitized_text.replace("‘", "'")
|
57
|
+
sanitized_text = sanitized_text.replace("⁄", "/") # noqa: RUF001
|
58
|
+
sanitized_text = sanitized_text.replace("’", "'") # noqa: RUF001
|
59
|
+
sanitized_text = sanitized_text.replace("‘", "'") # noqa: RUF001
|
59
60
|
sanitized_text = sanitized_text.replace("“", '"')
|
60
61
|
sanitized_text = sanitized_text.replace("”", '"')
|
61
62
|
sanitized_text = sanitized_text.replace("•", "·")
|
@@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
|
|
71
72
|
yield page
|
72
73
|
else:
|
73
74
|
with TimeRecorder(conv_res, "page_assemble"):
|
74
|
-
|
75
75
|
assert page.predictions.layout is not None
|
76
76
|
|
77
77
|
# assembles some JSON output page by page.
|
@@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
|
|
83
83
|
for cluster in page.predictions.layout.clusters:
|
84
84
|
# _log.info("Cluster label seen:", cluster.label)
|
85
85
|
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
86
|
-
|
87
86
|
textlines = [
|
88
87
|
cell.text.replace("\x02", "-").strip()
|
89
88
|
for cell in cluster.cells
|
@@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
|
|
109
108
|
tbl = page.predictions.tablestructure.table_map.get(
|
110
109
|
cluster.id, None
|
111
110
|
)
|
112
|
-
if
|
113
|
-
not tbl
|
114
|
-
): # fallback: add table without structure, if it isn't present
|
111
|
+
if not tbl: # fallback: add table without structure, if it isn't present
|
115
112
|
tbl = Table(
|
116
113
|
label=cluster.label,
|
117
114
|
id=cluster.id,
|
@@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
|
|
130
127
|
fig = page.predictions.figures_classification.figure_map.get(
|
131
128
|
cluster.id, None
|
132
129
|
)
|
133
|
-
if
|
134
|
-
not fig
|
135
|
-
): # fallback: add figure without classification, if it isn't present
|
130
|
+
if not fig: # fallback: add figure without classification, if it isn't present
|
136
131
|
fig = FigureElement(
|
137
132
|
label=cluster.label,
|
138
133
|
id=cluster.id,
|
docling/models/picture_description_base_model.py
CHANGED
@@ -1,12 +1,11 @@
|
|
1
|
-
import logging
|
2
1
|
from abc import abstractmethod
|
2
|
+
from collections.abc import Iterable
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import
|
4
|
+
from typing import List, Optional, Type, Union
|
5
5
|
|
6
6
|
from docling_core.types.doc import (
|
7
7
|
DoclingDocument,
|
8
8
|
NodeItem,
|
9
|
-
PictureClassificationClass,
|
10
9
|
PictureItem,
|
11
10
|
)
|
12
11
|
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
|
docling/models/picture_description_vlm_model.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
+
from collections.abc import Iterable
|
1
2
|
from pathlib import Path
|
2
|
-
from typing import
|
3
|
+
from typing import Optional, Type, Union
|
3
4
|
|
4
5
|
from PIL import Image
|
5
6
|
|
@@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
|
|
13
14
|
|
14
15
|
|
15
16
|
class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
16
|
-
|
17
17
|
@classmethod
|
18
18
|
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
|
19
19
|
return PictureDescriptionVlmOptions
|
@@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
|
36
36
|
self.options: PictureDescriptionVlmOptions
|
37
37
|
|
38
38
|
if self.enabled:
|
39
|
-
|
40
39
|
if artifacts_path is None:
|
41
40
|
artifacts_path = self.download_models(repo_id=self.options.repo_id)
|
42
41
|
else:
|
docling/models/rapid_ocr_model.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import logging
|
2
|
+
from collections.abc import Iterable
|
2
3
|
from pathlib import Path
|
3
|
-
from typing import
|
4
|
+
from typing import Optional, Type
|
4
5
|
|
5
6
|
import numpy
|
6
7
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
@@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
|
|
74
75
|
def __call__(
|
75
76
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
76
77
|
) -> Iterable[Page]:
|
77
|
-
|
78
78
|
if not self.enabled:
|
79
79
|
yield from page_batch
|
80
80
|
return
|
81
81
|
|
82
82
|
for page in page_batch:
|
83
|
-
|
84
83
|
assert page._backend is not None
|
85
84
|
if not page._backend.is_valid():
|
86
85
|
yield page
|
docling/models/readingorder_model.py
CHANGED
@@ -1,12 +1,7 @@
|
|
1
|
-
import copy
|
2
|
-
import random
|
3
1
|
from pathlib import Path
|
4
2
|
from typing import Dict, List
|
5
3
|
|
6
4
|
from docling_core.types.doc import (
|
7
|
-
BoundingBox,
|
8
|
-
CoordOrigin,
|
9
|
-
DocItem,
|
10
5
|
DocItemLabel,
|
11
6
|
DoclingDocument,
|
12
7
|
DocumentOrigin,
|
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
|
|
17
12
|
TableData,
|
18
13
|
)
|
19
14
|
from docling_core.types.doc.document import ContentLayer
|
20
|
-
from docling_core.types.legacy_doc.base import Ref
|
21
|
-
from docling_core.types.legacy_doc.document import BaseText
|
22
15
|
from docling_ibm_models.reading_order.reading_order_rb import (
|
23
16
|
PageElement as ReadingOrderPageElement,
|
17
|
+
ReadingOrderPredictor,
|
24
18
|
)
|
25
|
-
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
|
26
|
-
from PIL import ImageDraw
|
27
19
|
from pydantic import BaseModel, ConfigDict
|
28
20
|
|
29
21
|
from docling.datamodel.base_models import (
|
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
|
|
35
27
|
TextElement,
|
36
28
|
)
|
37
29
|
from docling.datamodel.document import ConversionResult
|
38
|
-
from docling.datamodel.settings import settings
|
39
30
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
40
31
|
|
41
32
|
|
@@ -53,12 +44,10 @@ class ReadingOrderModel:
|
|
53
44
|
def _assembled_to_readingorder_elements(
|
54
45
|
self, conv_res: ConversionResult
|
55
46
|
) -> List[ReadingOrderPageElement]:
|
56
|
-
|
57
47
|
elements: List[ReadingOrderPageElement] = []
|
58
48
|
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
|
59
49
|
|
60
50
|
for element in conv_res.assembled.elements:
|
61
|
-
|
62
51
|
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
|
63
52
|
bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
|
64
53
|
text = element.text or ""
|
@@ -84,7 +73,6 @@ class ReadingOrderModel:
|
|
84
73
|
def _add_child_elements(
|
85
74
|
self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
|
86
75
|
):
|
87
|
-
|
88
76
|
child: Cluster
|
89
77
|
for child in element.cluster.children:
|
90
78
|
c_label = child.label
|
@@ -110,7 +98,7 @@ class ReadingOrderModel:
|
|
110
98
|
else:
|
111
99
|
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
|
112
100
|
|
113
|
-
def _readingorder_elements_to_docling_doc(
|
101
|
+
def _readingorder_elements_to_docling_doc( # noqa: C901
|
114
102
|
self,
|
115
103
|
conv_res: ConversionResult,
|
116
104
|
ro_elements: List[ReadingOrderPageElement],
|
@@ -118,7 +106,6 @@ class ReadingOrderModel:
|
|
118
106
|
el_to_footnotes_mapping: Dict[int, List[int]],
|
119
107
|
el_merges_mapping: Dict[int, List[int]],
|
120
108
|
) -> DoclingDocument:
|
121
|
-
|
122
109
|
id_to_elem = {
|
123
110
|
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
|
124
111
|
for elem in conv_res.assembled.elements
|
@@ -192,7 +179,6 @@ class ReadingOrderModel:
|
|
192
179
|
|
193
180
|
code_item.footnotes.append(new_footnote_item.get_ref())
|
194
181
|
else:
|
195
|
-
|
196
182
|
new_item, current_list = self._handle_text_element(
|
197
183
|
element, out_doc, current_list, page_height
|
198
184
|
)
|
@@ -206,7 +192,6 @@ class ReadingOrderModel:
|
|
206
192
|
)
|
207
193
|
|
208
194
|
elif isinstance(element, Table):
|
209
|
-
|
210
195
|
tbl_data = TableData(
|
211
196
|
num_rows=element.num_rows,
|
212
197
|
num_cols=element.num_cols,
|
@@ -342,12 +327,12 @@ class ReadingOrderModel:
|
|
342
327
|
return new_item, current_list
|
343
328
|
|
344
329
|
def _merge_elements(self, element, merged_elem, new_item, page_height):
|
345
|
-
assert isinstance(
|
346
|
-
|
347
|
-
)
|
348
|
-
assert (
|
349
|
-
|
350
|
-
)
|
330
|
+
assert isinstance(merged_elem, type(element)), (
|
331
|
+
"Merged element must be of same type as element."
|
332
|
+
)
|
333
|
+
assert merged_elem.label == new_item.label, (
|
334
|
+
"Labels of merged elements must match."
|
335
|
+
)
|
351
336
|
prov = ProvenanceItem(
|
352
337
|
page_no=element.page_no + 1,
|
353
338
|
charspan=(
|
docling/models/table_structure_model.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
import copy
|
2
2
|
import warnings
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from pathlib import Path
|
4
|
-
from typing import
|
5
|
+
from typing import Optional
|
5
6
|
|
6
7
|
import numpy
|
7
8
|
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
8
9
|
from docling_core.types.doc.page import (
|
9
10
|
BoundingRectangle,
|
10
|
-
SegmentedPdfPage,
|
11
11
|
TextCellUnit,
|
12
12
|
)
|
13
13
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
@@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
|
|
44
44
|
|
45
45
|
self.enabled = enabled
|
46
46
|
if self.enabled:
|
47
|
-
|
48
47
|
if artifacts_path is None:
|
49
48
|
artifacts_path = self.download_models() / self._model_path
|
50
49
|
else:
|
@@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
|
|
175
174
|
def __call__(
|
176
175
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
177
176
|
) -> Iterable[Page]:
|
178
|
-
|
179
177
|
if not self.enabled:
|
180
178
|
yield from page_batch
|
181
179
|
return
|
@@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
|
|
186
184
|
yield page
|
187
185
|
else:
|
188
186
|
with TimeRecorder(conv_res, "table_structure"):
|
189
|
-
|
190
187
|
assert page.predictions.layout is not None
|
191
188
|
assert page.size is not None
|
192
189
|
|
@@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
|
|
260
257
|
table_out = tf_output[0]
|
261
258
|
table_cells = []
|
262
259
|
for element in table_out["tf_responses"]:
|
263
|
-
|
264
260
|
if not self.do_cell_matching:
|
265
261
|
the_bbox = BoundingBox.model_validate(
|
266
262
|
element["bbox"]
|
docling/models/tesseract_ocr_cli_model.py
CHANGED
@@ -3,9 +3,10 @@ import io
|
|
3
3
|
import logging
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
+
from collections.abc import Iterable
|
6
7
|
from pathlib import Path
|
7
8
|
from subprocess import DEVNULL, PIPE, Popen
|
8
|
-
from typing import
|
9
|
+
from typing import List, Optional, Tuple, Type
|
9
10
|
|
10
11
|
import pandas as pd
|
11
12
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
63
64
|
)
|
64
65
|
|
65
66
|
def _get_name_and_version(self) -> Tuple[str, str]:
|
66
|
-
|
67
|
-
if self._name != None and self._version != None:
|
67
|
+
if self._name is not None and self._version is not None:
|
68
68
|
return self._name, self._version # type: ignore
|
69
69
|
|
70
70
|
cmd = [self.options.tesseract_cmd, "--version"]
|
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
125
125
|
# _log.info(decoded_data)
|
126
126
|
|
127
127
|
# Read the TSV file generated by Tesseract
|
128
|
-
|
128
|
+
df_result = pd.read_csv(
|
129
|
+
io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
|
130
|
+
)
|
129
131
|
|
130
132
|
# Display the dataframe (optional)
|
131
133
|
# _log.info("df: ", df.head())
|
132
134
|
|
133
135
|
# Filter rows that contain actual text (ignore header or empty rows)
|
134
|
-
df_filtered =
|
135
|
-
|
136
|
+
df_filtered = df_result[
|
137
|
+
df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
|
136
138
|
]
|
137
139
|
|
138
140
|
return df_filtered
|
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
149
151
|
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
150
152
|
output, _ = proc.communicate()
|
151
153
|
decoded_data = output.decode("utf-8")
|
152
|
-
|
154
|
+
df_detected = pd.read_csv(
|
153
155
|
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
154
156
|
)
|
155
|
-
scripts =
|
157
|
+
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
|
156
158
|
if len(scripts) == 0:
|
157
159
|
_log.warning("Tesseract cannot detect the script of the page")
|
158
160
|
return None
|
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
183
185
|
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
184
186
|
output, _ = proc.communicate()
|
185
187
|
decoded_data = output.decode("utf-8")
|
186
|
-
|
187
|
-
self._tesseract_languages =
|
188
|
+
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
189
|
+
self._tesseract_languages = df_list[0].tolist()[1:]
|
188
190
|
|
189
191
|
# Decide the script prefix
|
190
|
-
if any(
|
192
|
+
if any(lang.startswith("script/") for lang in self._tesseract_languages):
|
191
193
|
script_prefix = "script/"
|
192
194
|
else:
|
193
195
|
script_prefix = ""
|
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
197
199
|
def __call__(
|
198
200
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
199
201
|
) -> Iterable[Page]:
|
200
|
-
|
201
202
|
if not self.enabled:
|
202
203
|
yield from page_batch
|
203
204
|
return
|
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
225
226
|
fname = image_file.name
|
226
227
|
high_res_image.save(image_file)
|
227
228
|
|
228
|
-
|
229
|
+
df_result = self._run_tesseract(fname)
|
229
230
|
finally:
|
230
231
|
if os.path.exists(fname):
|
231
232
|
os.remove(fname)
|
232
233
|
|
233
|
-
# _log.info(
|
234
|
+
# _log.info(df_result)
|
234
235
|
|
235
236
|
# Print relevant columns (bounding box and text)
|
236
|
-
for ix, row in
|
237
|
+
for ix, row in df_result.iterrows():
|
237
238
|
text = row["text"]
|
238
239
|
conf = row["conf"]
|
239
240
|
|
240
|
-
l = float(row["left"])
|
241
|
+
l = float(row["left"]) # noqa: E741
|
241
242
|
b = float(row["top"])
|
242
243
|
w = float(row["width"])
|
243
244
|
h = float(row["height"])
|
docling/models/tesseract_ocr_model.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import logging
|
2
|
+
from collections.abc import Iterable
|
2
3
|
from pathlib import Path
|
3
|
-
from typing import
|
4
|
+
from typing import Optional, Type
|
4
5
|
|
5
6
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
6
7
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
@@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
|
|
37
38
|
self.options: TesseractOcrOptions
|
38
39
|
|
39
40
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
40
|
-
self.reader = None
|
41
|
-
self.osd_reader = None
|
42
|
-
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
43
41
|
|
44
42
|
if self.enabled:
|
45
43
|
install_errmsg = (
|
@@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
64
62
|
raise ImportError(install_errmsg)
|
65
63
|
try:
|
66
64
|
tesseract_version = tesserocr.tesseract_version()
|
67
|
-
except:
|
65
|
+
except Exception:
|
68
66
|
raise ImportError(install_errmsg)
|
69
67
|
|
70
68
|
_, self._tesserocr_languages = tesserocr.get_languages()
|
@@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
75
73
|
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
76
74
|
lang = "+".join(self.options.lang)
|
77
75
|
|
78
|
-
if any(
|
76
|
+
if any(lang.startswith("script/") for lang in self._tesserocr_languages):
|
79
77
|
self.script_prefix = "script/"
|
80
78
|
else:
|
81
79
|
self.script_prefix = ""
|
@@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
|
|
86
84
|
"oem": tesserocr.OEM.DEFAULT,
|
87
85
|
}
|
88
86
|
|
87
|
+
self.reader = None
|
88
|
+
self.osd_reader = None
|
89
|
+
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
90
|
+
|
89
91
|
if self.options.path is not None:
|
90
92
|
tesserocr_kwargs["path"] = self.options.path
|
91
93
|
|
docling/pipeline/base_pipeline.py
CHANGED
@@ -3,9 +3,10 @@ import logging
|
|
3
3
|
import time
|
4
4
|
import traceback
|
5
5
|
from abc import ABC, abstractmethod
|
6
|
-
from
|
6
|
+
from collections.abc import Iterable
|
7
|
+
from typing import Any, Callable, List
|
7
8
|
|
8
|
-
from docling_core.types.doc import
|
9
|
+
from docling_core.types.doc import NodeItem
|
9
10
|
|
10
11
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
11
12
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
@@ -64,7 +65,6 @@ class BasePipeline(ABC):
|
|
64
65
|
return conv_res
|
65
66
|
|
66
67
|
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
67
|
-
|
68
68
|
def _prepare_elements(
|
69
69
|
conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
|
70
70
|
) -> Iterable[NodeItem]:
|
@@ -113,7 +113,6 @@ class BasePipeline(ABC):
|
|
113
113
|
|
114
114
|
|
115
115
|
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
116
|
-
|
117
116
|
def __init__(self, pipeline_options: PipelineOptions):
|
118
117
|
super().__init__(pipeline_options)
|
119
118
|
self.keep_backend = False
|
@@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
127
126
|
yield from page_batch
|
128
127
|
|
129
128
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
130
|
-
|
131
129
|
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
|
132
130
|
raise RuntimeError(
|
133
131
|
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
|
@@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
139
137
|
|
140
138
|
total_elapsed_time = 0.0
|
141
139
|
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
142
|
-
|
143
|
-
for i in range(0, conv_res.input.page_count):
|
140
|
+
for i in range(conv_res.input.page_count):
|
144
141
|
start_page, end_page = conv_res.input.limits.page_range
|
145
142
|
if (start_page - 1) <= i <= (end_page - 1):
|
146
143
|
conv_res.pages.append(Page(page_no=i))
|
@@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
161
158
|
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
162
159
|
|
163
160
|
for p in pipeline_pages: # Must exhaust!
|
164
|
-
|
165
161
|
# Cleanup cached images
|
166
162
|
if not self.keep_images:
|
167
163
|
p._image_cache = {}
|
docling/pipeline/simple_pipeline.py
CHANGED
@@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
|
|
24
24
|
super().__init__(pipeline_options)
|
25
25
|
|
26
26
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
27
|
-
|
28
27
|
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
|
29
28
|
raise RuntimeError(
|
30
29
|
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
|