docling 2.16.0__py3-none-any.whl → 2.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +3 -2
- docling/backend/md_backend.py +4 -8
- docling/backend/xml/uspto_backend.py +25 -25
- docling/cli/main.py +18 -3
- docling/datamodel/document.py +2 -0
- docling/datamodel/pipeline_options.py +1 -0
- docling/models/rapid_ocr_model.py +1 -0
- docling/models/tesseract_ocr_cli_model.py +72 -4
- docling/models/tesseract_ocr_model.py +37 -37
- docling/utils/ocr_utils.py +9 -0
- {docling-2.16.0.dist-info → docling-2.17.0.dist-info}/METADATA +13 -10
- {docling-2.16.0.dist-info → docling-2.17.0.dist-info}/RECORD +15 -14
- {docling-2.16.0.dist-info → docling-2.17.0.dist-info}/LICENSE +0 -0
- {docling-2.16.0.dist-info → docling-2.17.0.dist-info}/WHEEL +0 -0
- {docling-2.16.0.dist-info → docling-2.17.0.dist-info}/entry_points.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -78,10 +78,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
78
78
|
|
79
79
|
if self.is_valid():
|
80
80
|
assert self.soup is not None
|
81
|
+
content = self.soup.body or self.soup
|
81
82
|
# Replace <br> tags with newline characters
|
82
|
-
for br in self.soup.body.find_all("br"):
|
83
|
+
for br in content.find_all("br"):
|
83
84
|
br.replace_with("\n")
|
84
|
-
doc = self.walk(self.soup.body, doc)
|
85
|
+
doc = self.walk(content, doc)
|
85
86
|
else:
|
86
87
|
raise RuntimeError(
|
87
88
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
docling/backend/md_backend.py
CHANGED
@@ -65,7 +65,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
65
65
|
|
66
66
|
self.in_table = False
|
67
67
|
self.md_table_buffer: list[str] = []
|
68
|
-
self.inline_text_buffer = ""
|
68
|
+
self.inline_texts: list[str] = []
|
69
69
|
|
70
70
|
try:
|
71
71
|
if isinstance(self.path_or_stream, BytesIO):
|
@@ -152,15 +152,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
152
152
|
def process_inline_text(
|
153
153
|
self, parent_element: Optional[NodeItem], doc: DoclingDocument
|
154
154
|
):
|
155
|
-
|
156
|
-
txt = self.inline_text_buffer.strip()
|
155
|
+
txt = " ".join(self.inline_texts)
|
157
156
|
if len(txt) > 0:
|
158
157
|
doc.add_text(
|
159
158
|
label=DocItemLabel.PARAGRAPH,
|
160
159
|
parent=parent_element,
|
161
160
|
text=txt,
|
162
161
|
)
|
163
|
-
self.inline_text_buffer = ""
|
162
|
+
self.inline_texts = []
|
164
163
|
|
165
164
|
def iterate_elements(
|
166
165
|
self,
|
@@ -266,9 +265,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
266
265
|
self.close_table(doc)
|
267
266
|
self.in_table = False
|
268
267
|
# most likely just inline text
|
269
|
-
self.inline_text_buffer += str(
|
270
|
-
element.children
|
271
|
-
) # do not strip an inline text, as it may contain important spaces
|
268
|
+
self.inline_texts.append(str(element.children))
|
272
269
|
|
273
270
|
elif isinstance(element, marko.inline.CodeSpan):
|
274
271
|
self.close_table(doc)
|
@@ -292,7 +289,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
292
289
|
doc.add_code(parent=parent_element, text=snippet_text)
|
293
290
|
|
294
291
|
elif isinstance(element, marko.inline.LineBreak):
|
295
|
-
self.process_inline_text(parent_element, doc)
|
296
292
|
if self.in_table:
|
297
293
|
_log.debug("Line break in a table")
|
298
294
|
self.md_table_buffer.append("")
|
@@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
|
|
389
389
|
if name == self.Element.TITLE.value:
|
390
390
|
if text:
|
391
391
|
self.parents[self.level + 1] = self.doc.add_title(
|
392
|
-
parent=self.parents[self.level],
|
392
|
+
parent=self.parents[self.level],
|
393
393
|
text=text,
|
394
394
|
)
|
395
395
|
self.level += 1
|
@@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
|
|
406
406
|
abstract_item = self.doc.add_heading(
|
407
407
|
heading_text,
|
408
408
|
level=heading_level,
|
409
|
-
parent=self.parents[heading_level],
|
409
|
+
parent=self.parents[heading_level],
|
410
410
|
)
|
411
411
|
self.doc.add_text(
|
412
412
|
label=DocItemLabel.PARAGRAPH,
|
@@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
|
|
434
434
|
claims_item = self.doc.add_heading(
|
435
435
|
heading_text,
|
436
436
|
level=heading_level,
|
437
|
-
parent=self.parents[heading_level],
|
437
|
+
parent=self.parents[heading_level],
|
438
438
|
)
|
439
439
|
for text in self.claims:
|
440
440
|
self.doc.add_text(
|
@@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
|
|
452
452
|
self.doc.add_text(
|
453
453
|
label=DocItemLabel.PARAGRAPH,
|
454
454
|
text=text,
|
455
|
-
parent=self.parents[self.level],
|
455
|
+
parent=self.parents[self.level],
|
456
456
|
)
|
457
457
|
self.text = ""
|
458
458
|
|
@@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
|
|
460
460
|
self.parents[self.level + 1] = self.doc.add_heading(
|
461
461
|
text=text,
|
462
462
|
level=self.level,
|
463
|
-
parent=self.parents[self.level],
|
463
|
+
parent=self.parents[self.level],
|
464
464
|
)
|
465
465
|
self.level += 1
|
466
466
|
self.text = ""
|
@@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
|
|
470
470
|
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
471
471
|
self.doc.add_table(
|
472
472
|
data=empty_table,
|
473
|
-
parent=self.parents[self.level],
|
473
|
+
parent=self.parents[self.level],
|
474
474
|
)
|
475
475
|
|
476
476
|
def _apply_style(self, text: str, style_tag: str) -> str:
|
@@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
721
721
|
if self.Element.TITLE.value in self.property and text.strip():
|
722
722
|
title = text.strip()
|
723
723
|
self.parents[self.level + 1] = self.doc.add_title(
|
724
|
-
parent=self.parents[self.level],
|
724
|
+
parent=self.parents[self.level],
|
725
725
|
text=title,
|
726
726
|
)
|
727
727
|
self.level += 1
|
@@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
749
749
|
self.parents[self.level + 1] = self.doc.add_heading(
|
750
750
|
text=text.strip(),
|
751
751
|
level=self.level,
|
752
|
-
parent=self.parents[self.level],
|
752
|
+
parent=self.parents[self.level],
|
753
753
|
)
|
754
754
|
self.level += 1
|
755
755
|
|
@@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
769
769
|
claims_item = self.doc.add_heading(
|
770
770
|
heading_text,
|
771
771
|
level=heading_level,
|
772
|
-
parent=self.parents[heading_level],
|
772
|
+
parent=self.parents[heading_level],
|
773
773
|
)
|
774
774
|
for text in self.claims:
|
775
775
|
self.doc.add_text(
|
@@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
787
787
|
abstract_item = self.doc.add_heading(
|
788
788
|
heading_text,
|
789
789
|
level=heading_level,
|
790
|
-
parent=self.parents[heading_level],
|
790
|
+
parent=self.parents[heading_level],
|
791
791
|
)
|
792
792
|
self.doc.add_text(
|
793
793
|
label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
|
@@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
799
799
|
self.doc.add_text(
|
800
800
|
label=DocItemLabel.PARAGRAPH,
|
801
801
|
text=paragraph,
|
802
|
-
parent=self.parents[self.level],
|
802
|
+
parent=self.parents[self.level],
|
803
803
|
)
|
804
804
|
elif self.Element.CLAIM.value in self.property:
|
805
805
|
# we may need a space after a paragraph in claim text
|
@@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
811
811
|
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
812
812
|
self.doc.add_table(
|
813
813
|
data=empty_table,
|
814
|
-
parent=self.parents[self.level],
|
814
|
+
parent=self.parents[self.level],
|
815
815
|
)
|
816
816
|
|
817
817
|
def _apply_style(self, text: str, style_tag: str) -> str:
|
@@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|
938
938
|
self.parents[self.level + 1] = self.doc.add_heading(
|
939
939
|
heading.value,
|
940
940
|
level=self.level,
|
941
|
-
parent=self.parents[self.level],
|
941
|
+
parent=self.parents[self.level],
|
942
942
|
)
|
943
943
|
self.level += 1
|
944
944
|
|
@@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|
959
959
|
|
960
960
|
if field == self.Field.TITLE.value:
|
961
961
|
self.parents[self.level + 1] = self.doc.add_title(
|
962
|
-
parent=self.parents[self.level], text=value
|
962
|
+
parent=self.parents[self.level], text=value
|
963
963
|
)
|
964
964
|
self.level += 1
|
965
965
|
|
@@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
|
|
971
971
|
self.doc.add_text(
|
972
972
|
label=DocItemLabel.PARAGRAPH,
|
973
973
|
text=value,
|
974
|
-
parent=self.parents[self.level],
|
974
|
+
parent=self.parents[self.level],
|
975
975
|
)
|
976
976
|
|
977
977
|
elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
|
978
978
|
self.doc.add_text(
|
979
979
|
label=DocItemLabel.PARAGRAPH,
|
980
980
|
text="",
|
981
|
-
parent=self.parents[self.level],
|
981
|
+
parent=self.parents[self.level],
|
982
982
|
)
|
983
983
|
|
984
984
|
elif (
|
@@ -996,7 +996,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|
996
996
|
last_claim = self.doc.add_text(
|
997
997
|
label=DocItemLabel.PARAGRAPH,
|
998
998
|
text="",
|
999
|
-
parent=self.parents[self.level],
|
999
|
+
parent=self.parents[self.level],
|
1000
1000
|
)
|
1001
1001
|
|
1002
1002
|
last_claim.text += f" {value}" if last_claim.text else value
|
@@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|
1012
1012
|
self.parents[self.level + 1] = self.doc.add_heading(
|
1013
1013
|
value,
|
1014
1014
|
level=self.level,
|
1015
|
-
parent=self.parents[self.level],
|
1015
|
+
parent=self.parents[self.level],
|
1016
1016
|
)
|
1017
1017
|
self.level += 1
|
1018
1018
|
|
@@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|
1029
1029
|
self.doc.add_text(
|
1030
1030
|
label=DocItemLabel.PARAGRAPH,
|
1031
1031
|
text=value,
|
1032
|
-
parent=self.parents[self.level],
|
1032
|
+
parent=self.parents[self.level],
|
1033
1033
|
)
|
1034
1034
|
|
1035
1035
|
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
|
@@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1283
1283
|
title = text.strip()
|
1284
1284
|
if title:
|
1285
1285
|
self.parents[self.level + 1] = self.doc.add_text(
|
1286
|
-
parent=self.parents[self.level],
|
1286
|
+
parent=self.parents[self.level],
|
1287
1287
|
label=DocItemLabel.TITLE,
|
1288
1288
|
text=title,
|
1289
1289
|
)
|
@@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1301
1301
|
abstract_item = self.doc.add_heading(
|
1302
1302
|
heading_text,
|
1303
1303
|
level=heading_level,
|
1304
|
-
parent=self.parents[heading_level],
|
1304
|
+
parent=self.parents[heading_level],
|
1305
1305
|
)
|
1306
1306
|
self.doc.add_text(
|
1307
1307
|
label=DocItemLabel.PARAGRAPH,
|
@@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1331
1331
|
claims_item = self.doc.add_heading(
|
1332
1332
|
heading_text,
|
1333
1333
|
level=heading_level,
|
1334
|
-
parent=self.parents[heading_level],
|
1334
|
+
parent=self.parents[heading_level],
|
1335
1335
|
)
|
1336
1336
|
for text in self.claims:
|
1337
1337
|
self.doc.add_text(
|
@@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1350
1350
|
self.parents[self.level + 1] = self.doc.add_heading(
|
1351
1351
|
text=text,
|
1352
1352
|
level=self.level,
|
1353
|
-
parent=self.parents[self.level],
|
1353
|
+
parent=self.parents[self.level],
|
1354
1354
|
)
|
1355
1355
|
self.level += 1
|
1356
1356
|
else:
|
1357
1357
|
self.doc.add_text(
|
1358
1358
|
label=DocItemLabel.PARAGRAPH,
|
1359
1359
|
text=text,
|
1360
|
-
parent=self.parents[self.level],
|
1360
|
+
parent=self.parents[self.level],
|
1361
1361
|
)
|
1362
1362
|
self.text = ""
|
1363
1363
|
|
@@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1366
1366
|
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
1367
1367
|
self.doc.add_table(
|
1368
1368
|
data=empty_table,
|
1369
|
-
parent=self.parents[self.level],
|
1369
|
+
parent=self.parents[self.level],
|
1370
1370
|
)
|
1371
1371
|
|
1372
1372
|
def _apply_style(self, text: str, style_tag: str) -> str:
|
docling/cli/main.py
CHANGED
@@ -1,18 +1,18 @@
|
|
1
1
|
import importlib
|
2
|
-
import json
|
3
2
|
import logging
|
3
|
+
import platform
|
4
4
|
import re
|
5
|
+
import sys
|
5
6
|
import tempfile
|
6
7
|
import time
|
7
8
|
import warnings
|
8
|
-
from enum import Enum
|
9
9
|
from pathlib import Path
|
10
10
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
11
11
|
|
12
12
|
import typer
|
13
13
|
from docling_core.types.doc import ImageRefMode
|
14
14
|
from docling_core.utils.file import resolve_source_to_path
|
15
|
-
from pydantic import TypeAdapter
|
15
|
+
from pydantic import TypeAdapter
|
16
16
|
|
17
17
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
18
18
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
@@ -65,10 +65,15 @@ def version_callback(value: bool):
|
|
65
65
|
docling_core_version = importlib.metadata.version("docling-core")
|
66
66
|
docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
|
67
67
|
docling_parse_version = importlib.metadata.version("docling-parse")
|
68
|
+
platform_str = platform.platform()
|
69
|
+
py_impl_version = sys.implementation.cache_tag
|
70
|
+
py_lang_version = platform.python_version()
|
68
71
|
print(f"Docling version: {docling_version}")
|
69
72
|
print(f"Docling Core version: {docling_core_version}")
|
70
73
|
print(f"Docling IBM Models version: {docling_ibm_models_version}")
|
71
74
|
print(f"Docling Parse version: {docling_parse_version}")
|
75
|
+
print(f"Python: {py_impl_version} ({py_lang_version})")
|
76
|
+
print(f"Platform: {platform_str}")
|
72
77
|
raise typer.Exit()
|
73
78
|
|
74
79
|
|
@@ -206,6 +211,14 @@ def convert(
|
|
206
211
|
TableFormerMode,
|
207
212
|
typer.Option(..., help="The mode to use in the table structure model."),
|
208
213
|
] = TableFormerMode.FAST,
|
214
|
+
enrich_code: Annotated[
|
215
|
+
bool,
|
216
|
+
typer.Option(..., help="Enable the code enrichment model in the pipeline."),
|
217
|
+
] = False,
|
218
|
+
enrich_formula: Annotated[
|
219
|
+
bool,
|
220
|
+
typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
|
221
|
+
] = False,
|
209
222
|
artifacts_path: Annotated[
|
210
223
|
Optional[Path],
|
211
224
|
typer.Option(..., help="If provided, the location of the model artifacts."),
|
@@ -360,6 +373,8 @@ def convert(
|
|
360
373
|
do_ocr=ocr,
|
361
374
|
ocr_options=ocr_options,
|
362
375
|
do_table_structure=True,
|
376
|
+
do_code_enrichment=enrich_code,
|
377
|
+
do_formula_enrichment=enrich_formula,
|
363
378
|
document_timeout=document_timeout,
|
364
379
|
)
|
365
380
|
pipeline_options.table_structure_options.do_cell_matching = (
|
docling/datamodel/document.py
CHANGED
@@ -352,6 +352,8 @@ class _DocumentConversionInput(BaseModel):
|
|
352
352
|
mime = FormatToMimeType[InputFormat.MD][0]
|
353
353
|
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
|
354
354
|
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
|
355
|
+
elif ext in FormatToExtensions[InputFormat.PDF]:
|
356
|
+
mime = FormatToMimeType[InputFormat.PDF][0]
|
355
357
|
return mime
|
356
358
|
|
357
359
|
@staticmethod
|
@@ -119,6 +119,7 @@ class RapidOcrOptions(OcrOptions):
|
|
119
119
|
det_model_path: Optional[str] = None # same default as rapidocr
|
120
120
|
cls_model_path: Optional[str] = None # same default as rapidocr
|
121
121
|
rec_model_path: Optional[str] = None # same default as rapidocr
|
122
|
+
rec_keys_path: Optional[str] = None # same default as rapidocr
|
122
123
|
|
123
124
|
model_config = ConfigDict(
|
124
125
|
extra="forbid",
|
@@ -4,7 +4,7 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
6
|
from subprocess import DEVNULL, PIPE, Popen
|
7
|
-
from typing import Iterable, Optional, Tuple
|
7
|
+
from typing import Iterable, List, Optional, Tuple
|
8
8
|
|
9
9
|
import pandas as pd
|
10
10
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
@@ -14,6 +14,7 @@ from docling.datamodel.document import ConversionResult
|
|
14
14
|
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
15
15
|
from docling.datamodel.settings import settings
|
16
16
|
from docling.models.base_ocr_model import BaseOcrModel
|
17
|
+
from docling.utils.ocr_utils import map_tesseract_script
|
17
18
|
from docling.utils.profiling import TimeRecorder
|
18
19
|
|
19
20
|
_log = logging.getLogger(__name__)
|
@@ -28,10 +29,13 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
28
29
|
|
29
30
|
self._name: Optional[str] = None
|
30
31
|
self._version: Optional[str] = None
|
32
|
+
self._tesseract_languages: Optional[List[str]] = None
|
33
|
+
self._script_prefix: Optional[str] = None
|
31
34
|
|
32
35
|
if self.enabled:
|
33
36
|
try:
|
34
37
|
self._get_name_and_version()
|
38
|
+
self._set_languages_and_prefix()
|
35
39
|
|
36
40
|
except Exception as exc:
|
37
41
|
raise RuntimeError(
|
@@ -73,12 +77,20 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
73
77
|
return name, version
|
74
78
|
|
75
79
|
def _run_tesseract(self, ifilename: str):
|
76
|
-
|
80
|
+
r"""
|
81
|
+
Run tesseract CLI
|
82
|
+
"""
|
77
83
|
cmd = [self.options.tesseract_cmd]
|
78
84
|
|
79
|
-
if self.options.lang is not None and len(self.options.lang) > 0:
|
85
|
+
if "auto" in self.options.lang:
|
86
|
+
lang = self._detect_language(ifilename)
|
87
|
+
if lang is not None:
|
88
|
+
cmd.append("-l")
|
89
|
+
cmd.append(lang)
|
90
|
+
elif self.options.lang is not None and len(self.options.lang) > 0:
|
80
91
|
cmd.append("-l")
|
81
92
|
cmd.append("+".join(self.options.lang))
|
93
|
+
|
82
94
|
if self.options.path is not None:
|
83
95
|
cmd.append("--tessdata-dir")
|
84
96
|
cmd.append(self.options.path)
|
@@ -106,6 +118,63 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
106
118
|
|
107
119
|
return df_filtered
|
108
120
|
|
121
|
+
def _detect_language(self, ifilename: str):
|
122
|
+
r"""
|
123
|
+
Run tesseract in PSM 0 mode to detect the language
|
124
|
+
"""
|
125
|
+
assert self._tesseract_languages is not None
|
126
|
+
|
127
|
+
cmd = [self.options.tesseract_cmd]
|
128
|
+
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
129
|
+
_log.info("command: {}".format(" ".join(cmd)))
|
130
|
+
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
131
|
+
output, _ = proc.communicate()
|
132
|
+
decoded_data = output.decode("utf-8")
|
133
|
+
df = pd.read_csv(
|
134
|
+
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
135
|
+
)
|
136
|
+
scripts = df.loc[df["key"] == "Script"].value.tolist()
|
137
|
+
if len(scripts) == 0:
|
138
|
+
_log.warning("Tesseract cannot detect the script of the page")
|
139
|
+
return None
|
140
|
+
|
141
|
+
script = map_tesseract_script(scripts[0].strip())
|
142
|
+
lang = f"{self._script_prefix}{script}"
|
143
|
+
|
144
|
+
# Check if the detected language has been installed
|
145
|
+
if lang not in self._tesseract_languages:
|
146
|
+
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
|
147
|
+
msg += " However this language is not installed in your system and will be ignored."
|
148
|
+
_log.warning(msg)
|
149
|
+
return None
|
150
|
+
|
151
|
+
_log.debug(
|
152
|
+
f"Using tesseract model for the detected script '{script}' and language '{lang}'"
|
153
|
+
)
|
154
|
+
return lang
|
155
|
+
|
156
|
+
def _set_languages_and_prefix(self):
|
157
|
+
r"""
|
158
|
+
Read and set the languages installed in tesseract and decide the script prefix
|
159
|
+
"""
|
160
|
+
# Get all languages
|
161
|
+
cmd = [self.options.tesseract_cmd]
|
162
|
+
cmd.append("--list-langs")
|
163
|
+
_log.info("command: {}".format(" ".join(cmd)))
|
164
|
+
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
165
|
+
output, _ = proc.communicate()
|
166
|
+
decoded_data = output.decode("utf-8")
|
167
|
+
df = pd.read_csv(io.StringIO(decoded_data), header=None)
|
168
|
+
self._tesseract_languages = df[0].tolist()[1:]
|
169
|
+
|
170
|
+
# Decide the script prefix
|
171
|
+
if any([l.startswith("script/") for l in self._tesseract_languages]):
|
172
|
+
script_prefix = "script/"
|
173
|
+
else:
|
174
|
+
script_prefix = ""
|
175
|
+
|
176
|
+
self._script_prefix = script_prefix
|
177
|
+
|
109
178
|
def __call__(
|
110
179
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
111
180
|
) -> Iterable[Page]:
|
@@ -120,7 +189,6 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
120
189
|
yield page
|
121
190
|
else:
|
122
191
|
with TimeRecorder(conv_res, "ocr"):
|
123
|
-
|
124
192
|
ocr_rects = self.get_ocr_rects(page)
|
125
193
|
|
126
194
|
all_ocr_cells = []
|
@@ -8,6 +8,7 @@ from docling.datamodel.document import ConversionResult
|
|
8
8
|
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
9
9
|
from docling.datamodel.settings import settings
|
10
10
|
from docling.models.base_ocr_model import BaseOcrModel
|
11
|
+
from docling.utils.ocr_utils import map_tesseract_script
|
11
12
|
from docling.utils.profiling import TimeRecorder
|
12
13
|
|
13
14
|
_log = logging.getLogger(__name__)
|
@@ -20,6 +21,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
20
21
|
|
21
22
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
22
23
|
self.reader = None
|
24
|
+
self.osd_reader = None
|
23
25
|
|
24
26
|
if self.enabled:
|
25
27
|
install_errmsg = (
|
@@ -47,8 +49,8 @@ class TesseractOcrModel(BaseOcrModel):
|
|
47
49
|
except:
|
48
50
|
raise ImportError(install_errmsg)
|
49
51
|
|
50
|
-
_,
|
51
|
-
if not
|
52
|
+
_, self._tesserocr_languages = tesserocr.get_languages()
|
53
|
+
if not self._tesserocr_languages:
|
52
54
|
raise ImportError(missing_langs_errmsg)
|
53
55
|
|
54
56
|
# Initialize the tesseractAPI
|
@@ -57,7 +59,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
57
59
|
|
58
60
|
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
59
61
|
|
60
|
-
if any([l.startswith("script/") for l in
|
62
|
+
if any([l.startswith("script/") for l in self._tesserocr_languages]):
|
61
63
|
self.script_prefix = "script/"
|
62
64
|
else:
|
63
65
|
self.script_prefix = ""
|
@@ -72,14 +74,14 @@ class TesseractOcrModel(BaseOcrModel):
|
|
72
74
|
tesserocr_kwargs["path"] = self.options.path
|
73
75
|
|
74
76
|
if lang == "auto":
|
75
|
-
self.reader = tesserocr.PyTessBaseAPI(
|
77
|
+
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
|
78
|
+
self.osd_reader = tesserocr.PyTessBaseAPI(
|
76
79
|
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
77
80
|
)
|
78
81
|
else:
|
79
82
|
self.reader = tesserocr.PyTessBaseAPI(
|
80
83
|
**{"lang": lang} | tesserocr_kwargs,
|
81
84
|
)
|
82
|
-
|
83
85
|
self.reader_RIL = tesserocr.RIL
|
84
86
|
|
85
87
|
def __del__(self):
|
@@ -96,8 +98,6 @@ class TesseractOcrModel(BaseOcrModel):
|
|
96
98
|
yield from page_batch
|
97
99
|
return
|
98
100
|
|
99
|
-
import tesserocr
|
100
|
-
|
101
101
|
for page in page_batch:
|
102
102
|
assert page._backend is not None
|
103
103
|
if not page._backend.is_valid():
|
@@ -105,6 +105,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
105
105
|
else:
|
106
106
|
with TimeRecorder(conv_res, "ocr"):
|
107
107
|
assert self.reader is not None
|
108
|
+
assert self._tesserocr_languages is not None
|
108
109
|
|
109
110
|
ocr_rects = self.get_ocr_rects(page)
|
110
111
|
|
@@ -117,43 +118,42 @@ class TesseractOcrModel(BaseOcrModel):
|
|
117
118
|
scale=self.scale, cropbox=ocr_rect
|
118
119
|
)
|
119
120
|
|
120
|
-
|
121
|
-
self.
|
121
|
+
local_reader = self.reader
|
122
|
+
if "auto" in self.options.lang:
|
123
|
+
assert self.osd_reader is not None
|
122
124
|
|
123
|
-
|
124
|
-
osd = self.
|
125
|
+
self.osd_reader.SetImage(high_res_image)
|
126
|
+
osd = self.osd_reader.DetectOrientationScript()
|
125
127
|
|
126
128
|
# No text, probably
|
127
129
|
if osd is None:
|
128
130
|
continue
|
129
131
|
|
130
132
|
script = osd["script_name"]
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
local_reader = self.reader
|
156
|
-
|
133
|
+
script = map_tesseract_script(script)
|
134
|
+
lang = f"{self.script_prefix}{script}"
|
135
|
+
|
136
|
+
# Check if the detected language is present in the system
|
137
|
+
if lang not in self._tesserocr_languages:
|
138
|
+
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
|
139
|
+
msg += " However this language is not installed in your system and will be ignored."
|
140
|
+
_log.warning(msg)
|
141
|
+
else:
|
142
|
+
if script not in self.script_readers:
|
143
|
+
import tesserocr
|
144
|
+
|
145
|
+
self.script_readers[script] = (
|
146
|
+
tesserocr.PyTessBaseAPI(
|
147
|
+
path=self.reader.GetDatapath(),
|
148
|
+
lang=lang,
|
149
|
+
psm=tesserocr.PSM.AUTO,
|
150
|
+
init=True,
|
151
|
+
oem=tesserocr.OEM.DEFAULT,
|
152
|
+
)
|
153
|
+
)
|
154
|
+
local_reader = self.script_readers[script]
|
155
|
+
|
156
|
+
local_reader.SetImage(high_res_image)
|
157
157
|
boxes = local_reader.GetComponentImages(
|
158
158
|
self.reader_RIL.TEXTLINE, True
|
159
159
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.16.0
|
3
|
+
Version: 2.17.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -78,22 +78,21 @@ Description-Content-Type: text/markdown
|
|
78
78
|
[](https://opensource.org/licenses/MIT)
|
79
79
|
[](https://pepy.tech/projects/docling)
|
80
80
|
|
81
|
-
Docling
|
81
|
+
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
82
82
|
|
83
83
|
## Features
|
84
84
|
|
85
|
-
* 🗂️
|
86
|
-
* 📑 Advanced PDF
|
87
|
-
*
|
88
|
-
*
|
89
|
-
*
|
85
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
|
86
|
+
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
87
|
+
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
88
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
|
89
|
+
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
90
|
+
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
91
|
+
* 🔍 Extensive OCR support for scanned PDFs and images
|
90
92
|
* 💻 Simple and convenient CLI
|
91
93
|
|
92
|
-
Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
|
93
|
-
|
94
94
|
### Coming soon
|
95
95
|
|
96
|
-
* ♾️ Equation & code extraction
|
97
96
|
* 📝 Metadata extraction, including title, authors, references & language
|
98
97
|
|
99
98
|
## Installation
|
@@ -177,3 +176,7 @@ For individual model usage, please refer to the model licenses found in the orig
|
|
177
176
|
|
178
177
|
Docling has been brought to you by IBM.
|
179
178
|
|
179
|
+
[supported_formats]: https://ds4sd.github.io/docling/supported_formats/
|
180
|
+
[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
|
181
|
+
[integrations]: https://ds4sd.github.io/docling/integrations/
|
182
|
+
|
@@ -4,10 +4,10 @@ docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxA
|
|
4
4
|
docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQQknNXk,14050
|
5
5
|
docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
|
6
6
|
docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
|
7
|
-
docling/backend/html_backend.py,sha256=
|
7
|
+
docling/backend/html_backend.py,sha256=DDfQ84VQB4nF_0wgGtbYUA9luVumB5bjjoWHjESa6Tk,15596
|
8
8
|
docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
9
|
docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
|
10
|
-
docling/backend/md_backend.py,sha256=
|
10
|
+
docling/backend/md_backend.py,sha256=PicGKM2cg4r1lztr46eC4sKbFLvGnqzrEcLTE5fW1zc,14426
|
11
11
|
docling/backend/msexcel_backend.py,sha256=lyJc4ShJGAN2ZfNTTuhdYTF-44cZsGyn_8Djstp3IEU,12700
|
12
12
|
docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
|
13
13
|
docling/backend/msword_backend.py,sha256=WcQmRYmpH8o2snGoWGxNRkCtUI3mf2JL3-9CxAfDAJg,19232
|
@@ -15,14 +15,14 @@ docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4i
|
|
15
15
|
docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
|
16
16
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
|
18
|
-
docling/backend/xml/uspto_backend.py,sha256=
|
18
|
+
docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
|
19
19
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
20
20
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
docling/cli/main.py,sha256=
|
21
|
+
docling/cli/main.py,sha256=K5C2yQIoM40_W3YU8a7SmneY-hWbNp_JOFPLk0NPcDI,16098
|
22
22
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
23
|
docling/datamodel/base_models.py,sha256=vewP1X99qfAwiUsiC2m8CBDGiQPsGyp_WkKJHYPoYn4,7026
|
24
|
-
docling/datamodel/document.py,sha256=
|
25
|
-
docling/datamodel/pipeline_options.py,sha256=
|
24
|
+
docling/datamodel/document.py,sha256=vuY8S9n-_w5UQl-7C_wasrW4bSHPQeAeH4RR-MWrGW4,13315
|
25
|
+
docling/datamodel/pipeline_options.py,sha256=f9-VQFgOdahyclGQgH_T8ZYBopkWsF_fbWbxo39ux3g,7888
|
26
26
|
docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
|
27
27
|
docling/document_converter.py,sha256=qtYPEkWuMUUGmFko2in38iSHdYrjAFf_GHNoXRRvEVs,12631
|
28
28
|
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
@@ -37,10 +37,10 @@ docling/models/layout_model.py,sha256=3Fw7OM6g0j7NgItKsQOgFOCd1q6lp1DacN_db7f6QC
|
|
37
37
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
38
38
|
docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
|
39
39
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
40
|
-
docling/models/rapid_ocr_model.py,sha256=
|
40
|
+
docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
|
41
41
|
docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
|
42
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=
|
43
|
-
docling/models/tesseract_ocr_model.py,sha256=
|
42
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
|
43
|
+
docling/models/tesseract_ocr_model.py,sha256=BN85u-4a-xzUY7Iw21Ib8L8kx4mgbDGiUtxBelLiJm8,8513
|
44
44
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
45
45
|
docling/pipeline/base_pipeline.py,sha256=J0ZjtincsJr-BbRgqoQozxIhDWxWFlWaS9CTPwypJFk,8621
|
46
46
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
@@ -51,11 +51,12 @@ docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbS
|
|
51
51
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
52
52
|
docling/utils/glm_utils.py,sha256=Nfxdx0W-sl1owYncTeJmZdiPcn-jpTqK8f8TeQlDOMY,11683
|
53
53
|
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
54
|
+
docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
|
54
55
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
55
56
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
56
57
|
docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
|
57
|
-
docling-2.
|
58
|
-
docling-2.
|
59
|
-
docling-2.
|
60
|
-
docling-2.
|
61
|
-
docling-2.
|
58
|
+
docling-2.17.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
59
|
+
docling-2.17.0.dist-info/METADATA,sha256=BkpXBck-2EjuYUsn_2aAGftgdbf260baqAb9P8ZixSM,8025
|
60
|
+
docling-2.17.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
61
|
+
docling-2.17.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
62
|
+
docling-2.17.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|