docling 2.47.0__tar.gz → 2.48.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.47.0 → docling-2.48.0}/PKG-INFO +4 -3
- {docling-2.47.0 → docling-2.48.0}/docling/backend/html_backend.py +172 -76
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/pipeline_options.py +2 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/rapid_ocr_model.py +37 -16
- {docling-2.47.0 → docling-2.48.0}/docling/pipeline/base_pipeline.py +3 -2
- {docling-2.47.0 → docling-2.48.0}/docling.egg-info/PKG-INFO +4 -3
- {docling-2.47.0 → docling-2.48.0}/docling.egg-info/requires.txt +4 -3
- {docling-2.47.0 → docling-2.48.0}/pyproject.toml +4 -3
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_webp.py +2 -2
- {docling-2.47.0 → docling-2.48.0}/LICENSE +0 -0
- {docling-2.47.0 → docling-2.48.0}/README.md +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/md_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/mets_gbs_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/chunking/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/cli/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/cli/main.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/cli/models.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/cli/tools.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/document.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/settings.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/document_converter.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/exceptions.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/base_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/layout_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/py.typed +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/__init__.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/export.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/locks.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/orientation.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/profiling.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/utils.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling/utils/visualization.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.47.0 → docling-2.48.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.47.0 → docling-2.48.0}/setup.cfg +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_csv.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_html.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_jats.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_mets_gbs.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_msword.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_cli.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_code_formula.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_input_doc.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_interfaces.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_invalid_input.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_options.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_settings_load.py +0 -0
- {docling-2.47.0 → docling-2.48.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.47.0
|
3
|
+
Version: 2.48.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -59,10 +59,11 @@ Provides-Extra: vlm
|
|
59
59
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
60
60
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
61
|
Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
|
-
Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux") and extra == "vlm"
|
62
|
+
Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
|
63
63
|
Provides-Extra: rapidocr
|
64
|
-
Requires-Dist: rapidocr
|
64
|
+
Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
|
65
65
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
66
|
+
Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
|
66
67
|
Provides-Extra: asr
|
67
68
|
Requires-Dist: openai-whisper>=20250625; extra == "asr"
|
68
69
|
Dynamic: license-file
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
|
+
import traceback
|
3
4
|
from contextlib import contextmanager
|
4
5
|
from copy import deepcopy
|
5
6
|
from io import BytesIO
|
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
|
|
45
46
|
"h4",
|
46
47
|
"h5",
|
47
48
|
"h6",
|
49
|
+
"ol",
|
48
50
|
"p",
|
49
51
|
"pre",
|
50
|
-
"code",
|
51
|
-
"ul",
|
52
|
-
"ol",
|
53
52
|
"summary",
|
54
53
|
"table",
|
54
|
+
"ul",
|
55
55
|
}
|
56
56
|
|
57
|
+
_CODE_TAG_SET: Final = {"code", "kbd", "samp"}
|
58
|
+
|
57
59
|
_FORMAT_TAG_MAP: Final = {
|
58
60
|
"b": {"bold": True},
|
59
61
|
"strong": {"bold": True},
|
60
62
|
"i": {"italic": True},
|
61
63
|
"em": {"italic": True},
|
64
|
+
"var": {"italic": True},
|
62
65
|
# "mark",
|
63
66
|
# "small",
|
64
67
|
"s": {"strikethrough": True},
|
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
|
|
67
70
|
"ins": {"underline": True},
|
68
71
|
"sub": {"script": Script.SUB},
|
69
72
|
"sup": {"script": Script.SUPER},
|
73
|
+
**{k: {} for k in _CODE_TAG_SET},
|
70
74
|
}
|
71
75
|
|
72
76
|
|
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
|
|
79
83
|
text: str
|
80
84
|
hyperlink: Union[AnyUrl, Path, None] = None
|
81
85
|
formatting: Union[Formatting, None] = None
|
86
|
+
code: bool = False
|
82
87
|
|
83
88
|
|
84
89
|
class AnnotatedTextList(list):
|
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
|
|
86
91
|
current_h = None
|
87
92
|
current_text = ""
|
88
93
|
current_f = None
|
94
|
+
current_code = False
|
89
95
|
for at in self:
|
90
96
|
t = at.text
|
91
97
|
h = at.hyperlink
|
92
98
|
f = at.formatting
|
99
|
+
c = at.code
|
93
100
|
current_text += t.strip() + " "
|
94
101
|
if f is not None and current_f is None:
|
95
102
|
current_f = f
|
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
|
|
103
110
|
_log.warning(
|
104
111
|
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
|
105
112
|
)
|
113
|
+
current_code = c if c else current_code
|
114
|
+
|
106
115
|
return AnnotatedText(
|
107
|
-
text=current_text.strip(),
|
116
|
+
text=current_text.strip(),
|
117
|
+
hyperlink=current_h,
|
118
|
+
formatting=current_f,
|
119
|
+
code=current_code,
|
108
120
|
)
|
109
121
|
|
110
122
|
def simplify_text_elements(self) -> "AnnotatedTextList":
|
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
|
|
114
126
|
text = self[0].text
|
115
127
|
hyperlink = self[0].hyperlink
|
116
128
|
formatting = self[0].formatting
|
129
|
+
code = self[0].code
|
117
130
|
last_elm = text
|
118
131
|
for i in range(1, len(self)):
|
119
|
-
if
|
132
|
+
if (
|
133
|
+
hyperlink == self[i].hyperlink
|
134
|
+
and formatting == self[i].formatting
|
135
|
+
and code == self[i].code
|
136
|
+
):
|
120
137
|
sep = " "
|
121
138
|
if not self[i].text.strip() or not last_elm.strip():
|
122
139
|
sep = ""
|
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
|
|
124
141
|
last_elm = self[i].text
|
125
142
|
else:
|
126
143
|
simplified.append(
|
127
|
-
AnnotatedText(
|
144
|
+
AnnotatedText(
|
145
|
+
text=text, hyperlink=hyperlink, formatting=formatting, code=code
|
146
|
+
)
|
128
147
|
)
|
129
148
|
text = self[i].text
|
130
149
|
last_elm = text
|
131
150
|
hyperlink = self[i].hyperlink
|
132
151
|
formatting = self[i].formatting
|
152
|
+
code = self[i].code
|
133
153
|
if text:
|
134
154
|
simplified.append(
|
135
|
-
AnnotatedText(
|
155
|
+
AnnotatedText(
|
156
|
+
text=text, hyperlink=hyperlink, formatting=formatting, code=code
|
157
|
+
)
|
136
158
|
)
|
137
159
|
return simplified
|
138
160
|
|
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
174
196
|
self.ctx = _Context()
|
175
197
|
for i in range(self.max_levels):
|
176
198
|
self.parents[i] = None
|
177
|
-
self.hyperlink = None
|
199
|
+
self.hyperlink: Union[AnyUrl, Path, None] = None
|
178
200
|
self.original_url = original_url
|
179
201
|
self.format_tags: list[str] = []
|
180
202
|
|
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
235
257
|
orig=title_text,
|
236
258
|
content_layer=ContentLayer.FURNITURE,
|
237
259
|
)
|
238
|
-
# remove
|
260
|
+
# remove script and style tags
|
239
261
|
for tag in self.soup(["script", "style"]):
|
240
262
|
tag.decompose()
|
263
|
+
# remove any hidden tag
|
264
|
+
for tag in self.soup(hidden=True):
|
265
|
+
tag.decompose()
|
266
|
+
|
241
267
|
content = self.soup.body or self.soup
|
242
268
|
# normalize <br> tags
|
243
269
|
for br in content("br"):
|
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
268
294
|
def flush_buffer():
|
269
295
|
if not buffer:
|
270
296
|
return
|
271
|
-
annotated_text_list = buffer.simplify_text_elements()
|
297
|
+
annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
|
272
298
|
parts = annotated_text_list.split_by_newline()
|
273
299
|
buffer.clear()
|
274
300
|
|
@@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
276
302
|
return
|
277
303
|
|
278
304
|
for annotated_text_list in parts:
|
279
|
-
with self.
|
305
|
+
with self._use_inline_group(annotated_text_list, doc):
|
280
306
|
for annotated_text in annotated_text_list:
|
281
307
|
if annotated_text.text.strip():
|
282
308
|
seg_clean = HTMLDocumentBackend._clean_unicode(
|
283
309
|
annotated_text.text.strip()
|
284
310
|
)
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
311
|
+
if annotated_text.code:
|
312
|
+
doc.add_code(
|
313
|
+
parent=self.parents[self.level],
|
314
|
+
text=seg_clean,
|
315
|
+
content_layer=self.content_layer,
|
316
|
+
formatting=annotated_text.formatting,
|
317
|
+
hyperlink=annotated_text.hyperlink,
|
318
|
+
)
|
319
|
+
else:
|
320
|
+
doc.add_text(
|
321
|
+
parent=self.parents[self.level],
|
322
|
+
label=DocItemLabel.TEXT,
|
323
|
+
text=seg_clean,
|
324
|
+
content_layer=self.content_layer,
|
325
|
+
formatting=annotated_text.formatting,
|
326
|
+
hyperlink=annotated_text.hyperlink,
|
327
|
+
)
|
293
328
|
|
294
329
|
for node in element.contents:
|
295
330
|
if isinstance(node, Tag):
|
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
298
333
|
flush_buffer()
|
299
334
|
self._emit_image(node, doc)
|
300
335
|
elif name in _FORMAT_TAG_MAP:
|
301
|
-
with self.
|
336
|
+
with self._use_format([name]):
|
302
337
|
self._walk(node, doc)
|
303
338
|
elif name == "a":
|
304
|
-
with self.
|
339
|
+
with self._use_hyperlink(node):
|
305
340
|
self._walk(node, doc)
|
306
341
|
elif name in _BLOCK_TAGS:
|
307
342
|
flush_buffer()
|
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
367
402
|
this_parent = item.parent
|
368
403
|
while this_parent is not None:
|
369
404
|
if this_parent.name == "a" and this_parent.get("href"):
|
370
|
-
with self.
|
371
|
-
with self.
|
405
|
+
with self._use_format(format_tags):
|
406
|
+
with self._use_hyperlink(this_parent):
|
372
407
|
return self._extract_text_and_hyperlink_recursively(
|
373
408
|
item, ignore_list
|
374
409
|
)
|
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
379
414
|
|
380
415
|
if isinstance(item, NavigableString):
|
381
416
|
text = item.strip()
|
417
|
+
code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
|
382
418
|
if text:
|
383
419
|
return AnnotatedTextList(
|
384
420
|
[
|
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
386
422
|
text=text,
|
387
423
|
hyperlink=self.hyperlink,
|
388
424
|
formatting=self._formatting,
|
425
|
+
code=code,
|
389
426
|
)
|
390
427
|
]
|
391
428
|
)
|
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
396
433
|
text="\n",
|
397
434
|
hyperlink=self.hyperlink,
|
398
435
|
formatting=self._formatting,
|
436
|
+
code=code,
|
399
437
|
)
|
400
438
|
]
|
401
439
|
)
|
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
405
443
|
if not ignore_list or (tag.name not in ["ul", "ol"]):
|
406
444
|
for child in tag:
|
407
445
|
if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
|
408
|
-
with self.
|
446
|
+
with self._use_format([child.name]):
|
409
447
|
result.extend(
|
410
448
|
self._extract_text_and_hyperlink_recursively(
|
411
449
|
child, ignore_list, keep_newlines=keep_newlines
|
412
450
|
)
|
413
451
|
)
|
414
452
|
elif isinstance(child, Tag) and child.name == "a":
|
415
|
-
with self.
|
453
|
+
with self._use_hyperlink(child):
|
416
454
|
result.extend(
|
417
455
|
self._extract_text_and_hyperlink_recursively(
|
418
456
|
child, ignore_list, keep_newlines=keep_newlines
|
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
428
466
|
return result
|
429
467
|
|
430
468
|
@contextmanager
|
431
|
-
def
|
469
|
+
def _use_hyperlink(self, tag: Tag):
|
432
470
|
this_href = tag.get("href")
|
433
471
|
if this_href is None:
|
434
472
|
yield None
|
435
473
|
else:
|
436
|
-
if this_href:
|
437
|
-
old_hyperlink = self.hyperlink
|
474
|
+
if isinstance(this_href, str) and this_href:
|
475
|
+
old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
|
476
|
+
new_hyperlink: Union[AnyUrl, Path, None] = None
|
438
477
|
if self.original_url is not None:
|
439
|
-
this_href = urljoin(self.original_url, this_href)
|
478
|
+
this_href = urljoin(str(self.original_url), str(this_href))
|
440
479
|
# ugly fix for relative links since pydantic does not support them.
|
441
480
|
try:
|
442
|
-
AnyUrl(this_href)
|
481
|
+
new_hyperlink = AnyUrl(this_href)
|
443
482
|
except ValidationError:
|
444
|
-
|
445
|
-
self.hyperlink =
|
483
|
+
new_hyperlink = Path(this_href)
|
484
|
+
self.hyperlink = new_hyperlink
|
446
485
|
try:
|
447
486
|
yield None
|
448
487
|
finally:
|
449
|
-
if
|
488
|
+
if new_hyperlink:
|
450
489
|
self.hyperlink = old_hyperlink
|
451
490
|
|
452
491
|
@contextmanager
|
453
|
-
def
|
492
|
+
def _use_format(self, tags: list[str]):
|
454
493
|
if not tags:
|
455
494
|
yield None
|
456
495
|
else:
|
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
461
500
|
self.format_tags = self.format_tags[: -len(tags)]
|
462
501
|
|
463
502
|
@contextmanager
|
464
|
-
def
|
503
|
+
def _use_inline_group(
|
465
504
|
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
|
466
505
|
):
|
467
506
|
"""Create an inline group for annotated texts.
|
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
473
512
|
Args:
|
474
513
|
annotated_text_list (AnnotatedTextList): Annotated text
|
475
514
|
doc (DoclingDocument): Currently used document
|
476
|
-
|
477
|
-
Yields:
|
478
|
-
None: _description_
|
479
515
|
"""
|
480
516
|
if len(annotated_text_list) > 1:
|
481
517
|
inline_fmt = doc.add_group(
|
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
493
529
|
else:
|
494
530
|
yield None
|
495
531
|
|
532
|
+
@contextmanager
|
533
|
+
def _use_details(self, tag: Tag, doc: DoclingDocument):
|
534
|
+
"""Create a group with the content of a details tag.
|
535
|
+
|
536
|
+
While the context manager is active, the hierarchy level is set one
|
537
|
+
level higher than the current parent.
|
538
|
+
|
539
|
+
Args:
|
540
|
+
tag: The details tag.
|
541
|
+
doc: Currently used document.
|
542
|
+
"""
|
543
|
+
self.parents[self.level + 1] = doc.add_group(
|
544
|
+
name=tag.name,
|
545
|
+
label=GroupLabel.SECTION,
|
546
|
+
parent=self.parents[self.level],
|
547
|
+
content_layer=self.content_layer,
|
548
|
+
)
|
549
|
+
self.level += 1
|
550
|
+
try:
|
551
|
+
yield None
|
552
|
+
finally:
|
553
|
+
self.parents[self.level + 1] = None
|
554
|
+
self.level -= 1
|
555
|
+
|
556
|
+
@contextmanager
|
557
|
+
def _use_footer(self, tag: Tag, doc: DoclingDocument):
|
558
|
+
"""Create a group with a footer.
|
559
|
+
|
560
|
+
Create a group with the content of a footer tag. While the context manager
|
561
|
+
is active, the hierarchy level is set one level higher than the current parent.
|
562
|
+
|
563
|
+
Args:
|
564
|
+
tag: The footer tag.
|
565
|
+
doc: Currently used document.
|
566
|
+
"""
|
567
|
+
current_layer = self.content_layer
|
568
|
+
self.content_layer = ContentLayer.FURNITURE
|
569
|
+
self.parents[self.level + 1] = doc.add_group(
|
570
|
+
name=tag.name,
|
571
|
+
label=GroupLabel.SECTION,
|
572
|
+
parent=self.parents[self.level],
|
573
|
+
content_layer=self.content_layer,
|
574
|
+
)
|
575
|
+
self.level += 1
|
576
|
+
try:
|
577
|
+
yield None
|
578
|
+
finally:
|
579
|
+
self.parents[self.level + 1] = None
|
580
|
+
self.level -= 1
|
581
|
+
self.content_layer = current_layer
|
582
|
+
|
496
583
|
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
497
584
|
tag_name = tag.name.lower()
|
498
585
|
# set default content layer to BODY as soon as we encounter a heading
|
@@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
611
698
|
content_layer=self.content_layer,
|
612
699
|
)
|
613
700
|
self.level += 1
|
614
|
-
with self.
|
701
|
+
with self._use_inline_group(min_parts, doc):
|
615
702
|
for annotated_text in min_parts:
|
616
703
|
li_text = re.sub(
|
617
704
|
r"\s+|\n+", " ", annotated_text.text
|
618
705
|
).strip()
|
619
706
|
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
707
|
+
if annotated_text.code:
|
708
|
+
doc.add_code(
|
709
|
+
parent=self.parents[self.level],
|
710
|
+
text=li_clean,
|
711
|
+
content_layer=self.content_layer,
|
712
|
+
formatting=annotated_text.formatting,
|
713
|
+
hyperlink=annotated_text.hyperlink,
|
714
|
+
)
|
715
|
+
else:
|
716
|
+
doc.add_text(
|
717
|
+
parent=self.parents[self.level],
|
718
|
+
label=DocItemLabel.TEXT,
|
719
|
+
text=li_clean,
|
720
|
+
content_layer=self.content_layer,
|
721
|
+
formatting=annotated_text.formatting,
|
722
|
+
hyperlink=annotated_text.hyperlink,
|
723
|
+
)
|
628
724
|
|
629
725
|
# 4) recurse into any nested lists, attaching them to this <li> item
|
630
726
|
for sublist in li({"ul", "ol"}, recursive=False):
|
@@ -687,20 +783,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
687
783
|
text_list = self._extract_text_and_hyperlink_recursively(
|
688
784
|
tag, find_parent_annotation=True
|
689
785
|
)
|
690
|
-
annotated_texts = text_list.simplify_text_elements()
|
786
|
+
annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
|
691
787
|
for part in annotated_texts.split_by_newline():
|
692
|
-
with self.
|
788
|
+
with self._use_inline_group(part, doc):
|
693
789
|
for annotated_text in part:
|
694
790
|
if seg := annotated_text.text.strip():
|
695
791
|
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
792
|
+
if annotated_text.code:
|
793
|
+
doc.add_code(
|
794
|
+
parent=self.parents[self.level],
|
795
|
+
text=seg_clean,
|
796
|
+
content_layer=self.content_layer,
|
797
|
+
formatting=annotated_text.formatting,
|
798
|
+
hyperlink=annotated_text.hyperlink,
|
799
|
+
)
|
800
|
+
else:
|
801
|
+
doc.add_text(
|
802
|
+
parent=self.parents[self.level],
|
803
|
+
label=DocItemLabel.TEXT,
|
804
|
+
text=seg_clean,
|
805
|
+
content_layer=self.content_layer,
|
806
|
+
formatting=annotated_text.formatting,
|
807
|
+
hyperlink=annotated_text.hyperlink,
|
808
|
+
)
|
704
809
|
|
705
810
|
for img_tag in tag("img"):
|
706
811
|
if isinstance(img_tag, Tag):
|
@@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
718
823
|
content_layer=self.content_layer,
|
719
824
|
)
|
720
825
|
|
721
|
-
elif tag_name in {"pre"
|
826
|
+
elif tag_name in {"pre"}:
|
722
827
|
# handle monospace code snippets (pre).
|
723
828
|
text_list = self._extract_text_and_hyperlink_recursively(
|
724
|
-
tag, find_parent_annotation=True
|
829
|
+
tag, find_parent_annotation=True, keep_newlines=True
|
725
830
|
)
|
726
831
|
annotated_texts = text_list.simplify_text_elements()
|
727
|
-
with self.
|
832
|
+
with self._use_inline_group(annotated_texts, doc):
|
728
833
|
for annotated_text in annotated_texts:
|
729
834
|
text_clean = HTMLDocumentBackend._clean_unicode(
|
730
835
|
annotated_text.text.strip()
|
@@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
737
842
|
hyperlink=annotated_text.hyperlink,
|
738
843
|
)
|
739
844
|
|
740
|
-
elif tag_name
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
parent=self.parents[self.level],
|
748
|
-
content_layer=self.content_layer,
|
749
|
-
)
|
750
|
-
self.level += 1
|
751
|
-
self._walk(tag, doc)
|
752
|
-
self.parents[self.level + 1] = None
|
753
|
-
self.level -= 1
|
754
|
-
if tag_name == "footer":
|
755
|
-
self.content_layer = current_layer
|
845
|
+
elif tag_name == "footer":
|
846
|
+
with self._use_footer(tag, doc):
|
847
|
+
self._walk(tag, doc)
|
848
|
+
|
849
|
+
elif tag_name == "details":
|
850
|
+
with self._use_details(tag, doc):
|
851
|
+
self._walk(tag, doc)
|
756
852
|
|
757
853
|
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
758
854
|
figure = img_tag.find_parent("figure")
|
@@ -99,6 +99,8 @@ class RapidOcrOptions(OcrOptions):
|
|
99
99
|
# For more details on the following options visit
|
100
100
|
# https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
|
101
101
|
|
102
|
+
# https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#__tabbed_3_4
|
103
|
+
backend: Literal["onnxruntime", "openvino", "paddle", "torch"] = "onnxruntime"
|
102
104
|
text_score: float = 0.5 # same default as rapidocr
|
103
105
|
|
104
106
|
use_det: Optional[bool] = None # same default as rapidocr
|
@@ -42,10 +42,10 @@ class RapidOcrModel(BaseOcrModel):
|
|
42
42
|
|
43
43
|
if self.enabled:
|
44
44
|
try:
|
45
|
-
from
|
45
|
+
from rapidocr import EngineType, RapidOCR # type: ignore
|
46
46
|
except ImportError:
|
47
47
|
raise ImportError(
|
48
|
-
"RapidOCR is not installed. Please install it via `pip install
|
48
|
+
"RapidOCR is not installed. Please install it via `pip install rapidocr onnxruntime` to use this OCR engine. "
|
49
49
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
50
50
|
)
|
51
51
|
|
@@ -54,21 +54,39 @@ class RapidOcrModel(BaseOcrModel):
|
|
54
54
|
use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
|
55
55
|
use_dml = accelerator_options.device == AcceleratorDevice.AUTO
|
56
56
|
intra_op_num_threads = accelerator_options.num_threads
|
57
|
+
_ALIASES = {
|
58
|
+
"onnxruntime": EngineType.ONNXRUNTIME,
|
59
|
+
"openvino": EngineType.OPENVINO,
|
60
|
+
"paddle": EngineType.PADDLE,
|
61
|
+
"torch": EngineType.TORCH,
|
62
|
+
}
|
63
|
+
backend_enum = _ALIASES.get(self.options.backend, EngineType.ONNXRUNTIME)
|
57
64
|
|
58
65
|
self.reader = RapidOCR(
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
66
|
+
params={
|
67
|
+
# Global settings (these are still correct)
|
68
|
+
"Global.text_score": self.options.text_score,
|
69
|
+
# "Global.verbose": self.options.print_verbose,
|
70
|
+
# Detection model settings
|
71
|
+
"Det.model_path": self.options.det_model_path,
|
72
|
+
"Det.use_cuda": use_cuda,
|
73
|
+
"Det.use_dml": use_dml,
|
74
|
+
"Det.intra_op_num_threads": intra_op_num_threads,
|
75
|
+
# Classification model settings
|
76
|
+
"Cls.model_path": self.options.cls_model_path,
|
77
|
+
"Cls.use_cuda": use_cuda,
|
78
|
+
"Cls.use_dml": use_dml,
|
79
|
+
"Cls.intra_op_num_threads": intra_op_num_threads,
|
80
|
+
# Recognition model settings
|
81
|
+
"Rec.model_path": self.options.rec_model_path,
|
82
|
+
"Rec.keys_path": self.options.rec_keys_path,
|
83
|
+
"Rec.use_cuda": use_cuda,
|
84
|
+
"Rec.use_dml": use_dml,
|
85
|
+
"Rec.intra_op_num_threads": intra_op_num_threads,
|
86
|
+
"Det.engine_type": backend_enum,
|
87
|
+
"Cls.engine_type": backend_enum,
|
88
|
+
"Rec.engine_type": backend_enum,
|
89
|
+
}
|
72
90
|
)
|
73
91
|
|
74
92
|
def __call__(
|
@@ -95,12 +113,15 @@ class RapidOcrModel(BaseOcrModel):
|
|
95
113
|
scale=self.scale, cropbox=ocr_rect
|
96
114
|
)
|
97
115
|
im = numpy.array(high_res_image)
|
98
|
-
result
|
116
|
+
result = self.reader(
|
99
117
|
im,
|
100
118
|
use_det=self.options.use_det,
|
101
119
|
use_cls=self.options.use_cls,
|
102
120
|
use_rec=self.options.use_rec,
|
103
121
|
)
|
122
|
+
result = list(
|
123
|
+
zip(result.boxes.tolist(), result.txts, result.scores)
|
124
|
+
)
|
104
125
|
|
105
126
|
del high_res_image
|
106
127
|
del im
|
@@ -146,6 +146,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
146
146
|
conv_res.pages.append(Page(page_no=i))
|
147
147
|
|
148
148
|
try:
|
149
|
+
total_pages_processed = 0
|
149
150
|
# Iterate batches of pages (page_batch_size) in the doc
|
150
151
|
for page_batch in chunkify(
|
151
152
|
conv_res.pages, settings.perf.page_batch_size
|
@@ -186,9 +187,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
186
187
|
)
|
187
188
|
conv_res.status = ConversionStatus.PARTIAL_SUCCESS
|
188
189
|
break
|
189
|
-
|
190
|
+
total_pages_processed += len(page_batch)
|
190
191
|
_log.debug(
|
191
|
-
f"Finished converting
|
192
|
+
f"Finished converting pages {total_pages_processed}/{len(conv_res.pages)} time={end_batch_time:.3f}"
|
192
193
|
)
|
193
194
|
|
194
195
|
except Exception as e:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.47.0
|
3
|
+
Version: 2.48.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -59,10 +59,11 @@ Provides-Extra: vlm
|
|
59
59
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
60
60
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
61
|
Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
|
-
Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux") and extra == "vlm"
|
62
|
+
Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
|
63
63
|
Provides-Extra: rapidocr
|
64
|
-
Requires-Dist: rapidocr
|
64
|
+
Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
|
65
65
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
66
|
+
Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
|
66
67
|
Provides-Extra: asr
|
67
68
|
Requires-Dist: openai-whisper>=20250625; extra == "asr"
|
68
69
|
Dynamic: license-file
|
@@ -35,9 +35,10 @@ ocrmac<2.0.0,>=1.0.0
|
|
35
35
|
|
36
36
|
[rapidocr]
|
37
37
|
onnxruntime<2.0.0,>=1.7.0
|
38
|
+
modelscope>=1.29.0
|
38
39
|
|
39
|
-
[rapidocr:python_version < "3.13"]
|
40
|
-
rapidocr
|
40
|
+
[rapidocr:python_version < "3.14"]
|
41
|
+
rapidocr<4.0.0,>=3.3
|
41
42
|
|
42
43
|
[tesserocr]
|
43
44
|
tesserocr<3.0.0,>=2.7.1
|
@@ -49,5 +50,5 @@ accelerate<2.0.0,>=1.2.1
|
|
49
50
|
[vlm:python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"]
|
50
51
|
mlx-vlm<1.0.0,>=0.3.0
|
51
52
|
|
52
|
-
[vlm:python_version >= "3.10" and sys_platform == "linux"]
|
53
|
+
[vlm:python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64"]
|
53
54
|
vllm<1.0.0,>=0.10.0
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.47.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.48.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
license = "MIT"
|
6
6
|
keywords = [
|
@@ -93,11 +93,12 @@ vlm = [
|
|
93
93
|
'transformers (>=4.46.0,<5.0.0)',
|
94
94
|
'accelerate (>=1.2.1,<2.0.0)',
|
95
95
|
'mlx-vlm (>=0.3.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
|
96
|
-
'vllm (>=0.10.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "linux"',
|
96
|
+
'vllm (>=0.10.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64"',
|
97
97
|
]
|
98
98
|
rapidocr = [
|
99
|
-
'rapidocr
|
99
|
+
'rapidocr (>=3.3,<4.0.0) ; python_version < "3.14"',
|
100
100
|
'onnxruntime (>=1.7.0,<2.0.0)',
|
101
|
+
"modelscope>=1.29.0",
|
101
102
|
# 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
|
102
103
|
# 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
|
103
104
|
]
|
@@ -55,8 +55,8 @@ def test_e2e_webp_conversions():
|
|
55
55
|
TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
|
56
56
|
]
|
57
57
|
|
58
|
-
# rapidocr is only available for Python >=3.6,<3.13
|
59
|
-
if sys.version_info < (3, 13):
|
58
|
+
# rapidocr is only available for Python >=3.6,<3.14
|
59
|
+
if sys.version_info < (3, 14):
|
60
60
|
engines.append(RapidOcrOptions())
|
61
61
|
engines.append(RapidOcrOptions(force_full_page_ocr=True))
|
62
62
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|