docling 2.44.0__tar.gz → 2.46.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.44.0 → docling-2.46.0}/PKG-INFO +2 -2
- {docling-2.44.0 → docling-2.46.0}/docling/backend/docling_parse_v4_backend.py +61 -27
- {docling-2.44.0 → docling-2.46.0}/docling/backend/html_backend.py +356 -80
- docling-2.46.0/docling/backend/mets_gbs_backend.py +399 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/pdf_backend.py +3 -3
- {docling-2.44.0 → docling-2.46.0}/docling/cli/main.py +10 -0
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/base_models.py +3 -0
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/document.py +26 -0
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/pipeline_options.py +1 -3
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/pipeline_options_vlm_model.py +8 -2
- {docling-2.44.0 → docling-2.46.0}/docling/document_converter.py +4 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/api_vlm_model.py +2 -5
- {docling-2.44.0 → docling-2.46.0}/docling/models/code_formula_model.py +87 -76
- {docling-2.44.0 → docling-2.46.0}/docling/models/tesseract_ocr_cli_model.py +4 -2
- {docling-2.44.0 → docling-2.46.0}/docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
- {docling-2.44.0 → docling-2.46.0}/docling/models/vlm_models_inline/mlx_model.py +2 -4
- {docling-2.44.0 → docling-2.46.0}/docling/pipeline/base_pipeline.py +14 -5
- {docling-2.44.0 → docling-2.46.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +6 -4
- {docling-2.44.0 → docling-2.46.0}/docling.egg-info/PKG-INFO +2 -2
- {docling-2.44.0 → docling-2.46.0}/docling.egg-info/SOURCES.txt +2 -0
- {docling-2.44.0 → docling-2.46.0}/docling.egg-info/requires.txt +1 -1
- {docling-2.44.0 → docling-2.46.0}/pyproject.toml +2 -2
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_html.py +52 -0
- docling-2.46.0/tests/test_backend_mets_gbs.py +77 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_e2e_conversion.py +1 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_interfaces.py +3 -0
- {docling-2.44.0 → docling-2.46.0}/LICENSE +0 -0
- {docling-2.44.0 → docling-2.46.0}/README.md +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/md_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/chunking/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/cli/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/cli/models.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/cli/tools.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/settings.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/exceptions.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/base_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/layout_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/py.typed +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/__init__.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/export.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/locks.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/orientation.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/profiling.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/utils.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling/utils/visualization.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.44.0 → docling-2.46.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.44.0 → docling-2.46.0}/setup.cfg +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_csv.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_jats.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_msword.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_backend_webp.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_cli.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_code_formula.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_input_doc.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_invalid_input.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_options.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_settings_load.py +0 -0
- {docling-2.44.0 → docling-2.46.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.44.0
|
3
|
+
Version: 2.46.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
|
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
29
|
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
30
|
-
Requires-Dist: docling-parse<5.0.0,>=4.
|
30
|
+
Requires-Dist: docling-parse<5.0.0,>=4.2.2
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.9.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
@@ -22,15 +22,52 @@ _log = logging.getLogger(__name__)
|
|
22
22
|
|
23
23
|
|
24
24
|
class DoclingParseV4PageBackend(PdfPageBackend):
|
25
|
-
def __init__(
|
25
|
+
def __init__(
|
26
|
+
self,
|
27
|
+
*,
|
28
|
+
dp_doc: PdfDocument,
|
29
|
+
page_obj: PdfPage,
|
30
|
+
page_no: int,
|
31
|
+
create_words: bool = True,
|
32
|
+
create_textlines: bool = True,
|
33
|
+
):
|
26
34
|
self._ppage = page_obj
|
27
|
-
self.
|
28
|
-
self.
|
35
|
+
self._dp_doc = dp_doc
|
36
|
+
self._page_no = page_no
|
37
|
+
self._create_words = create_words
|
38
|
+
self._create_textlines = create_textlines
|
39
|
+
|
40
|
+
self._dpage: Optional[SegmentedPdfPage] = None
|
41
|
+
self._unloaded = False
|
42
|
+
self.valid = (self._ppage is not None) and (self._dp_doc is not None)
|
43
|
+
|
44
|
+
def _ensure_parsed(self) -> None:
|
45
|
+
if self._dpage is not None:
|
46
|
+
return
|
47
|
+
|
48
|
+
seg_page = self._dp_doc.get_page(
|
49
|
+
self._page_no + 1,
|
50
|
+
create_words=self._create_words,
|
51
|
+
create_textlines=self._create_textlines,
|
52
|
+
)
|
53
|
+
|
54
|
+
# In Docling, all TextCell instances are expected with top-left origin.
|
55
|
+
[
|
56
|
+
tc.to_top_left_origin(seg_page.dimension.height)
|
57
|
+
for tc in seg_page.textline_cells
|
58
|
+
]
|
59
|
+
[tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.char_cells]
|
60
|
+
[tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.word_cells]
|
61
|
+
|
62
|
+
self._dpage = seg_page
|
29
63
|
|
30
64
|
def is_valid(self) -> bool:
|
31
65
|
return self.valid
|
32
66
|
|
33
67
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
68
|
+
self._ensure_parsed()
|
69
|
+
assert self._dpage is not None
|
70
|
+
|
34
71
|
# Find intersecting cells on the page
|
35
72
|
text_piece = ""
|
36
73
|
page_size = self.get_size()
|
@@ -56,12 +93,19 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
56
93
|
return text_piece
|
57
94
|
|
58
95
|
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
96
|
+
self._ensure_parsed()
|
59
97
|
return self._dpage
|
60
98
|
|
61
99
|
def get_text_cells(self) -> Iterable[TextCell]:
|
100
|
+
self._ensure_parsed()
|
101
|
+
assert self._dpage is not None
|
102
|
+
|
62
103
|
return self._dpage.textline_cells
|
63
104
|
|
64
105
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
106
|
+
self._ensure_parsed()
|
107
|
+
assert self._dpage is not None
|
108
|
+
|
65
109
|
AREA_THRESHOLD = 0 # 32 * 32
|
66
110
|
|
67
111
|
images = self._dpage.bitmap_resources
|
@@ -123,8 +167,13 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
123
167
|
# )
|
124
168
|
|
125
169
|
def unload(self):
|
170
|
+
if not self._unloaded and self._dp_doc is not None:
|
171
|
+
self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
|
172
|
+
self._unloaded = True
|
173
|
+
|
126
174
|
self._ppage = None
|
127
175
|
self._dpage = None
|
176
|
+
self._dp_doc = None
|
128
177
|
|
129
178
|
|
130
179
|
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
@@ -157,30 +206,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|
157
206
|
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
158
207
|
) -> DoclingParseV4PageBackend:
|
159
208
|
with pypdfium2_lock:
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
for tc in seg_page.textline_cells
|
170
|
-
]
|
171
|
-
[
|
172
|
-
tc.to_top_left_origin(seg_page.dimension.height)
|
173
|
-
for tc in seg_page.char_cells
|
174
|
-
]
|
175
|
-
[
|
176
|
-
tc.to_top_left_origin(seg_page.dimension.height)
|
177
|
-
for tc in seg_page.word_cells
|
178
|
-
]
|
179
|
-
|
180
|
-
return DoclingParseV4PageBackend(
|
181
|
-
seg_page,
|
182
|
-
self._pdoc[page_no],
|
183
|
-
)
|
209
|
+
ppage = self._pdoc[page_no]
|
210
|
+
|
211
|
+
return DoclingParseV4PageBackend(
|
212
|
+
dp_doc=self.dp_doc,
|
213
|
+
page_obj=ppage,
|
214
|
+
page_no=page_no,
|
215
|
+
create_words=create_words,
|
216
|
+
create_textlines=create_textlines,
|
217
|
+
)
|
184
218
|
|
185
219
|
def is_valid(self) -> bool:
|
186
220
|
return self.page_count() > 0
|