docling 2.29.0__tar.gz → 2.31.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.29.0 → docling-2.31.0}/PKG-INFO +4 -3
- {docling-2.29.0 → docling-2.31.0}/README.md +1 -0
- {docling-2.29.0 → docling-2.31.0}/docling/backend/asciidoc_backend.py +7 -15
- {docling-2.29.0 → docling-2.31.0}/docling/backend/csv_backend.py +1 -1
- {docling-2.29.0 → docling-2.31.0}/docling/backend/docling_parse_backend.py +2 -2
- {docling-2.29.0 → docling-2.31.0}/docling/backend/docling_parse_v2_backend.py +2 -2
- {docling-2.29.0 → docling-2.31.0}/docling/backend/docling_parse_v4_backend.py +3 -4
- {docling-2.29.0 → docling-2.31.0}/docling/backend/docx/latex/latex_dict.py +0 -5
- {docling-2.29.0 → docling-2.31.0}/docling/backend/docx/latex/omml.py +4 -7
- {docling-2.29.0 → docling-2.31.0}/docling/backend/html_backend.py +26 -9
- {docling-2.29.0 → docling-2.31.0}/docling/backend/md_backend.py +5 -7
- docling-2.31.0/docling/backend/msexcel_backend.py +519 -0
- {docling-2.29.0 → docling-2.31.0}/docling/backend/mspowerpoint_backend.py +4 -7
- {docling-2.29.0 → docling-2.31.0}/docling/backend/msword_backend.py +23 -15
- {docling-2.29.0 → docling-2.31.0}/docling/backend/pdf_backend.py +2 -1
- {docling-2.29.0 → docling-2.31.0}/docling/backend/pypdfium2_backend.py +3 -3
- {docling-2.29.0 → docling-2.31.0}/docling/backend/xml/jats_backend.py +10 -13
- {docling-2.29.0 → docling-2.31.0}/docling/backend/xml/uspto_backend.py +15 -19
- {docling-2.29.0 → docling-2.31.0}/docling/cli/main.py +27 -9
- {docling-2.29.0 → docling-2.31.0}/docling/cli/models.py +2 -3
- {docling-2.29.0 → docling-2.31.0}/docling/datamodel/base_models.py +40 -5
- {docling-2.29.0 → docling-2.31.0}/docling/datamodel/document.py +18 -10
- {docling-2.29.0 → docling-2.31.0}/docling/datamodel/pipeline_options.py +29 -4
- {docling-2.29.0 → docling-2.31.0}/docling/document_converter.py +5 -5
- docling-2.31.0/docling/models/api_vlm_model.py +66 -0
- {docling-2.29.0 → docling-2.31.0}/docling/models/base_model.py +2 -4
- {docling-2.29.0 → docling-2.31.0}/docling/models/base_ocr_model.py +2 -2
- {docling-2.29.0 → docling-2.31.0}/docling/models/code_formula_model.py +2 -1
- {docling-2.29.0 → docling-2.31.0}/docling/models/document_picture_classifier.py +2 -1
- {docling-2.29.0 → docling-2.31.0}/docling/models/easyocr_model.py +10 -11
- {docling-2.29.0 → docling-2.31.0}/docling/models/factories/__init__.py +2 -2
- {docling-2.29.0 → docling-2.31.0}/docling/models/factories/base_factory.py +1 -1
- {docling-2.29.0 → docling-2.31.0}/docling/models/hf_mlx_model.py +4 -6
- {docling-2.29.0 → docling-2.31.0}/docling/models/hf_vlm_model.py +7 -5
- {docling-2.29.0 → docling-2.31.0}/docling/models/layout_model.py +2 -2
- {docling-2.29.0 → docling-2.31.0}/docling/models/ocr_mac_model.py +3 -4
- {docling-2.29.0 → docling-2.31.0}/docling/models/page_assemble_model.py +7 -12
- {docling-2.29.0 → docling-2.31.0}/docling/models/page_preprocessing_model.py +2 -1
- docling-2.31.0/docling/models/picture_description_api_model.py +59 -0
- {docling-2.29.0 → docling-2.31.0}/docling/models/picture_description_base_model.py +16 -5
- {docling-2.29.0 → docling-2.31.0}/docling/models/picture_description_vlm_model.py +2 -3
- {docling-2.29.0 → docling-2.31.0}/docling/models/rapid_ocr_model.py +2 -3
- {docling-2.29.0 → docling-2.31.0}/docling/models/readingorder_model.py +8 -23
- {docling-2.29.0 → docling-2.31.0}/docling/models/table_structure_model.py +2 -6
- {docling-2.29.0 → docling-2.31.0}/docling/models/tesseract_ocr_cli_model.py +17 -16
- {docling-2.29.0 → docling-2.31.0}/docling/models/tesseract_ocr_model.py +8 -6
- {docling-2.29.0 → docling-2.31.0}/docling/pipeline/base_pipeline.py +4 -8
- {docling-2.29.0 → docling-2.31.0}/docling/pipeline/simple_pipeline.py +0 -1
- {docling-2.29.0 → docling-2.31.0}/docling/pipeline/standard_pdf_pipeline.py +6 -3
- {docling-2.29.0 → docling-2.31.0}/docling/pipeline/vlm_pipeline.py +27 -20
- docling-2.31.0/docling/utils/api_image_request.py +61 -0
- {docling-2.29.0 → docling-2.31.0}/docling/utils/export.py +2 -4
- {docling-2.29.0 → docling-2.31.0}/docling/utils/glm_utils.py +2 -2
- {docling-2.29.0 → docling-2.31.0}/docling/utils/layout_postprocessor.py +4 -2
- {docling-2.29.0 → docling-2.31.0}/docling/utils/model_downloader.py +7 -7
- {docling-2.29.0 → docling-2.31.0}/docling/utils/utils.py +1 -1
- {docling-2.29.0 → docling-2.31.0}/pyproject.toml +80 -14
- docling-2.29.0/docling/backend/msexcel_backend.py +0 -343
- docling-2.29.0/docling/models/picture_description_api_model.py +0 -125
- {docling-2.29.0 → docling-2.31.0}/LICENSE +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/backend/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/chunking/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/cli/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/cli/tools.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/datamodel/settings.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/exceptions.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/models/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/py.typed +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/utils/__init__.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/utils/locks.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/utils/profiling.py +0 -0
- {docling-2.29.0 → docling-2.31.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.31.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -28,7 +28,7 @@ Provides-Extra: vlm
|
|
28
28
|
Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
|
29
29
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
30
30
|
Requires-Dist: certifi (>=2024.7.4)
|
31
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
31
|
+
Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
|
32
32
|
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
33
33
|
Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
|
34
34
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -58,7 +58,7 @@ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
|
58
58
|
Requires-Dist: tqdm (>=4.65.0,<5.0.0)
|
59
59
|
Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
|
60
60
|
Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
|
61
|
-
Requires-Dist: typer (>=0.12.5,<0.
|
61
|
+
Requires-Dist: typer (>=0.12.5,<0.16.0)
|
62
62
|
Project-URL: Repository, https://github.com/docling-project/docling
|
63
63
|
Description-Content-Type: text/markdown
|
64
64
|
|
@@ -86,6 +86,7 @@ Description-Content-Type: text/markdown
|
|
86
86
|
[](https://opensource.org/licenses/MIT)
|
87
87
|
[](https://pepy.tech/projects/docling)
|
88
88
|
[](https://apify.com/vancura/docling)
|
89
|
+
[](https://www.bestpractices.dev/projects/10101)
|
89
90
|
[](https://lfaidata.foundation/projects/)
|
90
91
|
|
91
92
|
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
@@ -22,6 +22,7 @@
|
|
22
22
|
[](https://opensource.org/licenses/MIT)
|
23
23
|
[](https://pepy.tech/projects/docling)
|
24
24
|
[](https://apify.com/vancura/docling)
|
25
|
+
[](https://www.bestpractices.dev/projects/10101)
|
25
26
|
[](https://lfaidata.foundation/projects/)
|
26
27
|
|
27
28
|
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
34
34
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
35
35
|
self.lines = text_stream.split("\n")
|
36
36
|
if isinstance(self.path_or_stream, Path):
|
37
|
-
with open(self.path_or_stream,
|
37
|
+
with open(self.path_or_stream, encoding="utf-8") as f:
|
38
38
|
self.lines = f.readlines()
|
39
39
|
self.valid = True
|
40
40
|
|
@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
75
75
|
|
76
76
|
return doc
|
77
77
|
|
78
|
-
def _parse(self, doc: DoclingDocument):
|
78
|
+
def _parse(self, doc: DoclingDocument): # noqa: C901
|
79
79
|
"""
|
80
80
|
Main function that orchestrates the parsing by yielding components:
|
81
81
|
title, section headers, text, lists, and tables.
|
82
82
|
"""
|
83
83
|
|
84
|
-
content = ""
|
85
|
-
|
86
84
|
in_list = False
|
87
85
|
in_table = False
|
88
86
|
|
@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
95
93
|
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
96
94
|
indents: dict[int, Union[GroupItem, None]] = {}
|
97
95
|
|
98
|
-
for i in range(
|
96
|
+
for i in range(10):
|
99
97
|
parents[i] = None
|
100
98
|
indents[i] = None
|
101
99
|
|
@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
125
123
|
|
126
124
|
# Lists
|
127
125
|
elif self._is_list_item(line):
|
128
|
-
|
129
126
|
_log.debug(f"line: {line}")
|
130
127
|
item = self._parse_list_item(line)
|
131
128
|
_log.debug(f"parsed list-item: {item}")
|
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
147
144
|
indents[level + 1] = item["indent"]
|
148
145
|
|
149
146
|
elif in_list and item["indent"] < indents[level]:
|
150
|
-
|
151
147
|
# print(item["indent"], " => ", indents[level])
|
152
148
|
while item["indent"] < indents[level]:
|
153
149
|
# print(item["indent"], " => ", indents[level])
|
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
176
172
|
elif in_table and (
|
177
173
|
(not self._is_table_line(line)) or line.strip() == "|==="
|
178
174
|
): # end of table
|
179
|
-
|
180
175
|
caption = None
|
181
176
|
if len(caption_data) > 0:
|
182
177
|
caption = doc.add_text(
|
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
195
190
|
|
196
191
|
# Picture
|
197
192
|
elif self._is_picture(line):
|
198
|
-
|
199
193
|
caption = None
|
200
194
|
if len(caption_data) > 0:
|
201
195
|
caption = doc.add_text(
|
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
250
244
|
text_data = []
|
251
245
|
|
252
246
|
elif len(line.strip()) > 0: # allow multiline texts
|
253
|
-
|
254
247
|
item = self._parse_text(line)
|
255
248
|
text_data.append(item["text"])
|
256
249
|
|
@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
273
266
|
|
274
267
|
def _get_current_level(self, parents):
|
275
268
|
for k, v in parents.items():
|
276
|
-
if v
|
269
|
+
if v is None and k > 0:
|
277
270
|
return k - 1
|
278
271
|
|
279
272
|
return 0
|
280
273
|
|
281
274
|
def _get_current_parent(self, parents):
|
282
275
|
for k, v in parents.items():
|
283
|
-
if v
|
276
|
+
if v is None and k > 0:
|
284
277
|
return parents[k - 1]
|
285
278
|
|
286
279
|
return None
|
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
328
321
|
"marker": marker,
|
329
322
|
"text": text.strip(),
|
330
323
|
"numbered": False,
|
331
|
-
"indent": 0 if indent
|
324
|
+
"indent": 0 if indent is None else len(indent),
|
332
325
|
}
|
333
326
|
else:
|
334
327
|
return {
|
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
336
329
|
"marker": marker,
|
337
330
|
"text": text.strip(),
|
338
331
|
"numbered": True,
|
339
|
-
"indent": 0 if indent
|
332
|
+
"indent": 0 if indent is None else len(indent),
|
340
333
|
}
|
341
334
|
else:
|
342
335
|
# Fallback if no match
|
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
357
350
|
return [cell.strip() for cell in line.split("|") if cell.strip()]
|
358
351
|
|
359
352
|
def _populate_table_as_grid(self, table_data):
|
360
|
-
|
361
353
|
num_rows = len(table_data)
|
362
354
|
|
363
355
|
# Adjust the table data into a grid format
|
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
|
58
58
|
head = self.content.readline()
|
59
59
|
dialect = csv.Sniffer().sniff(head, ",;\t|:")
|
60
60
|
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
|
61
|
-
if
|
61
|
+
if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
|
62
62
|
raise RuntimeError(
|
63
63
|
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
|
64
64
|
)
|
@@ -1,8 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from io import BytesIO
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
+
from typing import List, Optional, Union
|
6
7
|
|
7
8
|
import pypdfium2 as pdfium
|
8
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
156
157
|
def get_page_image(
|
157
158
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
158
159
|
) -> Image.Image:
|
159
|
-
|
160
160
|
page_size = self.get_size()
|
161
161
|
|
162
162
|
if not cropbox:
|
@@ -1,8 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from io import BytesIO
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import TYPE_CHECKING,
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
6
7
|
|
7
8
|
import pypdfium2 as pdfium
|
8
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
172
173
|
def get_page_image(
|
173
174
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
174
175
|
) -> Image.Image:
|
175
|
-
|
176
176
|
page_size = self.get_size()
|
177
177
|
|
178
178
|
if not cropbox:
|
@@ -1,14 +1,14 @@
|
|
1
1
|
import logging
|
2
|
-
import
|
2
|
+
from collections.abc import Iterable
|
3
3
|
from io import BytesIO
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import TYPE_CHECKING,
|
5
|
+
from typing import TYPE_CHECKING, Optional, Union
|
6
6
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
9
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
10
10
|
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
11
|
-
from PIL import Image
|
11
|
+
from PIL import Image
|
12
12
|
from pypdfium2 import PdfPage
|
13
13
|
|
14
14
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
93
93
|
def get_page_image(
|
94
94
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
95
95
|
) -> Image.Image:
|
96
|
-
|
97
96
|
page_size = self.get_size()
|
98
97
|
|
99
98
|
if not cropbox:
|
@@ -1,12 +1,8 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
1
|
"""
|
4
2
|
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
5
3
|
On 23/01/2025
|
6
4
|
"""
|
7
5
|
|
8
|
-
from __future__ import unicode_literals
|
9
|
-
|
10
6
|
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
|
11
7
|
|
12
8
|
BLANK = ""
|
@@ -79,7 +75,6 @@ CHR_BO = {
|
|
79
75
|
}
|
80
76
|
|
81
77
|
T = {
|
82
|
-
"\u2192": "\\rightarrow ",
|
83
78
|
# Greek letters
|
84
79
|
"\U0001d6fc": "\\alpha ",
|
85
80
|
"\U0001d6fd": "\\beta ",
|
@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
|
|
76
76
|
return default
|
77
77
|
|
78
78
|
|
79
|
-
class Tag2Method
|
80
|
-
|
79
|
+
class Tag2Method:
|
81
80
|
def call_method(self, elm, stag=None):
|
82
81
|
getmethod = self.tag2meth.get
|
83
82
|
if stag is None:
|
@@ -130,7 +129,6 @@ class Tag2Method(object):
|
|
130
129
|
|
131
130
|
|
132
131
|
class Pr(Tag2Method):
|
133
|
-
|
134
132
|
text = ""
|
135
133
|
|
136
134
|
__val_tags = ("chr", "pos", "begChr", "endChr", "type")
|
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
|
|
159
157
|
def do_common(self, elm):
|
160
158
|
stag = elm.tag.replace(OMML_NS, "")
|
161
159
|
if stag in self.__val_tags:
|
162
|
-
t = elm.get("{
|
160
|
+
t = elm.get(f"{OMML_NS}val")
|
163
161
|
self.__innerdict[stag] = t
|
164
162
|
return None
|
165
163
|
|
@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
|
|
248
246
|
"""
|
249
247
|
the Pre-Sub-Superscript object -- Not support yet
|
250
248
|
"""
|
251
|
-
pass
|
252
249
|
|
253
250
|
def do_sub(self, elm):
|
254
251
|
text = self.process_children(elm)
|
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
|
|
331
328
|
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
332
329
|
latex_s = LIM_FUNC.get(t_dict["e"])
|
333
330
|
if not latex_s:
|
334
|
-
raise
|
331
|
+
raise RuntimeError("Not support lim {}".format(t_dict["e"]))
|
335
332
|
else:
|
336
333
|
return latex_s.format(lim=t_dict.get("lim"))
|
337
334
|
|
@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
|
|
413
410
|
"""
|
414
411
|
_str = []
|
415
412
|
_base_str = []
|
416
|
-
found_text = elm.findtext("./{
|
413
|
+
found_text = elm.findtext(f"./{OMML_NS}t")
|
417
414
|
if found_text:
|
418
415
|
for s in found_text:
|
419
416
|
out_latex_str = self.process_unicode(s)
|
@@ -26,6 +26,8 @@ _log = logging.getLogger(__name__)
|
|
26
26
|
|
27
27
|
# tags that generate NodeItem elements
|
28
28
|
TAGS_FOR_NODE_ITEMS: Final = [
|
29
|
+
"address",
|
30
|
+
"details",
|
29
31
|
"h1",
|
30
32
|
"h2",
|
31
33
|
"h3",
|
@@ -38,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
|
38
40
|
"ul",
|
39
41
|
"ol",
|
40
42
|
"li",
|
43
|
+
"summary",
|
41
44
|
"table",
|
42
45
|
"figure",
|
43
46
|
"img",
|
@@ -55,7 +58,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
55
58
|
self.max_levels = 10
|
56
59
|
self.level = 0
|
57
60
|
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
58
|
-
for i in range(
|
61
|
+
for i in range(self.max_levels):
|
59
62
|
self.parents[i] = None
|
60
63
|
|
61
64
|
try:
|
@@ -126,7 +129,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
126
129
|
return doc
|
127
130
|
|
128
131
|
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
129
|
-
|
130
132
|
# Iterate over elements in the body of the document
|
131
133
|
text: str = ""
|
132
134
|
for element in tag.children:
|
@@ -135,7 +137,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
135
137
|
self.analyze_tag(cast(Tag, element), doc)
|
136
138
|
except Exception as exc_child:
|
137
139
|
_log.error(
|
138
|
-
f"Error processing child from tag {tag.name}: {
|
140
|
+
f"Error processing child from tag {tag.name}: {exc_child!r}"
|
139
141
|
)
|
140
142
|
raise exc_child
|
141
143
|
elif isinstance(element, NavigableString) and not isinstance(
|
@@ -147,7 +149,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
147
149
|
item for item in element.next_siblings if isinstance(item, Tag)
|
148
150
|
]
|
149
151
|
if element.next_sibling is None or any(
|
150
|
-
|
152
|
+
item.name in TAGS_FOR_NODE_ITEMS for item in siblings
|
151
153
|
):
|
152
154
|
text = text.strip()
|
153
155
|
if text and tag.name in ["div"]:
|
@@ -164,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
164
166
|
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
165
167
|
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
166
168
|
self.handle_header(tag, doc)
|
167
|
-
elif tag.name in ["p"]:
|
169
|
+
elif tag.name in ["p", "address", "summary"]:
|
168
170
|
self.handle_paragraph(tag, doc)
|
169
171
|
elif tag.name in ["pre", "code"]:
|
170
172
|
self.handle_code(tag, doc)
|
@@ -178,6 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
178
180
|
self.handle_figure(tag, doc)
|
179
181
|
elif tag.name == "img":
|
180
182
|
self.handle_image(tag, doc)
|
183
|
+
elif tag.name == "details":
|
184
|
+
self.handle_details(tag, doc)
|
181
185
|
else:
|
182
186
|
self.walk(tag, doc)
|
183
187
|
|
@@ -202,6 +206,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
202
206
|
|
203
207
|
return ["".join(result) + " "]
|
204
208
|
|
209
|
+
def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
|
210
|
+
"""Handle details tag (details) and its content."""
|
211
|
+
|
212
|
+
self.parents[self.level + 1] = doc.add_group(
|
213
|
+
name="details",
|
214
|
+
label=GroupLabel.SECTION,
|
215
|
+
parent=self.parents[self.level],
|
216
|
+
content_layer=self.content_layer,
|
217
|
+
)
|
218
|
+
|
219
|
+
self.level += 1
|
220
|
+
self.walk(element, doc)
|
221
|
+
self.parents[self.level + 1] = None
|
222
|
+
self.level -= 1
|
223
|
+
|
205
224
|
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
|
206
225
|
"""Handles header tags (h1, h2, etc.)."""
|
207
226
|
hlevel = int(element.name.replace("h", ""))
|
@@ -222,7 +241,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
222
241
|
)
|
223
242
|
else:
|
224
243
|
if hlevel > self.level:
|
225
|
-
|
226
244
|
# add invisible group
|
227
245
|
for i in range(self.level + 1, hlevel):
|
228
246
|
self.parents[i] = doc.add_group(
|
@@ -234,7 +252,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
234
252
|
self.level = hlevel
|
235
253
|
|
236
254
|
elif hlevel < self.level:
|
237
|
-
|
238
255
|
# remove the tail
|
239
256
|
for key in self.parents.keys():
|
240
257
|
if key > hlevel:
|
@@ -261,7 +278,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
261
278
|
)
|
262
279
|
|
263
280
|
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
264
|
-
"""Handles paragraph tags (p)."""
|
281
|
+
"""Handles paragraph tags (p) or equivalent ones."""
|
265
282
|
if element.text is None:
|
266
283
|
return
|
267
284
|
text = element.text.strip()
|
@@ -360,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
360
377
|
marker = ""
|
361
378
|
enumerated = False
|
362
379
|
if parent_label == GroupLabel.ORDERED_LIST:
|
363
|
-
marker = f"{
|
380
|
+
marker = f"{index_in_list!s}."
|
364
381
|
enumerated = True
|
365
382
|
doc.add_list_item(
|
366
383
|
text=text,
|
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
83
83
|
# otherwise they represent emphasis (bold or italic)
|
84
84
|
self.markdown = self._shorten_underscore_sequences(text_stream)
|
85
85
|
if isinstance(self.path_or_stream, Path):
|
86
|
-
with open(self.path_or_stream,
|
86
|
+
with open(self.path_or_stream, encoding="utf-8") as f:
|
87
87
|
md_content = f.read()
|
88
88
|
# remove invalid sequences
|
89
89
|
# very long sequences of underscores will lead to unnecessary long processing times.
|
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
168
168
|
)
|
169
169
|
self.inline_texts = []
|
170
170
|
|
171
|
-
def _iterate_elements(
|
171
|
+
def _iterate_elements( # noqa: C901
|
172
172
|
self,
|
173
173
|
element: marko.element.Element,
|
174
174
|
depth: int,
|
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
176
176
|
visited: Set[marko.element.Element],
|
177
177
|
parent_item: Optional[NodeItem] = None,
|
178
178
|
):
|
179
|
-
|
180
179
|
if element in visited:
|
181
180
|
return
|
182
181
|
|
@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
236
235
|
if has_non_empty_list_items:
|
237
236
|
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
238
237
|
parent_item = doc.add_group(
|
239
|
-
label=label, name=
|
238
|
+
label=label, name="list", parent=parent_item
|
240
239
|
)
|
241
240
|
|
242
241
|
elif (
|
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
320
319
|
self._html_blocks += 1
|
321
320
|
self._process_inline_text(parent_item, doc)
|
322
321
|
self._close_table(doc)
|
323
|
-
_log.debug("HTML Block: {}"
|
322
|
+
_log.debug(f"HTML Block: {element}")
|
324
323
|
if (
|
325
324
|
len(element.body) > 0
|
326
325
|
): # If Marko doesn't return any content for HTML block, skip it
|
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
332
331
|
else:
|
333
332
|
if not isinstance(element, str):
|
334
333
|
self._close_table(doc)
|
335
|
-
_log.debug("Some other element: {}"
|
334
|
+
_log.debug(f"Some other element: {element}")
|
336
335
|
|
337
336
|
processed_block_types = (
|
338
337
|
marko.block.Heading,
|
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
398
397
|
|
399
398
|
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
400
399
|
if self._html_blocks > 0:
|
401
|
-
|
402
400
|
# export to HTML
|
403
401
|
html_backend_cls = HTMLDocumentBackend
|
404
402
|
html_str = doc.export_to_html()
|