docling 2.30.0__tar.gz → 2.31.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.30.0 → docling-2.31.1}/PKG-INFO +2 -1
- {docling-2.30.0 → docling-2.31.1}/README.md +1 -0
- {docling-2.30.0 → docling-2.31.1}/docling/backend/asciidoc_backend.py +7 -15
- {docling-2.30.0 → docling-2.31.1}/docling/backend/csv_backend.py +1 -1
- {docling-2.30.0 → docling-2.31.1}/docling/backend/docling_parse_backend.py +2 -2
- {docling-2.30.0 → docling-2.31.1}/docling/backend/docling_parse_v2_backend.py +2 -2
- {docling-2.30.0 → docling-2.31.1}/docling/backend/docling_parse_v4_backend.py +3 -4
- {docling-2.30.0 → docling-2.31.1}/docling/backend/docx/latex/latex_dict.py +0 -5
- {docling-2.30.0 → docling-2.31.1}/docling/backend/docx/latex/omml.py +4 -7
- {docling-2.30.0 → docling-2.31.1}/docling/backend/html_backend.py +66 -25
- {docling-2.30.0 → docling-2.31.1}/docling/backend/md_backend.py +6 -8
- {docling-2.30.0 → docling-2.31.1}/docling/backend/msexcel_backend.py +1 -7
- {docling-2.30.0 → docling-2.31.1}/docling/backend/mspowerpoint_backend.py +4 -7
- {docling-2.30.0 → docling-2.31.1}/docling/backend/msword_backend.py +5 -5
- {docling-2.30.0 → docling-2.31.1}/docling/backend/pdf_backend.py +2 -1
- {docling-2.30.0 → docling-2.31.1}/docling/backend/pypdfium2_backend.py +3 -3
- {docling-2.30.0 → docling-2.31.1}/docling/backend/xml/jats_backend.py +11 -14
- {docling-2.30.0 → docling-2.31.1}/docling/backend/xml/uspto_backend.py +19 -23
- {docling-2.30.0 → docling-2.31.1}/docling/cli/main.py +8 -8
- {docling-2.30.0 → docling-2.31.1}/docling/cli/models.py +6 -3
- {docling-2.30.0 → docling-2.31.1}/docling/datamodel/base_models.py +7 -5
- {docling-2.30.0 → docling-2.31.1}/docling/datamodel/document.py +19 -10
- {docling-2.30.0 → docling-2.31.1}/docling/datamodel/pipeline_options.py +0 -1
- {docling-2.30.0 → docling-2.31.1}/docling/document_converter.py +8 -6
- {docling-2.30.0 → docling-2.31.1}/docling/models/api_vlm_model.py +1 -2
- {docling-2.30.0 → docling-2.31.1}/docling/models/base_model.py +2 -4
- {docling-2.30.0 → docling-2.31.1}/docling/models/base_ocr_model.py +2 -2
- {docling-2.30.0 → docling-2.31.1}/docling/models/code_formula_model.py +2 -1
- {docling-2.30.0 → docling-2.31.1}/docling/models/document_picture_classifier.py +2 -1
- {docling-2.30.0 → docling-2.31.1}/docling/models/easyocr_model.py +10 -11
- {docling-2.30.0 → docling-2.31.1}/docling/models/factories/__init__.py +2 -2
- {docling-2.30.0 → docling-2.31.1}/docling/models/factories/base_factory.py +1 -1
- {docling-2.30.0 → docling-2.31.1}/docling/models/hf_mlx_model.py +4 -6
- {docling-2.30.0 → docling-2.31.1}/docling/models/hf_vlm_model.py +7 -5
- {docling-2.30.0 → docling-2.31.1}/docling/models/layout_model.py +2 -2
- {docling-2.30.0 → docling-2.31.1}/docling/models/ocr_mac_model.py +3 -4
- {docling-2.30.0 → docling-2.31.1}/docling/models/page_assemble_model.py +7 -12
- {docling-2.30.0 → docling-2.31.1}/docling/models/page_preprocessing_model.py +2 -1
- {docling-2.30.0 → docling-2.31.1}/docling/models/picture_description_api_model.py +2 -1
- {docling-2.30.0 → docling-2.31.1}/docling/models/picture_description_base_model.py +2 -3
- {docling-2.30.0 → docling-2.31.1}/docling/models/picture_description_vlm_model.py +6 -4
- {docling-2.30.0 → docling-2.31.1}/docling/models/rapid_ocr_model.py +2 -3
- {docling-2.30.0 → docling-2.31.1}/docling/models/readingorder_model.py +9 -24
- {docling-2.30.0 → docling-2.31.1}/docling/models/table_structure_model.py +4 -8
- {docling-2.30.0 → docling-2.31.1}/docling/models/tesseract_ocr_cli_model.py +17 -16
- {docling-2.30.0 → docling-2.31.1}/docling/models/tesseract_ocr_model.py +9 -5
- {docling-2.30.0 → docling-2.31.1}/docling/pipeline/base_pipeline.py +4 -8
- {docling-2.30.0 → docling-2.31.1}/docling/pipeline/simple_pipeline.py +0 -1
- {docling-2.30.0 → docling-2.31.1}/docling/pipeline/standard_pdf_pipeline.py +0 -1
- {docling-2.30.0 → docling-2.31.1}/docling/pipeline/vlm_pipeline.py +0 -3
- {docling-2.30.0 → docling-2.31.1}/docling/utils/export.py +2 -4
- {docling-2.30.0 → docling-2.31.1}/docling/utils/glm_utils.py +2 -2
- {docling-2.30.0 → docling-2.31.1}/docling/utils/layout_postprocessor.py +4 -2
- {docling-2.30.0 → docling-2.31.1}/docling/utils/model_downloader.py +31 -7
- {docling-2.30.0 → docling-2.31.1}/docling/utils/utils.py +3 -3
- {docling-2.30.0 → docling-2.31.1}/pyproject.toml +78 -12
- {docling-2.30.0 → docling-2.31.1}/LICENSE +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/backend/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/backend/abstract_backend.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/backend/docx/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/backend/json/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/backend/xml/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/chunking/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/cli/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/cli/tools.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/datamodel/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/datamodel/settings.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/exceptions.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/models/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/models/plugins/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/models/plugins/defaults.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/pipeline/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/py.typed +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/utils/__init__.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/utils/api_image_request.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/utils/locks.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/utils/ocr_utils.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/utils/profiling.py +0 -0
- {docling-2.30.0 → docling-2.31.1}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.30.0
|
3
|
+
Version: 2.31.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -86,6 +86,7 @@ Description-Content-Type: text/markdown
|
|
86
86
|
[](https://opensource.org/licenses/MIT)
|
87
87
|
[](https://pepy.tech/projects/docling)
|
88
88
|
[](https://apify.com/vancura/docling)
|
89
|
+
[](https://www.bestpractices.dev/projects/10101)
|
89
90
|
[](https://lfaidata.foundation/projects/)
|
90
91
|
|
91
92
|
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
@@ -22,6 +22,7 @@
|
|
22
22
|
[](https://opensource.org/licenses/MIT)
|
23
23
|
[](https://pepy.tech/projects/docling)
|
24
24
|
[](https://apify.com/vancura/docling)
|
25
|
+
[](https://www.bestpractices.dev/projects/10101)
|
25
26
|
[](https://lfaidata.foundation/projects/)
|
26
27
|
|
27
28
|
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
34
34
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
35
35
|
self.lines = text_stream.split("\n")
|
36
36
|
if isinstance(self.path_or_stream, Path):
|
37
|
-
with open(self.path_or_stream,
|
37
|
+
with open(self.path_or_stream, encoding="utf-8") as f:
|
38
38
|
self.lines = f.readlines()
|
39
39
|
self.valid = True
|
40
40
|
|
@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
75
75
|
|
76
76
|
return doc
|
77
77
|
|
78
|
-
def _parse(self, doc: DoclingDocument):
|
78
|
+
def _parse(self, doc: DoclingDocument): # noqa: C901
|
79
79
|
"""
|
80
80
|
Main function that orchestrates the parsing by yielding components:
|
81
81
|
title, section headers, text, lists, and tables.
|
82
82
|
"""
|
83
83
|
|
84
|
-
content = ""
|
85
|
-
|
86
84
|
in_list = False
|
87
85
|
in_table = False
|
88
86
|
|
@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
95
93
|
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
96
94
|
indents: dict[int, Union[GroupItem, None]] = {}
|
97
95
|
|
98
|
-
for i in range(
|
96
|
+
for i in range(10):
|
99
97
|
parents[i] = None
|
100
98
|
indents[i] = None
|
101
99
|
|
@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
125
123
|
|
126
124
|
# Lists
|
127
125
|
elif self._is_list_item(line):
|
128
|
-
|
129
126
|
_log.debug(f"line: {line}")
|
130
127
|
item = self._parse_list_item(line)
|
131
128
|
_log.debug(f"parsed list-item: {item}")
|
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
147
144
|
indents[level + 1] = item["indent"]
|
148
145
|
|
149
146
|
elif in_list and item["indent"] < indents[level]:
|
150
|
-
|
151
147
|
# print(item["indent"], " => ", indents[level])
|
152
148
|
while item["indent"] < indents[level]:
|
153
149
|
# print(item["indent"], " => ", indents[level])
|
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
176
172
|
elif in_table and (
|
177
173
|
(not self._is_table_line(line)) or line.strip() == "|==="
|
178
174
|
): # end of table
|
179
|
-
|
180
175
|
caption = None
|
181
176
|
if len(caption_data) > 0:
|
182
177
|
caption = doc.add_text(
|
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
195
190
|
|
196
191
|
# Picture
|
197
192
|
elif self._is_picture(line):
|
198
|
-
|
199
193
|
caption = None
|
200
194
|
if len(caption_data) > 0:
|
201
195
|
caption = doc.add_text(
|
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
250
244
|
text_data = []
|
251
245
|
|
252
246
|
elif len(line.strip()) > 0: # allow multiline texts
|
253
|
-
|
254
247
|
item = self._parse_text(line)
|
255
248
|
text_data.append(item["text"])
|
256
249
|
|
@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
273
266
|
|
274
267
|
def _get_current_level(self, parents):
|
275
268
|
for k, v in parents.items():
|
276
|
-
if v
|
269
|
+
if v is None and k > 0:
|
277
270
|
return k - 1
|
278
271
|
|
279
272
|
return 0
|
280
273
|
|
281
274
|
def _get_current_parent(self, parents):
|
282
275
|
for k, v in parents.items():
|
283
|
-
if v
|
276
|
+
if v is None and k > 0:
|
284
277
|
return parents[k - 1]
|
285
278
|
|
286
279
|
return None
|
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
328
321
|
"marker": marker,
|
329
322
|
"text": text.strip(),
|
330
323
|
"numbered": False,
|
331
|
-
"indent": 0 if indent
|
324
|
+
"indent": 0 if indent is None else len(indent),
|
332
325
|
}
|
333
326
|
else:
|
334
327
|
return {
|
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
336
329
|
"marker": marker,
|
337
330
|
"text": text.strip(),
|
338
331
|
"numbered": True,
|
339
|
-
"indent": 0 if indent
|
332
|
+
"indent": 0 if indent is None else len(indent),
|
340
333
|
}
|
341
334
|
else:
|
342
335
|
# Fallback if no match
|
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
357
350
|
return [cell.strip() for cell in line.split("|") if cell.strip()]
|
358
351
|
|
359
352
|
def _populate_table_as_grid(self, table_data):
|
360
|
-
|
361
353
|
num_rows = len(table_data)
|
362
354
|
|
363
355
|
# Adjust the table data into a grid format
|
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
|
58
58
|
head = self.content.readline()
|
59
59
|
dialect = csv.Sniffer().sniff(head, ",;\t|:")
|
60
60
|
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
|
61
|
-
if
|
61
|
+
if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
|
62
62
|
raise RuntimeError(
|
63
63
|
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
|
64
64
|
)
|
@@ -1,8 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from io import BytesIO
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
+
from typing import List, Optional, Union
|
6
7
|
|
7
8
|
import pypdfium2 as pdfium
|
8
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
156
157
|
def get_page_image(
|
157
158
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
158
159
|
) -> Image.Image:
|
159
|
-
|
160
160
|
page_size = self.get_size()
|
161
161
|
|
162
162
|
if not cropbox:
|
@@ -1,8 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from io import BytesIO
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import TYPE_CHECKING,
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
6
7
|
|
7
8
|
import pypdfium2 as pdfium
|
8
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
172
173
|
def get_page_image(
|
173
174
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
174
175
|
) -> Image.Image:
|
175
|
-
|
176
176
|
page_size = self.get_size()
|
177
177
|
|
178
178
|
if not cropbox:
|
@@ -1,14 +1,14 @@
|
|
1
1
|
import logging
|
2
|
-
import
|
2
|
+
from collections.abc import Iterable
|
3
3
|
from io import BytesIO
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import TYPE_CHECKING,
|
5
|
+
from typing import TYPE_CHECKING, Optional, Union
|
6
6
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
9
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
10
10
|
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
11
|
-
from PIL import Image
|
11
|
+
from PIL import Image
|
12
12
|
from pypdfium2 import PdfPage
|
13
13
|
|
14
14
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
93
93
|
def get_page_image(
|
94
94
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
95
95
|
) -> Image.Image:
|
96
|
-
|
97
96
|
page_size = self.get_size()
|
98
97
|
|
99
98
|
if not cropbox:
|
@@ -1,12 +1,8 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
1
|
"""
|
4
2
|
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
5
3
|
On 23/01/2025
|
6
4
|
"""
|
7
5
|
|
8
|
-
from __future__ import unicode_literals
|
9
|
-
|
10
6
|
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
|
11
7
|
|
12
8
|
BLANK = ""
|
@@ -79,7 +75,6 @@ CHR_BO = {
|
|
79
75
|
}
|
80
76
|
|
81
77
|
T = {
|
82
|
-
"\u2192": "\\rightarrow ",
|
83
78
|
# Greek letters
|
84
79
|
"\U0001d6fc": "\\alpha ",
|
85
80
|
"\U0001d6fd": "\\beta ",
|
@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
|
|
76
76
|
return default
|
77
77
|
|
78
78
|
|
79
|
-
class Tag2Method(object):
|
80
|
-
|
79
|
+
class Tag2Method:
|
81
80
|
def call_method(self, elm, stag=None):
|
82
81
|
getmethod = self.tag2meth.get
|
83
82
|
if stag is None:
|
@@ -130,7 +129,6 @@ class Tag2Method(object):
|
|
130
129
|
|
131
130
|
|
132
131
|
class Pr(Tag2Method):
|
133
|
-
|
134
132
|
text = ""
|
135
133
|
|
136
134
|
__val_tags = ("chr", "pos", "begChr", "endChr", "type")
|
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
|
|
159
157
|
def do_common(self, elm):
|
160
158
|
stag = elm.tag.replace(OMML_NS, "")
|
161
159
|
if stag in self.__val_tags:
|
162
|
-
t = elm.get("{
|
160
|
+
t = elm.get(f"{OMML_NS}val")
|
163
161
|
self.__innerdict[stag] = t
|
164
162
|
return None
|
165
163
|
|
@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
|
|
248
246
|
"""
|
249
247
|
the Pre-Sub-Superscript object -- Not support yet
|
250
248
|
"""
|
251
|
-
pass
|
252
249
|
|
253
250
|
def do_sub(self, elm):
|
254
251
|
text = self.process_children(elm)
|
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
|
|
331
328
|
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
332
329
|
latex_s = LIM_FUNC.get(t_dict["e"])
|
333
330
|
if not latex_s:
|
334
|
-
raise
|
331
|
+
raise RuntimeError("Not support lim {}".format(t_dict["e"]))
|
335
332
|
else:
|
336
333
|
return latex_s.format(lim=t_dict.get("lim"))
|
337
334
|
|
@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
|
|
413
410
|
"""
|
414
411
|
_str = []
|
415
412
|
_base_str = []
|
416
|
-
found_text = elm.findtext("./{
|
413
|
+
found_text = elm.findtext(f"./{OMML_NS}t")
|
417
414
|
if found_text:
|
418
415
|
for s in found_text:
|
419
416
|
out_latex_str = self.process_unicode(s)
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
import traceback
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Final, Optional, Union, cast
|
@@ -26,6 +27,8 @@ _log = logging.getLogger(__name__)
|
|
26
27
|
|
27
28
|
# tags that generate NodeItem elements
|
28
29
|
TAGS_FOR_NODE_ITEMS: Final = [
|
30
|
+
"address",
|
31
|
+
"details",
|
29
32
|
"h1",
|
30
33
|
"h2",
|
31
34
|
"h3",
|
@@ -38,6 +41,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
|
38
41
|
"ul",
|
39
42
|
"ol",
|
40
43
|
"li",
|
44
|
+
"summary",
|
41
45
|
"table",
|
42
46
|
"figure",
|
43
47
|
"img",
|
@@ -55,7 +59,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
55
59
|
self.max_levels = 10
|
56
60
|
self.level = 0
|
57
61
|
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
58
|
-
for i in range(
|
62
|
+
for i in range(self.max_levels):
|
59
63
|
self.parents[i] = None
|
60
64
|
|
61
65
|
try:
|
@@ -126,7 +130,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
126
130
|
return doc
|
127
131
|
|
128
132
|
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
129
|
-
|
130
133
|
# Iterate over elements in the body of the document
|
131
134
|
text: str = ""
|
132
135
|
for element in tag.children:
|
@@ -135,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
135
138
|
self.analyze_tag(cast(Tag, element), doc)
|
136
139
|
except Exception as exc_child:
|
137
140
|
_log.error(
|
138
|
-
f"Error processing child from tag {tag.name}
|
141
|
+
f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
|
139
142
|
)
|
140
143
|
raise exc_child
|
141
144
|
elif isinstance(element, NavigableString) and not isinstance(
|
@@ -147,7 +150,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
147
150
|
item for item in element.next_siblings if isinstance(item, Tag)
|
148
151
|
]
|
149
152
|
if element.next_sibling is None or any(
|
150
|
-
|
153
|
+
item.name in TAGS_FOR_NODE_ITEMS for item in siblings
|
151
154
|
):
|
152
155
|
text = text.strip()
|
153
156
|
if text and tag.name in ["div"]:
|
@@ -164,7 +167,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
164
167
|
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
165
168
|
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
166
169
|
self.handle_header(tag, doc)
|
167
|
-
elif tag.name in ["p"]:
|
170
|
+
elif tag.name in ["p", "address", "summary"]:
|
168
171
|
self.handle_paragraph(tag, doc)
|
169
172
|
elif tag.name in ["pre", "code"]:
|
170
173
|
self.handle_code(tag, doc)
|
@@ -178,6 +181,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
178
181
|
self.handle_figure(tag, doc)
|
179
182
|
elif tag.name == "img":
|
180
183
|
self.handle_image(tag, doc)
|
184
|
+
elif tag.name == "details":
|
185
|
+
self.handle_details(tag, doc)
|
181
186
|
else:
|
182
187
|
self.walk(tag, doc)
|
183
188
|
|
@@ -202,6 +207,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
202
207
|
|
203
208
|
return ["".join(result) + " "]
|
204
209
|
|
210
|
+
def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
|
211
|
+
"""Handle details tag (details) and its content."""
|
212
|
+
|
213
|
+
self.parents[self.level + 1] = doc.add_group(
|
214
|
+
name="details",
|
215
|
+
label=GroupLabel.SECTION,
|
216
|
+
parent=self.parents[self.level],
|
217
|
+
content_layer=self.content_layer,
|
218
|
+
)
|
219
|
+
|
220
|
+
self.level += 1
|
221
|
+
self.walk(element, doc)
|
222
|
+
self.parents[self.level + 1] = None
|
223
|
+
self.level -= 1
|
224
|
+
|
205
225
|
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
|
206
226
|
"""Handles header tags (h1, h2, etc.)."""
|
207
227
|
hlevel = int(element.name.replace("h", ""))
|
@@ -222,7 +242,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
222
242
|
)
|
223
243
|
else:
|
224
244
|
if hlevel > self.level:
|
225
|
-
|
226
245
|
# add invisible group
|
227
246
|
for i in range(self.level + 1, hlevel):
|
228
247
|
self.parents[i] = doc.add_group(
|
@@ -234,7 +253,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
234
253
|
self.level = hlevel
|
235
254
|
|
236
255
|
elif hlevel < self.level:
|
237
|
-
|
238
256
|
# remove the tail
|
239
257
|
for key in self.parents.keys():
|
240
258
|
if key > hlevel:
|
@@ -261,7 +279,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
261
279
|
)
|
262
280
|
|
263
281
|
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
264
|
-
"""Handles paragraph tags (p)."""
|
282
|
+
"""Handles paragraph tags (p) or equivalent ones."""
|
265
283
|
if element.text is None:
|
266
284
|
return
|
267
285
|
text = element.text.strip()
|
@@ -360,7 +378,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
360
378
|
marker = ""
|
361
379
|
enumerated = False
|
362
380
|
if parent_label == GroupLabel.ORDERED_LIST:
|
363
|
-
marker = f"{
|
381
|
+
marker = f"{index_in_list!s}."
|
364
382
|
enumerated = True
|
365
383
|
doc.add_list_item(
|
366
384
|
text=text,
|
@@ -373,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
373
391
|
_log.debug(f"list-item has no text: {element}")
|
374
392
|
|
375
393
|
@staticmethod
|
376
|
-
def parse_table_data(element: Tag) -> Optional[TableData]:
|
394
|
+
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
|
377
395
|
nested_tables = element.find("table")
|
378
396
|
if nested_tables is not None:
|
379
397
|
_log.debug("Skipping nested table.")
|
380
398
|
return None
|
381
399
|
|
382
|
-
#
|
383
|
-
num_rows =
|
384
|
-
|
385
|
-
# Find the number of columns (taking into account colspan)
|
400
|
+
# Find the number of rows and columns (taking into account spans)
|
401
|
+
num_rows = 0
|
386
402
|
num_cols = 0
|
387
403
|
for row in element("tr"):
|
388
404
|
col_count = 0
|
405
|
+
is_row_header = True
|
389
406
|
if not isinstance(row, Tag):
|
390
407
|
continue
|
391
408
|
for cell in row(["td", "th"]):
|
392
409
|
if not isinstance(row, Tag):
|
393
410
|
continue
|
394
|
-
|
411
|
+
cell_tag = cast(Tag, cell)
|
412
|
+
val = cell_tag.get("colspan", "1")
|
395
413
|
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
|
396
414
|
col_count += colspan
|
415
|
+
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
|
416
|
+
is_row_header = False
|
397
417
|
num_cols = max(num_cols, col_count)
|
418
|
+
if not is_row_header:
|
419
|
+
num_rows += 1
|
420
|
+
|
421
|
+
_log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
|
398
422
|
|
399
423
|
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
400
424
|
|
401
425
|
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
402
426
|
|
403
427
|
# Iterate over the rows in the table
|
404
|
-
|
428
|
+
start_row_span = 0
|
429
|
+
row_idx = -1
|
430
|
+
for row in element("tr"):
|
405
431
|
if not isinstance(row, Tag):
|
406
432
|
continue
|
407
433
|
|
408
434
|
# For each row, find all the column cells (both <td> and <th>)
|
409
435
|
cells = row(["td", "th"])
|
410
436
|
|
411
|
-
# Check if
|
437
|
+
# Check if cell is in a column header or row header
|
412
438
|
col_header = True
|
439
|
+
row_header = True
|
413
440
|
for html_cell in cells:
|
414
|
-
if isinstance(html_cell, Tag)
|
415
|
-
|
441
|
+
if isinstance(html_cell, Tag):
|
442
|
+
if html_cell.name == "td":
|
443
|
+
col_header = False
|
444
|
+
row_header = False
|
445
|
+
elif html_cell.get("rowspan") is None:
|
446
|
+
row_header = False
|
447
|
+
if not row_header:
|
448
|
+
row_idx += 1
|
449
|
+
start_row_span = 0
|
450
|
+
else:
|
451
|
+
start_row_span += 1
|
416
452
|
|
417
453
|
# Extract the text content of each cell
|
418
454
|
col_idx = 0
|
@@ -443,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
443
479
|
if isinstance(row_val, str) and row_val.isnumeric()
|
444
480
|
else 1
|
445
481
|
)
|
446
|
-
|
447
|
-
|
482
|
+
if row_header:
|
483
|
+
row_span -= 1
|
484
|
+
while (
|
485
|
+
col_idx < num_cols
|
486
|
+
and grid[row_idx + start_row_span][col_idx] is not None
|
487
|
+
):
|
448
488
|
col_idx += 1
|
449
|
-
for r in range(row_span):
|
489
|
+
for r in range(start_row_span, start_row_span + row_span):
|
450
490
|
for c in range(col_span):
|
451
|
-
|
491
|
+
if row_idx + r < num_rows and col_idx + c < num_cols:
|
492
|
+
grid[row_idx + r][col_idx + c] = text
|
452
493
|
|
453
494
|
table_cell = TableCell(
|
454
495
|
text=text,
|
455
496
|
row_span=row_span,
|
456
497
|
col_span=col_span,
|
457
|
-
start_row_offset_idx=row_idx,
|
458
|
-
end_row_offset_idx=row_idx + row_span,
|
498
|
+
start_row_offset_idx=start_row_span + row_idx,
|
499
|
+
end_row_offset_idx=start_row_span + row_idx + row_span,
|
459
500
|
start_col_offset_idx=col_idx,
|
460
501
|
end_col_offset_idx=col_idx + col_span,
|
461
502
|
column_header=col_header,
|
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
83
83
|
# otherwise they represent emphasis (bold or italic)
|
84
84
|
self.markdown = self._shorten_underscore_sequences(text_stream)
|
85
85
|
if isinstance(self.path_or_stream, Path):
|
86
|
-
with open(self.path_or_stream,
|
86
|
+
with open(self.path_or_stream, encoding="utf-8") as f:
|
87
87
|
md_content = f.read()
|
88
88
|
# remove invalid sequences
|
89
89
|
# very long sequences of underscores will lead to unnecessary long processing times.
|
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
168
168
|
)
|
169
169
|
self.inline_texts = []
|
170
170
|
|
171
|
-
def _iterate_elements(
|
171
|
+
def _iterate_elements( # noqa: C901
|
172
172
|
self,
|
173
173
|
element: marko.element.Element,
|
174
174
|
depth: int,
|
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
176
176
|
visited: Set[marko.element.Element],
|
177
177
|
parent_item: Optional[NodeItem] = None,
|
178
178
|
):
|
179
|
-
|
180
179
|
if element in visited:
|
181
180
|
return
|
182
181
|
|
@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
236
235
|
if has_non_empty_list_items:
|
237
236
|
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
238
237
|
parent_item = doc.add_group(
|
239
|
-
label=label, name=
|
238
|
+
label=label, name="list", parent=parent_item
|
240
239
|
)
|
241
240
|
|
242
241
|
elif (
|
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
320
319
|
self._html_blocks += 1
|
321
320
|
self._process_inline_text(parent_item, doc)
|
322
321
|
self._close_table(doc)
|
323
|
-
_log.debug("HTML Block: {}"
|
322
|
+
_log.debug(f"HTML Block: {element}")
|
324
323
|
if (
|
325
324
|
len(element.body) > 0
|
326
325
|
): # If Marko doesn't return any content for HTML block, skip it
|
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
332
331
|
else:
|
333
332
|
if not isinstance(element, str):
|
334
333
|
self._close_table(doc)
|
335
|
-
_log.debug("Some other element: {}"
|
334
|
+
_log.debug(f"Some other element: {element}")
|
336
335
|
|
337
336
|
processed_block_types = (
|
338
337
|
marko.block.Heading,
|
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
398
397
|
|
399
398
|
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
400
399
|
if self._html_blocks > 0:
|
401
|
-
|
402
400
|
# export to HTML
|
403
401
|
html_backend_cls = HTMLDocumentBackend
|
404
402
|
html_str = doc.export_to_html()
|
@@ -411,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
411
409
|
)
|
412
410
|
return _txt
|
413
411
|
|
414
|
-
# restore original HTML by removing
|
412
|
+
# restore original HTML by removing previously added markers
|
415
413
|
for regex in [
|
416
414
|
rf"<pre>\s*<code>\s*{_START_MARKER}",
|
417
415
|
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
|
@@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
184
184
|
"""
|
185
185
|
|
186
186
|
if self.workbook is not None:
|
187
|
-
|
188
187
|
# Iterate over all sheets
|
189
188
|
for sheet_name in self.workbook.sheetnames:
|
190
189
|
_log.info(f"Processing sheet: {sheet_name}")
|
@@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
253
252
|
)
|
254
253
|
|
255
254
|
for excel_cell in excel_table.data:
|
256
|
-
|
257
255
|
cell = TableCell(
|
258
256
|
text=excel_cell.text,
|
259
257
|
row_span=excel_cell.row_span,
|
@@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
303
301
|
# Iterate over all cells in the sheet
|
304
302
|
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
305
303
|
for rj, cell in enumerate(row):
|
306
|
-
|
307
304
|
# Skip empty or already visited cells
|
308
305
|
if cell.value is None or (ri, rj) in visited:
|
309
306
|
continue
|
@@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
342
339
|
visited_cells: set[tuple[int, int]] = set()
|
343
340
|
for ri in range(start_row, max_row + 1):
|
344
341
|
for rj in range(start_col, max_col + 1):
|
345
|
-
|
346
342
|
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
|
347
343
|
|
348
344
|
# Check if the cell belongs to a merged range
|
@@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
350
346
|
col_span = 1
|
351
347
|
|
352
348
|
for merged_range in sheet.merged_cells.ranges:
|
353
|
-
|
354
349
|
if (
|
355
350
|
merged_range.min_row <= ri + 1
|
356
351
|
and ri + 1 <= merged_range.max_row
|
357
352
|
and merged_range.min_col <= rj + 1
|
358
353
|
and rj + 1 <= merged_range.max_col
|
359
354
|
):
|
360
|
-
|
361
355
|
row_span = merged_range.max_row - merged_range.min_row + 1
|
362
356
|
col_span = merged_range.max_col - merged_range.min_col + 1
|
363
357
|
break
|
@@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
499
493
|
),
|
500
494
|
),
|
501
495
|
)
|
502
|
-
except:
|
496
|
+
except Exception:
|
503
497
|
_log.error("could not extract the image from excel sheets")
|
504
498
|
|
505
499
|
return doc
|