docling 2.5.0__tar.gz → 2.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.5.0 → docling-2.5.1}/PKG-INFO +1 -1
- {docling-2.5.0 → docling-2.5.1}/docling/backend/msword_backend.py +8 -8
- {docling-2.5.0 → docling-2.5.1}/pyproject.toml +1 -1
- {docling-2.5.0 → docling-2.5.1}/LICENSE +0 -0
- {docling-2.5.0 → docling-2.5.1}/README.md +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/__init__.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/backend/__init__.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/backend/abstract_backend.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/backend/html_backend.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/backend/md_backend.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/backend/pdf_backend.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/cli/__init__.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/cli/main.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/datamodel/__init__.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/datamodel/base_models.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/datamodel/document.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/datamodel/settings.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/document_converter.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/__init__.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/base_model.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/base_ocr_model.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/ds_glm_model.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/easyocr_model.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/layout_model.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/page_assemble_model.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/table_structure_model.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/pipeline/__init__.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/utils/__init__.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/utils/export.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/utils/layout_utils.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/utils/profiling.py +0 -0
- {docling-2.5.0 → docling-2.5.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.5.0
|
3
|
+
Version: 2.5.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -130,7 +130,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
130
130
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
131
131
|
for element in body:
|
132
132
|
tag_name = etree.QName(element).localname
|
133
|
-
|
134
133
|
# Check for Inline Images (drawings or blip elements)
|
135
134
|
found_drawing = etree.ElementBase.xpath(
|
136
135
|
element, ".//w:drawing", namespaces=self.xml_namespaces
|
@@ -201,7 +200,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
201
200
|
label_str = ""
|
202
201
|
label_level = 0
|
203
202
|
if parts[0] == "Heading":
|
204
|
-
# print("{} - {}".format(parts[0], parts[1]))
|
205
203
|
label_str = parts[0]
|
206
204
|
label_level = self.str_to_int(parts[1], default=None)
|
207
205
|
if parts[1] == "Heading":
|
@@ -217,19 +215,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
217
215
|
if paragraph.text is None:
|
218
216
|
# _log.warn(f"paragraph has text==None")
|
219
217
|
return
|
220
|
-
|
221
218
|
text = paragraph.text.strip()
|
222
219
|
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
223
220
|
|
224
221
|
# Common styles for bullet and numbered lists.
|
225
222
|
# "List Bullet", "List Number", "List Paragraph"
|
226
|
-
#
|
223
|
+
# Identify wether list is a numbered list or not
|
227
224
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
228
225
|
is_numbered = False
|
229
|
-
|
230
226
|
p_style_name, p_level = self.get_label_and_level(paragraph)
|
231
227
|
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
232
|
-
# print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))
|
233
228
|
|
234
229
|
if numid == 0:
|
235
230
|
numid = None
|
@@ -450,8 +445,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
450
445
|
for row in table.rows:
|
451
446
|
# Calculate the max number of columns
|
452
447
|
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
453
|
-
|
454
|
-
|
448
|
+
|
449
|
+
if num_rows == 1 and num_cols == 1:
|
450
|
+
cell_element = table.rows[0].cells[0]
|
451
|
+
# In case we have a table of only 1 cell, we consider it furniture
|
452
|
+
# And proceed processing the content of the cell as though it's in the document body
|
453
|
+
self.walk_linear(cell_element._element, docx_obj, doc)
|
454
|
+
return
|
455
455
|
|
456
456
|
# Initialize the table grid
|
457
457
|
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.5.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.5.1" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|