docling-core 2.2.1__py3-none-any.whl → 2.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/hierarchical_chunker.py +2 -2
- docling_core/types/doc/document.py +8 -3
- docling_core/types/doc/labels.py +12 -0
- {docling_core-2.2.1.dist-info → docling_core-2.2.3.dist-info}/METADATA +1 -1
- {docling_core-2.2.1.dist-info → docling_core-2.2.3.dist-info}/RECORD +8 -8
- {docling_core-2.2.1.dist-info → docling_core-2.2.3.dist-info}/LICENSE +0 -0
- {docling_core-2.2.1.dist-info → docling_core-2.2.3.dist-info}/WHEEL +0 -0
- {docling_core-2.2.1.dist-info → docling_core-2.2.3.dist-info}/entry_points.txt +0 -0
|
@@ -129,8 +129,8 @@ class HierarchicalChunker(BaseChunker):
|
|
|
129
129
|
table_df.index = table_df.index + 1
|
|
130
130
|
table_df = table_df.sort_index()
|
|
131
131
|
|
|
132
|
-
rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
|
|
133
|
-
cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
|
|
132
|
+
rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()]
|
|
133
|
+
cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()]
|
|
134
134
|
|
|
135
135
|
nrows = table_df.shape[0]
|
|
136
136
|
ncols = table_df.shape[1]
|
|
@@ -4,6 +4,7 @@ import base64
|
|
|
4
4
|
import mimetypes
|
|
5
5
|
import re
|
|
6
6
|
import sys
|
|
7
|
+
import textwrap
|
|
7
8
|
import typing
|
|
8
9
|
from io import BytesIO
|
|
9
10
|
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
@@ -1125,6 +1126,7 @@ class DoclingDocument(BaseModel):
|
|
|
1125
1126
|
image_placeholder: str = "<!-- image -->",
|
|
1126
1127
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
1127
1128
|
indent: int = 4,
|
|
1129
|
+
text_width: int = -1,
|
|
1128
1130
|
) -> str:
|
|
1129
1131
|
r"""Serialize to Markdown.
|
|
1130
1132
|
|
|
@@ -1207,8 +1209,8 @@ class DoclingDocument(BaseModel):
|
|
|
1207
1209
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
1208
1210
|
in_list = False
|
|
1209
1211
|
marker = "" if strict_text else "#"
|
|
1210
|
-
text = f"{marker} {item.text}\n"
|
|
1211
|
-
mdtexts.append(text.strip())
|
|
1212
|
+
text = f"{marker} {item.text}"
|
|
1213
|
+
mdtexts.append(text.strip() + "\n")
|
|
1212
1214
|
|
|
1213
1215
|
elif (
|
|
1214
1216
|
isinstance(item, TextItem)
|
|
@@ -1251,7 +1253,10 @@ class DoclingDocument(BaseModel):
|
|
|
1251
1253
|
|
|
1252
1254
|
elif isinstance(item, TextItem) and item.label in labels:
|
|
1253
1255
|
in_list = False
|
|
1254
|
-
if len(item.text):
|
|
1256
|
+
if len(item.text) and text_width > 0:
|
|
1257
|
+
wrapped_text = textwrap.fill(text, width=text_width)
|
|
1258
|
+
mdtexts.append(wrapped_text + "\n")
|
|
1259
|
+
elif len(item.text):
|
|
1255
1260
|
text = f"{item.text}\n"
|
|
1256
1261
|
mdtexts.append(text)
|
|
1257
1262
|
|
docling_core/types/doc/labels.py
CHANGED
|
@@ -29,6 +29,10 @@ class DocItemLabel(str, Enum):
|
|
|
29
29
|
PARAGRAPH = "paragraph" # explicitly a paragraph and not arbitrary text
|
|
30
30
|
REFERENCE = "reference"
|
|
31
31
|
|
|
32
|
+
def __str__(self):
|
|
33
|
+
"""Get string value."""
|
|
34
|
+
return str(self.value)
|
|
35
|
+
|
|
32
36
|
|
|
33
37
|
class GroupLabel(str, Enum):
|
|
34
38
|
"""GroupLabel."""
|
|
@@ -43,6 +47,10 @@ class GroupLabel(str, Enum):
|
|
|
43
47
|
SHEET = "sheet"
|
|
44
48
|
SLIDE = "slide"
|
|
45
49
|
|
|
50
|
+
def __str__(self):
|
|
51
|
+
"""Get string value."""
|
|
52
|
+
return str(self.value)
|
|
53
|
+
|
|
46
54
|
|
|
47
55
|
class TableCellLabel(str, Enum):
|
|
48
56
|
"""TableCellLabel."""
|
|
@@ -51,3 +59,7 @@ class TableCellLabel(str, Enum):
|
|
|
51
59
|
ROW_HEADER = "row_header"
|
|
52
60
|
ROW_SECTION = "row_section"
|
|
53
61
|
BODY = "body"
|
|
62
|
+
|
|
63
|
+
def __str__(self):
|
|
64
|
+
"""Get string value."""
|
|
65
|
+
return str(self.value)
|
|
@@ -16,13 +16,13 @@ docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75
|
|
|
16
16
|
docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
|
|
17
17
|
docling_core/transforms/chunker/__init__.py,sha256=sSSTnt7ZCt8Og1e0jhApNTtA0pyyHyzwcl8yXFLb2J8,292
|
|
18
18
|
docling_core/transforms/chunker/base.py,sha256=iPouZOJ3cYWvai4P0Gpd3QmsTKQuY5fFUXzTMk_XNmE,1571
|
|
19
|
-
docling_core/transforms/chunker/hierarchical_chunker.py,sha256
|
|
19
|
+
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=uG7nNoUCFqWeQAKydQg731JYJ9sayUe4J48nMF0VHE8,8097
|
|
20
20
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
21
21
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
22
22
|
docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
|
|
23
23
|
docling_core/types/doc/base.py,sha256=zvx631U_yQCcJam83hNdDanXEYnO3eN-CCw9vDr6S-I,4442
|
|
24
|
-
docling_core/types/doc/document.py,sha256=
|
|
25
|
-
docling_core/types/doc/labels.py,sha256=
|
|
24
|
+
docling_core/types/doc/document.py,sha256=B56FA5lGAEodjfIUncXSstQclAmyt3GOybMiKEEIc7s,52138
|
|
25
|
+
docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
|
|
26
26
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
27
27
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
28
28
|
docling_core/types/legacy_doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
|
|
@@ -49,8 +49,8 @@ docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6
|
|
|
49
49
|
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
50
50
|
docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
|
|
51
51
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
52
|
-
docling_core-2.2.
|
|
53
|
-
docling_core-2.2.
|
|
54
|
-
docling_core-2.2.
|
|
55
|
-
docling_core-2.2.
|
|
56
|
-
docling_core-2.2.
|
|
52
|
+
docling_core-2.2.3.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
53
|
+
docling_core-2.2.3.dist-info/METADATA,sha256=DlV-TrYKPq-qbI9d0iS4mrOJs_CwV9QZNflqGEy0crE,5432
|
|
54
|
+
docling_core-2.2.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
55
|
+
docling_core-2.2.3.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
|
|
56
|
+
docling_core-2.2.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|