docling-core 2.24.1__py3-none-any.whl → 2.26.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release: this version of docling-core has been flagged as potentially problematic.
- docling_core/experimental/serializer/base.py +23 -2
- docling_core/experimental/serializer/common.py +79 -34
- docling_core/experimental/serializer/doctags.py +83 -47
- docling_core/experimental/serializer/html.py +931 -0
- docling_core/experimental/serializer/html_styles.py +212 -0
- docling_core/experimental/serializer/markdown.py +95 -57
- docling_core/transforms/chunker/base.py +8 -2
- docling_core/transforms/chunker/hierarchical_chunker.py +130 -109
- docling_core/transforms/chunker/hybrid_chunker.py +54 -12
- docling_core/types/doc/base.py +4 -1
- docling_core/types/doc/document.py +738 -490
- docling_core/types/doc/labels.py +2 -0
- docling_core/types/doc/page.py +12 -17
- docling_core/types/doc/tokens.py +3 -0
- {docling_core-2.24.1.dist-info → docling_core-2.26.0.dist-info}/METADATA +1 -1
- {docling_core-2.24.1.dist-info → docling_core-2.26.0.dist-info}/RECORD +19 -17
- {docling_core-2.24.1.dist-info → docling_core-2.26.0.dist-info}/LICENSE +0 -0
- {docling_core-2.24.1.dist-info → docling_core-2.26.0.dist-info}/WHEEL +0 -0
- {docling_core-2.24.1.dist-info → docling_core-2.26.0.dist-info}/entry_points.txt +0 -0

docling_core/transforms/chunker/hierarchical_chunker.py CHANGED

@@ -11,24 +11,36 @@ import logging
 import re
 from typing import Any, ClassVar, Final, Iterator, Literal, Optional
 
-from
-from
-
-
+from pydantic import ConfigDict, Field, StringConstraints, field_validator
+from typing_extensions import Annotated, override
+
+from docling_core.experimental.serializer.base import (
+    BaseDocSerializer,
+    BaseSerializerProvider,
+    BaseTableSerializer,
+    SerializationResult,
+)
+from docling_core.experimental.serializer.common import create_ser_result
+from docling_core.experimental.serializer.markdown import (
+    MarkdownDocSerializer,
+    MarkdownParams,
+)
 from docling_core.search.package import VERSION_PATTERN
 from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
 from docling_core.types import DoclingDocument as DLDocument
+from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
-    CodeItem,
     DocItem,
+    DoclingDocument,
     DocumentOrigin,
+    InlineGroup,
     LevelNumber,
-
+    OrderedList,
     SectionHeaderItem,
     TableItem,
-
+    TitleItem,
+    UnorderedList,
 )
-from docling_core.types.doc.labels import DocItemLabel
 
 _VERSION: Final = "1.0.0"
 
@@ -64,7 +76,8 @@ class DocMeta(BaseMeta):
         alias=_KEY_HEADINGS,
         min_length=1,
     )
-    captions: Optional[list[str]] = Field(
+    captions: Optional[list[str]] = Field(  # deprecated
+        deprecated=True,
         default=None,
         alias=_KEY_CAPTIONS,
         min_length=1,
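The captions field on DocMeta is now marked deprecated via pydantic's Field(deprecated=True). A minimal sketch of what that flag does, assuming pydantic >= 2.7 (where the deprecated parameter was introduced); the Meta model below is a hypothetical stand-in, not part of docling-core:

```python
import warnings
from typing import Optional

from pydantic import BaseModel, Field


class Meta(BaseModel):  # hypothetical stand-in for DocMeta
    captions: Optional[list[str]] = Field(deprecated=True, default=None)


m = Meta(captions=["Figure 1: overview"])
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _ = m.captions  # reading a deprecated field emits a DeprecationWarning
print(caught[0].category.__name__)  # DeprecationWarning
```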
@@ -110,6 +123,76 @@ class DocChunk(BaseChunk):
     meta: DocMeta
 
 
+class TripletTableSerializer(BaseTableSerializer):
+    """Triplet-based table item serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TableItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        parts: list[SerializationResult] = []
+
+        cap_res = doc_serializer.serialize_captions(
+            item=item,
+            **kwargs,
+        )
+        if cap_res.text:
+            parts.append(cap_res)
+
+        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
+            table_df = item.export_to_dataframe()
+            if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
+
+                # copy header as first row and shift all rows by one
+                table_df.loc[-1] = table_df.columns  # type: ignore[call-overload]
+                table_df.index = table_df.index + 1
+                table_df = table_df.sort_index()
+
+                rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()]
+                cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()]
+
+                nrows = table_df.shape[0]
+                ncols = table_df.shape[1]
+                table_text_parts = [
+                    f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
+                    for i in range(1, nrows)
+                    for j in range(1, ncols)
+                ]
+                table_text = ". ".join(table_text_parts)
+                parts.append(create_ser_result(text=table_text, span_source=item))
+
+        text_res = "\n\n".join([r.text for r in parts])
+
+        return create_ser_result(text=text_res, span_source=parts)
+
+
+class ChunkingDocSerializer(MarkdownDocSerializer):
+    """Doc serializer used for chunking purposes."""
+
+    table_serializer: BaseTableSerializer = TripletTableSerializer()
+    params: MarkdownParams = MarkdownParams(
+        image_mode=ImageRefMode.PLACEHOLDER,
+        image_placeholder="",
+        escape_underscores=False,
+        escape_html=False,
+    )
+
+
+class ChunkingSerializerProvider(BaseSerializerProvider):
+    """Serializer provider used for chunking purposes."""
+
+    @override
+    def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
+        """Get the associated serializer."""
+        return ChunkingDocSerializer(doc=doc)
+
+
 class HierarchicalChunker(BaseChunker):
     r"""Chunker implementation leveraging the document layout.
 
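The TripletTableSerializer added above ports the old _triplet_serialize logic (removed further down) into the serializer framework: a table is flattened into "row header, column header = value" statements, one per body cell. A standalone sketch of the same transformation on a plain pandas DataFrame, with made-up sample data:

```python
import pandas as pd

# First column holds the row headers, as the serializer assumes.
df = pd.DataFrame(
    {"Model": ["A", "B"], "Params": ["7B", "13B"], "Score": [0.71, 0.78]}
)

# Copy the header into the data as row 0 and shift the other rows down by one.
df.loc[-1] = df.columns
df.index = df.index + 1
df = df.sort_index()

rows = [str(v).strip() for v in df.iloc[:, 0].to_list()]  # row headers
cols = [str(v).strip() for v in df.iloc[0, :].to_list()]  # column headers
triplets = [
    f"{rows[i]}, {cols[j]} = {str(df.iloc[i, j]).strip()}"
    for i in range(1, df.shape[0])  # skip the header row
    for j in range(1, df.shape[1])  # skip the row-header column
]
print(". ".join(triplets))
# A, Params = 7B. A, Score = 0.71. B, Params = 13B. B, Score = 0.78
```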
@@ -119,31 +202,18 @@ class HierarchicalChunker(BaseChunker):
         delim (str): Delimiter to use for merging text. Defaults to "\n".
     """
 
-
-
-    @classmethod
-    def _triplet_serialize(cls, table_df: DataFrame) -> str:
-
-        # copy header as first row and shift all rows by one
-        table_df.loc[-1] = table_df.columns  # type: ignore[call-overload]
-        table_df.index = table_df.index + 1
-        table_df = table_df.sort_index()
-
-        rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()]
-        cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()]
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 
-
-        ncols = table_df.shape[1]
-        texts = [
-            f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
-            for i in range(1, nrows)
-            for j in range(1, ncols)
-        ]
-        output_text = ". ".join(texts)
+    serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
 
-
+    # deprecated:
+    merge_list_items: Annotated[bool, Field(deprecated=True)] = True
 
-    def chunk(
+    def chunk(
+        self,
+        dl_doc: DLDocument,
+        **kwargs: Any,
+    ) -> Iterator[BaseChunk]:
         r"""Chunk the provided document.
 
         Args:
@@ -152,90 +222,41 @@ class HierarchicalChunker(BaseChunker):
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
+        my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
         heading_by_level: dict[LevelNumber, str] = {}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                ),
-            )
-            list_items = []  # reset
-
-            if isinstance(item, SectionHeaderItem) or (
-                isinstance(item, TextItem)
-                and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]
-            ):
-                level = (
-                    item.level
-                    if isinstance(item, SectionHeaderItem)
-                    else (0 if item.label == DocItemLabel.TITLE else 1)
-                )
-                heading_by_level[level] = item.text
-
-                # remove headings of higher level as they just went out of scope
-                keys_to_del = [k for k in heading_by_level if k > level]
-                for k in keys_to_del:
-                    heading_by_level.pop(k, None)
-                continue
-
-            if (
-                isinstance(item, TextItem)
-                or ((not self.merge_list_items) and isinstance(item, ListItem))
-                or isinstance(item, CodeItem)
-            ):
-                text = item.text
-            elif isinstance(item, TableItem):
-                table_df = item.export_to_dataframe()
-                if table_df.shape[0] < 1 or table_df.shape[1] < 2:
-                    # at least two cols needed, as first column contains row headers
-                    continue
-                text = self._triplet_serialize(table_df=table_df)
-                captions = [
-                    c.text for c in [r.resolve(dl_doc) for r in item.captions]
-                ] or None
-            else:
-                continue
+        visited: set[str] = set()
+        ser_res = create_ser_result()
+        excluded_refs = my_doc_ser.get_excluded_refs(**kwargs)
+        for item, level in dl_doc.iterate_items(with_groups=True):
+            if item.self_ref in excluded_refs:
+                continue
+            if isinstance(item, (TitleItem, SectionHeaderItem)):
+                level = item.level if isinstance(item, SectionHeaderItem) else 0
+                heading_by_level[level] = item.text
+
+                # remove headings of higher level as they just went out of scope
+                keys_to_del = [k for k in heading_by_level if k > level]
+                for k in keys_to_del:
+                    heading_by_level.pop(k, None)
+                continue
+            elif (
+                isinstance(item, (OrderedList, UnorderedList, InlineGroup, DocItem))
+                and item.self_ref not in visited
+            ):
+                ser_res = my_doc_ser.serialize(item=item, visited=visited)
+            else:
+                continue
+
+            if not ser_res.text:
+                continue
+            if doc_items := [u.item for u in ser_res.spans]:
                 c = DocChunk(
-                    text=text,
+                    text=ser_res.text,
                     meta=DocMeta(
-                        doc_items=
+                        doc_items=doc_items,
                         headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                         or None,
-                        captions=captions,
                         origin=dl_doc.origin,
                     ),
                 )
                 yield c
-
-        if self.merge_list_items and list_items:  # need to yield
-            yield DocChunk(
-                text=self.delim.join([i.text for i in list_items]),
-                meta=DocMeta(
-                    doc_items=list_items,
-                    headings=[heading_by_level[k] for k in sorted(heading_by_level)]
-                    or None,
-                    origin=dl_doc.origin,
-                ),
-            )
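With the rewrite above, HierarchicalChunker no longer special-cases tables and list merging itself; chunk() walks the document and delegates text generation to the serializer obtained from serializer_provider. A minimal usage sketch based only on the names visible in this diff (dl_doc is assumed to be a DoclingDocument you already have):

```python
from docling_core.transforms.chunker.hierarchical_chunker import (
    ChunkingSerializerProvider,
    HierarchicalChunker,
)

chunker = HierarchicalChunker(
    # the new default, passed explicitly here for illustration
    serializer_provider=ChunkingSerializerProvider(),
)
for chunk in chunker.chunk(dl_doc=dl_doc):
    print(chunk.meta.headings, chunk.text[:80])
```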
docling_core/transforms/chunker/hybrid_chunker.py CHANGED

@@ -4,13 +4,24 @@
 #
 
 """Hybrid chunker implementation leveraging both doc structure & token awareness."""
-
 import warnings
+from functools import cached_property
 from typing import Any, Iterable, Iterator, Optional, Union
 
-from pydantic import
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    PositiveInt,
+    TypeAdapter,
+    computed_field,
+    model_validator,
+)
 from typing_extensions import Self
 
+from docling_core.transforms.chunker.hierarchical_chunker import (
+    ChunkingSerializerProvider,
+)
+
 try:
     import semchunk
     from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -20,6 +31,10 @@ except ImportError:
         "`pip install 'docling-core[chunking]'`"
     )
 
+from docling_core.experimental.serializer.base import (
+    BaseDocSerializer,
+    BaseSerializerProvider,
+)
 from docling_core.transforms.chunker import (
     BaseChunk,
     BaseChunker,

@@ -28,7 +43,6 @@ from docling_core.transforms.chunker import (
     HierarchicalChunker,
 )
 from docling_core.types import DoclingDocument
-from docling_core.types.doc.document import TextItem
 
 
 class HybridChunker(BaseChunker):

@@ -50,7 +64,7 @@ class HybridChunker(BaseChunker):
     max_tokens: int = None  # type: ignore[assignment]
     merge_peers: bool = True
 
-
+    serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
 
     @model_validator(mode="after")
     def _patch_tokenizer_and_max_tokens(self) -> Self:

@@ -65,6 +79,11 @@ class HybridChunker(BaseChunker):
         )
         return self
 
+    @computed_field  # type: ignore[misc]
+    @cached_property
+    def _inner_chunker(self) -> HierarchicalChunker:
+        return HierarchicalChunker(serializer_provider=self.serializer_provider)
+
     def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
         if text is None:
             return 0

@@ -81,7 +100,7 @@ class HybridChunker(BaseChunker):
     other_len: int
 
     def _count_chunk_tokens(self, doc_chunk: DocChunk):
-        ser_txt = self.
+        ser_txt = self.contextualize(chunk=doc_chunk)
         return len(self._tokenizer.tokenize(text=ser_txt))
 
     def _doc_chunk_length(self, doc_chunk: DocChunk):

@@ -94,7 +113,11 @@ class HybridChunker(BaseChunker):
     )
 
     def _make_chunk_from_doc_items(
-        self,
+        self,
+        doc_chunk: DocChunk,
+        window_start: int,
+        window_end: int,
+        doc_serializer: BaseDocSerializer,
     ):
         doc_items = doc_chunk.meta.doc_items[window_start : window_end + 1]
         meta = DocMeta(

@@ -106,18 +129,21 @@ class HybridChunker(BaseChunker):
         window_text = (
             doc_chunk.text
             if len(doc_chunk.meta.doc_items) == 1
+            # TODO: merging should ideally be done by the serializer:
             else self.delim.join(
                 [
-
+                    res_text
                     for doc_item in doc_items
-                    if
+                    if (res_text := doc_serializer.serialize(item=doc_item).text)
                 ]
             )
         )
         new_chunk = DocChunk(text=window_text, meta=meta)
         return new_chunk
 
-    def _split_by_doc_items(
+    def _split_by_doc_items(
+        self, doc_chunk: DocChunk, doc_serializer: BaseDocSerializer
+    ) -> list[DocChunk]:
         chunks = []
         window_start = 0
         window_end = 0  # an inclusive index

@@ -127,6 +153,7 @@ class HybridChunker(BaseChunker):
                 doc_chunk=doc_chunk,
                 window_start=window_start,
                 window_end=window_end,
+                doc_serializer=doc_serializer,
             )
             if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens:
                 if window_end < num_items - 1:

@@ -153,6 +180,7 @@ class HybridChunker(BaseChunker):
                 doc_chunk=doc_chunk,
                 window_start=window_start,
                 window_end=window_end - 1,
+                doc_serializer=doc_serializer,
             )
             window_start = window_end
             chunks.append(new_chunk)

@@ -199,6 +227,7 @@ class HybridChunker(BaseChunker):
             chks = chunks[window_start : window_end + 1]
             doc_items = [it for chk in chks for it in chk.meta.doc_items]
             candidate = DocChunk(
+                # TODO: merging should ideally be done by the serializer:
                 text=self.delim.join([chk.text for chk in chks]),
                 meta=DocMeta(
                     doc_items=doc_items,

@@ -231,7 +260,11 @@ class HybridChunker(BaseChunker):
 
         return output_chunks
 
-    def chunk(
+    def chunk(
+        self,
+        dl_doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> Iterator[BaseChunk]:
         r"""Chunk the provided document.
 
         Args:

@@ -240,9 +273,18 @@ class HybridChunker(BaseChunker):
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
+        my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
         res: Iterable[DocChunk]
-        res = self._inner_chunker.chunk(
-
+        res = self._inner_chunker.chunk(
+            dl_doc=dl_doc,
+            doc_serializer=my_doc_ser,
+            **kwargs,
+        )  # type: ignore
+        res = [
+            x
+            for c in res
+            for x in self._split_by_doc_items(c, doc_serializer=my_doc_ser)
+        ]
         res = [x for c in res for x in self._split_using_plain_text(c)]
         if self.merge_peers:
             res = self._merge_chunks_with_matching_metadata(res)
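HybridChunker follows the same pattern: the inner HierarchicalChunker is now built from the shared serializer_provider, and the token budget is applied to the contextualized text (see _count_chunk_tokens above). A usage sketch under the same assumptions (dl_doc an existing DoclingDocument; tokenizer and max_tokens fall back to the defaults patched in by the model validator):

```python
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

chunker = HybridChunker(max_tokens=512)
for chunk in chunker.chunk(dl_doc=dl_doc):
    # contextualize() returns the metadata-enriched text, i.e. the same
    # string that _count_chunk_tokens() measures against max_tokens.
    enriched = chunker.contextualize(chunk=chunk)
    print(len(enriched))
```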
docling_core/types/doc/base.py CHANGED

@@ -182,7 +182,10 @@ class BoundingBox(BaseModel):
     ) -> float:
         """intersection_over_self."""
         intersection_area = self.intersection_area_with(other=other)
-
+        if self.area() > 0:
+            return intersection_area / self.area()
+        else:
+            return 0.0
 
     def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
         """to_bottom_left_origin.