docling-core 2.24.1__py3-none-any.whl → 2.26.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as they were released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic.

@@ -11,24 +11,36 @@ import logging
 import re
 from typing import Any, ClassVar, Final, Iterator, Literal, Optional
 
-from pandas import DataFrame
-from pydantic import Field, StringConstraints, field_validator
-from typing_extensions import Annotated
-
+from pydantic import ConfigDict, Field, StringConstraints, field_validator
+from typing_extensions import Annotated, override
+
+from docling_core.experimental.serializer.base import (
+    BaseDocSerializer,
+    BaseSerializerProvider,
+    BaseTableSerializer,
+    SerializationResult,
+)
+from docling_core.experimental.serializer.common import create_ser_result
+from docling_core.experimental.serializer.markdown import (
+    MarkdownDocSerializer,
+    MarkdownParams,
+)
 from docling_core.search.package import VERSION_PATTERN
 from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
 from docling_core.types import DoclingDocument as DLDocument
+from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
-    CodeItem,
     DocItem,
+    DoclingDocument,
     DocumentOrigin,
+    InlineGroup,
     LevelNumber,
-    ListItem,
+    OrderedList,
     SectionHeaderItem,
     TableItem,
-    TextItem,
+    TitleItem,
+    UnorderedList,
 )
-from docling_core.types.doc.labels import DocItemLabel
 
 _VERSION: Final = "1.0.0"
 
@@ -64,7 +76,8 @@ class DocMeta(BaseMeta):
         alias=_KEY_HEADINGS,
         min_length=1,
     )
-    captions: Optional[list[str]] = Field(
+    captions: Optional[list[str]] = Field(  # deprecated
+        deprecated=True,
         default=None,
         alias=_KEY_CAPTIONS,
         min_length=1,
@@ -110,6 +123,76 @@ class DocChunk(BaseChunk):
     meta: DocMeta
 
 
+class TripletTableSerializer(BaseTableSerializer):
+    """Triplet-based table item serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TableItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        parts: list[SerializationResult] = []
+
+        cap_res = doc_serializer.serialize_captions(
+            item=item,
+            **kwargs,
+        )
+        if cap_res.text:
+            parts.append(cap_res)
+
+        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
+            table_df = item.export_to_dataframe()
+            if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
+
+                # copy header as first row and shift all rows by one
+                table_df.loc[-1] = table_df.columns  # type: ignore[call-overload]
+                table_df.index = table_df.index + 1
+                table_df = table_df.sort_index()
+
+                rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()]
+                cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()]
+
+                nrows = table_df.shape[0]
+                ncols = table_df.shape[1]
+                table_text_parts = [
+                    f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
+                    for i in range(1, nrows)
+                    for j in range(1, ncols)
+                ]
+                table_text = ". ".join(table_text_parts)
+                parts.append(create_ser_result(text=table_text, span_source=item))
+
+        text_res = "\n\n".join([r.text for r in parts])
+
+        return create_ser_result(text=text_res, span_source=parts)
+
+
+class ChunkingDocSerializer(MarkdownDocSerializer):
+    """Doc serializer used for chunking purposes."""
+
+    table_serializer: BaseTableSerializer = TripletTableSerializer()
+    params: MarkdownParams = MarkdownParams(
+        image_mode=ImageRefMode.PLACEHOLDER,
+        image_placeholder="",
+        escape_underscores=False,
+        escape_html=False,
+    )
+
+
+class ChunkingSerializerProvider(BaseSerializerProvider):
+    """Serializer provider used for chunking purposes."""
+
+    @override
+    def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
+        """Get the associated serializer."""
+        return ChunkingDocSerializer(doc=doc)
+
+
 class HierarchicalChunker(BaseChunker):
     r"""Chunker implementation leveraging the document layout.
 
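The new TripletTableSerializer turns each body cell into a "row header, column header = value" statement and joins them with periods, so tables become sentence-like text. A minimal sketch of the same transformation on a toy pandas DataFrame (the frame and its values are illustrative, not taken from the package):

    import pandas as pd

    # Toy table: first column holds row headers, remaining columns hold values.
    df = pd.DataFrame(
        {"Metric": ["Precision", "Recall"], "Model A": [0.9, 0.8], "Model B": [0.7, 0.6]}
    )

    # Same steps as TripletTableSerializer.serialize: copy the header in as row 0
    # and shift the existing rows down by one.
    df.loc[-1] = df.columns
    df.index = df.index + 1
    df = df.sort_index()

    rows = [str(v).strip() for v in df.iloc[:, 0].to_list()]
    cols = [str(v).strip() for v in df.iloc[0, :].to_list()]
    triplets = [
        f"{rows[i]}, {cols[j]} = {str(df.iloc[i, j]).strip()}"
        for i in range(1, df.shape[0])
        for j in range(1, df.shape[1])
    ]
    print(". ".join(triplets))
    # Precision, Model A = 0.9. Precision, Model B = 0.7.
    # Recall, Model A = 0.8. Recall, Model B = 0.6

ChunkingDocSerializer then plugs this table serializer into an otherwise Markdown-based serialization, with image placeholders emptied and underscore/HTML escaping disabled.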
@@ -119,31 +202,18 @@ class HierarchicalChunker(BaseChunker):
         delim (str): Delimiter to use for merging text. Defaults to "\n".
     """
 
-    merge_list_items: bool = True
-
-    @classmethod
-    def _triplet_serialize(cls, table_df: DataFrame) -> str:
-
-        # copy header as first row and shift all rows by one
-        table_df.loc[-1] = table_df.columns  # type: ignore[call-overload]
-        table_df.index = table_df.index + 1
-        table_df = table_df.sort_index()
-
-        rows = [str(item).strip() for item in table_df.iloc[:, 0].to_list()]
-        cols = [str(item).strip() for item in table_df.iloc[0, :].to_list()]
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 
-        nrows = table_df.shape[0]
-        ncols = table_df.shape[1]
-        texts = [
-            f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
-            for i in range(1, nrows)
-            for j in range(1, ncols)
-        ]
-        output_text = ". ".join(texts)
+    serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
 
-        return output_text
+    # deprecated:
+    merge_list_items: Annotated[bool, Field(deprecated=True)] = True
 
-    def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+    def chunk(
+        self,
+        dl_doc: DLDocument,
+        **kwargs: Any,
+    ) -> Iterator[BaseChunk]:
         r"""Chunk the provided document.
 
         Args:
@@ -152,90 +222,41 @@
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
+        my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
         heading_by_level: dict[LevelNumber, str] = {}
-        list_items: list[TextItem] = []
-        for item, level in dl_doc.iterate_items():
-            captions = None
-            if isinstance(item, DocItem):
-
-                # first handle any merging needed
-                if self.merge_list_items:
-                    if isinstance(
-                        item, ListItem
-                    ) or (  # TODO remove when all captured as ListItem:
-                        isinstance(item, TextItem)
-                        and item.label == DocItemLabel.LIST_ITEM
-                    ):
-                        list_items.append(item)
-                        continue
-                    elif list_items:  # need to yield
-                        yield DocChunk(
-                            text=self.delim.join([i.text for i in list_items]),
-                            meta=DocMeta(
-                                doc_items=list_items,
-                                headings=[
-                                    heading_by_level[k]
-                                    for k in sorted(heading_by_level)
-                                ]
-                                or None,
-                                origin=dl_doc.origin,
-                            ),
-                        )
-                        list_items = []  # reset
-
-                if isinstance(item, SectionHeaderItem) or (
-                    isinstance(item, TextItem)
-                    and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]
-                ):
-                    level = (
-                        item.level
-                        if isinstance(item, SectionHeaderItem)
-                        else (0 if item.label == DocItemLabel.TITLE else 1)
-                    )
-                    heading_by_level[level] = item.text
-
-                    # remove headings of higher level as they just went out of scope
-                    keys_to_del = [k for k in heading_by_level if k > level]
-                    for k in keys_to_del:
-                        heading_by_level.pop(k, None)
-                    continue
-
-                if (
-                    isinstance(item, TextItem)
-                    or ((not self.merge_list_items) and isinstance(item, ListItem))
-                    or isinstance(item, CodeItem)
-                ):
-                    text = item.text
-                elif isinstance(item, TableItem):
-                    table_df = item.export_to_dataframe()
-                    if table_df.shape[0] < 1 or table_df.shape[1] < 2:
-                        # at least two cols needed, as first column contains row headers
-                        continue
-                    text = self._triplet_serialize(table_df=table_df)
-                    captions = [
-                        c.text for c in [r.resolve(dl_doc) for r in item.captions]
-                    ] or None
-                else:
-                    continue
+        visited: set[str] = set()
+        ser_res = create_ser_result()
+        excluded_refs = my_doc_ser.get_excluded_refs(**kwargs)
+        for item, level in dl_doc.iterate_items(with_groups=True):
+            if item.self_ref in excluded_refs:
+                continue
+            if isinstance(item, (TitleItem, SectionHeaderItem)):
+                level = item.level if isinstance(item, SectionHeaderItem) else 0
+                heading_by_level[level] = item.text
+
+                # remove headings of higher level as they just went out of scope
+                keys_to_del = [k for k in heading_by_level if k > level]
+                for k in keys_to_del:
+                    heading_by_level.pop(k, None)
+                continue
+            elif (
+                isinstance(item, (OrderedList, UnorderedList, InlineGroup, DocItem))
+                and item.self_ref not in visited
+            ):
+                ser_res = my_doc_ser.serialize(item=item, visited=visited)
+            else:
+                continue
+
+            if not ser_res.text:
+                continue
+            if doc_items := [u.item for u in ser_res.spans]:
                 c = DocChunk(
-                    text=text,
+                    text=ser_res.text,
                     meta=DocMeta(
-                        doc_items=[item],
+                        doc_items=doc_items,
                         headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                         or None,
-                        captions=captions,
                         origin=dl_doc.origin,
                     ),
                 )
                 yield c
-
-        if self.merge_list_items and list_items:  # need to yield
-            yield DocChunk(
-                text=self.delim.join([i.text for i in list_items]),
-                meta=DocMeta(
-                    doc_items=list_items,
-                    headings=[heading_by_level[k] for k in sorted(heading_by_level)]
-                    or None,
-                    origin=dl_doc.origin,
-                ),
-            )
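With this rewrite the chunker no longer serializes items itself; it asks the serializer obtained from serializer_provider for each item's text and builds the chunk metadata from the returned spans, so serialization can be customized without subclassing the chunker. A minimal usage sketch, assuming an existing DoclingDocument JSON file ("doc.json" is an illustrative name); the custom provider shown simply reuses the default ChunkingDocSerializer:

    from docling_core.transforms.chunker.hierarchical_chunker import (
        ChunkingDocSerializer,
        ChunkingSerializerProvider,
        HierarchicalChunker,
    )
    from docling_core.types import DoclingDocument

    class MySerializerProvider(ChunkingSerializerProvider):
        def get_serializer(self, doc: DoclingDocument):
            # A real provider could return a serializer configured with e.g. a
            # different table serializer; here we just reuse the chunking default.
            return ChunkingDocSerializer(doc=doc)

    doc = DoclingDocument.load_from_json("doc.json")  # any DoclingDocument instance works
    chunker = HierarchicalChunker(serializer_provider=MySerializerProvider())
    for chunk in chunker.chunk(dl_doc=doc):
        print(chunk.text)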
@@ -4,13 +4,24 @@
 #
 
 """Hybrid chunker implementation leveraging both doc structure & token awareness."""
-
 import warnings
+from functools import cached_property
 from typing import Any, Iterable, Iterator, Optional, Union
 
-from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    PositiveInt,
+    TypeAdapter,
+    computed_field,
+    model_validator,
+)
 from typing_extensions import Self
 
+from docling_core.transforms.chunker.hierarchical_chunker import (
+    ChunkingSerializerProvider,
+)
+
 try:
     import semchunk
     from transformers import AutoTokenizer, PreTrainedTokenizerBase
@@ -20,6 +31,10 @@ except ImportError:
         "`pip install 'docling-core[chunking]'`"
     )
 
+from docling_core.experimental.serializer.base import (
+    BaseDocSerializer,
+    BaseSerializerProvider,
+)
 from docling_core.transforms.chunker import (
     BaseChunk,
     BaseChunker,
@@ -28,7 +43,6 @@ from docling_core.transforms.chunker import (
     HierarchicalChunker,
 )
 from docling_core.types import DoclingDocument
-from docling_core.types.doc.document import TextItem
 
 
 class HybridChunker(BaseChunker):
@@ -50,7 +64,7 @@ class HybridChunker(BaseChunker):
     max_tokens: int = None  # type: ignore[assignment]
     merge_peers: bool = True
 
-    _inner_chunker: HierarchicalChunker = HierarchicalChunker()
+    serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
 
     @model_validator(mode="after")
     def _patch_tokenizer_and_max_tokens(self) -> Self:
@@ -65,6 +79,11 @@ class HybridChunker(BaseChunker):
             )
         return self
 
+    @computed_field  # type: ignore[misc]
+    @cached_property
+    def _inner_chunker(self) -> HierarchicalChunker:
+        return HierarchicalChunker(serializer_provider=self.serializer_provider)
+
     def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
         if text is None:
             return 0
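Because _inner_chunker is now a cached computed property built from serializer_provider, configuring a provider on HybridChunker also configures the HierarchicalChunker it wraps. A minimal sketch (the provider shown is the default and is passed explicitly only for clarity):

    from docling_core.transforms.chunker.hierarchical_chunker import (
        ChunkingSerializerProvider,
    )
    from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

    chunker = HybridChunker(serializer_provider=ChunkingSerializerProvider())
    # The inner chunker is created lazily as
    # HierarchicalChunker(serializer_provider=chunker.serializer_provider).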
@@ -81,7 +100,7 @@
         other_len: int
 
     def _count_chunk_tokens(self, doc_chunk: DocChunk):
-        ser_txt = self.serialize(chunk=doc_chunk)
+        ser_txt = self.contextualize(chunk=doc_chunk)
        return len(self._tokenizer.tokenize(text=ser_txt))
 
    def _doc_chunk_length(self, doc_chunk: DocChunk):
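Token counting now goes through contextualize rather than serialize, i.e. the chunk text enriched with its metadata (such as headings) is what gets measured against max_tokens. A minimal sketch, assuming chunker and doc are set up as in the earlier examples:

    for chunk in chunker.chunk(dl_doc=doc):
        enriched = chunker.contextualize(chunk=chunk)
        # `enriched` is the same text the chunker tokenizes when enforcing max_tokens.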
@@ -94,7 +113,11 @@
         )
 
     def _make_chunk_from_doc_items(
-        self, doc_chunk: DocChunk, window_start: int, window_end: int
+        self,
+        doc_chunk: DocChunk,
+        window_start: int,
+        window_end: int,
+        doc_serializer: BaseDocSerializer,
     ):
         doc_items = doc_chunk.meta.doc_items[window_start : window_end + 1]
         meta = DocMeta(
@@ -106,18 +129,21 @@
         window_text = (
             doc_chunk.text
             if len(doc_chunk.meta.doc_items) == 1
+            # TODO: merging should ideally be done by the serializer:
             else self.delim.join(
                 [
-                    doc_item.text
+                    res_text
                     for doc_item in doc_items
-                    if isinstance(doc_item, TextItem)
+                    if (res_text := doc_serializer.serialize(item=doc_item).text)
                 ]
             )
         )
         new_chunk = DocChunk(text=window_text, meta=meta)
         return new_chunk
 
-    def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:
+    def _split_by_doc_items(
+        self, doc_chunk: DocChunk, doc_serializer: BaseDocSerializer
+    ) -> list[DocChunk]:
         chunks = []
         window_start = 0
         window_end = 0  # an inclusive index
@@ -127,6 +153,7 @@
                 doc_chunk=doc_chunk,
                 window_start=window_start,
                 window_end=window_end,
+                doc_serializer=doc_serializer,
             )
             if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens:
                 if window_end < num_items - 1:
@@ -153,6 +180,7 @@
                     doc_chunk=doc_chunk,
                     window_start=window_start,
                     window_end=window_end - 1,
+                    doc_serializer=doc_serializer,
                 )
                 window_start = window_end
             chunks.append(new_chunk)
@@ -199,6 +227,7 @@
                 chks = chunks[window_start : window_end + 1]
                 doc_items = [it for chk in chks for it in chk.meta.doc_items]
                 candidate = DocChunk(
+                    # TODO: merging should ideally be done by the serializer:
                     text=self.delim.join([chk.text for chk in chks]),
                     meta=DocMeta(
                         doc_items=doc_items,
@@ -231,7 +260,11 @@
 
         return output_chunks
 
-    def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+    def chunk(
+        self,
+        dl_doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> Iterator[BaseChunk]:
         r"""Chunk the provided document.
 
         Args:
@@ -240,9 +273,18 @@
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
+        my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
         res: Iterable[DocChunk]
-        res = self._inner_chunker.chunk(dl_doc=dl_doc, **kwargs)  # type: ignore
-        res = [x for c in res for x in self._split_by_doc_items(c)]
+        res = self._inner_chunker.chunk(
+            dl_doc=dl_doc,
+            doc_serializer=my_doc_ser,
+            **kwargs,
+        )  # type: ignore
+        res = [
+            x
+            for c in res
+            for x in self._split_by_doc_items(c, doc_serializer=my_doc_ser)
+        ]
         res = [x for c in res for x in self._split_using_plain_text(c)]
         if self.merge_peers:
             res = self._merge_chunks_with_matching_metadata(res)
@@ -182,7 +182,10 @@ class BoundingBox(BaseModel):
     ) -> float:
         """intersection_over_self."""
         intersection_area = self.intersection_area_with(other=other)
-        return intersection_area / self.area()
+        if self.area() > 0:
+            return intersection_area / self.area()
+        else:
+            return 0.0
 
     def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
         """to_bottom_left_origin.