docling-core 1.7.2-py3-none-any.whl → 2.0.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.


Files changed (36)
  1. docling_core/transforms/chunker/__init__.py +2 -8
  2. docling_core/transforms/chunker/base.py +27 -40
  3. docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
  4. docling_core/types/__init__.py +3 -18
  5. docling_core/types/doc/__init__.py +25 -0
  6. docling_core/types/doc/base.py +136 -451
  7. docling_core/types/doc/document.py +1289 -559
  8. docling_core/types/{experimental → doc}/labels.py +4 -1
  9. docling_core/types/legacy_doc/__init__.py +6 -0
  10. docling_core/types/legacy_doc/base.py +485 -0
  11. docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
  12. docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
  13. docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
  14. docling_core/types/legacy_doc/document.py +715 -0
  15. docling_core/types/rec/subject.py +1 -1
  16. docling_core/utils/generate_docs.py +82 -0
  17. docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
  18. docling_core/utils/validators.py +3 -3
  19. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/METADATA +17 -17
  20. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/RECORD +24 -31
  21. docling_core-2.0.1.dist-info/entry_points.txt +5 -0
  22. docling_core/transforms/id_generator/__init__.py +0 -12
  23. docling_core/transforms/id_generator/base.py +0 -30
  24. docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  25. docling_core/transforms/id_generator/uuid_generator.py +0 -34
  26. docling_core/transforms/metadata_extractor/__init__.py +0 -13
  27. docling_core/transforms/metadata_extractor/base.py +0 -59
  28. docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  29. docling_core/types/experimental/__init__.py +0 -30
  30. docling_core/types/experimental/base.py +0 -167
  31. docling_core/types/experimental/document.py +0 -1192
  32. docling_core/utils/ds_generate_docs.py +0 -144
  33. docling_core-1.7.2.dist-info/entry_points.txt +0 -5
  34. /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
  35. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/LICENSE +0 -0
  36. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/WHEEL +0 -0
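
Taken together, the moves above replace the legacy Document model with the new DoclingDocument, promote the former experimental types to docling_core.types.doc, and park the v1 models under docling_core.types.legacy_doc. A minimal import-migration sketch in Python (the 1.7.2 side reflects imports removed in the hunks below, the 2.0.1 side the ones added):

    # docling-core 1.7.2
    from docling_core.types import Document, BoundingBox

    # docling-core 2.0.1
    from docling_core.types import DoclingDocument
    from docling_core.types.doc import BoundingBox
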
docling_core/transforms/chunker/__init__.py
@@ -5,11 +5,5 @@

 """Define the chunker types."""

-from docling_core.transforms.chunker.base import (  # noqa
-    BaseChunker,
-    Chunk,
-    ChunkWithMetadata,
-)
-from docling_core.transforms.chunker.hierarchical_chunker import (  # noqa
-    HierarchicalChunker,
-)
+from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
+from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
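
For chunker consumers, the public names change accordingly (a sketch based on the hunk above; Chunk and ChunkWithMetadata no longer exist in 2.0.1):

    # docling-core 1.7.2
    from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata

    # docling-core 2.0.1: chunks now carry a structured meta object
    # instead of path/heading/page/bbox fields
    from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
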
docling_core/transforms/chunker/base.py
@@ -4,71 +4,58 @@
 #

 """Define base classes for chunking."""
-import re
 from abc import ABC, abstractmethod
-from typing import Final, Iterator, Optional
+from typing import Any, ClassVar, Iterator

-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel

-from docling_core.types import BoundingBox, Document
-from docling_core.types.base import _JSON_POINTER_REGEX
+from docling_core.types.doc import DoclingDocument as DLDocument

-# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
-_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")

+class BaseMeta(BaseModel):
+    """Metadata base class."""

-def _create_path(pos: int, path_prefix: str = "main-text") -> str:
-    return f"#/{path_prefix}/{pos}"
+    excluded_embed: ClassVar[list[str]] = []
+    excluded_llm: ClassVar[list[str]] = []

+    def export_json_dict(self) -> dict[str, Any]:
+        """Helper method for exporting non-None keys to JSON mode.

-class Chunk(BaseModel):
-    """Data model for Chunk."""
+        Returns:
+            dict[str, Any]: The exported dictionary.
+        """
+        return self.model_dump(mode="json", by_alias=True, exclude_none=True)

-    path: str = Field(pattern=_JSON_POINTER_REGEX)
-    text: str
-    heading: Optional[str] = None

-    @field_validator("path", mode="before")
-    @classmethod
-    def _json_pointer_from_json_path(cls, path: str):
-        if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
-            groups = match.groups()
-            if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
-                return _create_path(
-                    pos=int(groups[1]),
-                    path_prefix=groups[0],
-                )
-        return path
+class BaseChunk(BaseModel):
+    """Chunk base class."""

+    text: str
+    meta: BaseMeta

-class ChunkWithMetadata(Chunk):
-    """Data model for Chunk including metadata."""
+    def export_json_dict(self) -> dict[str, Any]:
+        """Helper method for exporting non-None keys to JSON mode.

-    page: Optional[int] = None
-    bbox: Optional[BoundingBox] = None
+        Returns:
+            dict[str, Any]: The exported dictionary.
+        """
+        return self.model_dump(mode="json", by_alias=True, exclude_none=True)


 class BaseChunker(BaseModel, ABC):
-    """Base class for Chunker."""
+    """Chunker base class."""

     @abstractmethod
-    def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
+    def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
         """Chunk the provided document.

         Args:
-            dl_doc (Document): document to chunk
+            dl_doc (DLDocument): document to chunk

         Raises:
             NotImplementedError: in this abstract implementation

         Yields:
-            Iterator[Chunk]: iterator over extracted chunks
+            Iterator[BaseChunk]: iterator over extracted chunks
         """
         raise NotImplementedError()
-
-    @classmethod
-    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
-        return _create_path(
-            pos=pos,
-            path_prefix=path_prefix,
-        )
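
To show the new contract in use, a minimal custom chunker against the 2.0.1 base classes (a hypothetical sketch: the ParagraphChunker name and its one-chunk-per-text-item logic are illustrative only; iterate_items is the same traversal the built-in chunker uses below):

    from typing import Any, Iterator

    from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
    from docling_core.types.doc import DoclingDocument as DLDocument
    from docling_core.types.doc.document import TextItem


    class ParagraphChunker(BaseChunker):
        """Hypothetical chunker: one chunk per non-empty text item."""

        def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
            for item, _level in dl_doc.iterate_items():
                if isinstance(item, TextItem) and item.text:
                    # BaseMeta has no required fields, so an empty instance suffices
                    yield BaseChunk(text=item.text, meta=BaseMeta())
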
docling_core/transforms/chunker/hierarchical_chunker.py
@@ -8,347 +8,179 @@
 from __future__ import annotations

 import logging
-from enum import Enum
-from typing import Any, Iterator, Optional, Union
-
-import pandas as pd
-from pydantic import BaseModel, Field, PositiveInt
-
-from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
-from docling_core.types import BaseText
-from docling_core.types import Document as DLDocument
-from docling_core.types import Ref, Table
+from typing import Any, ClassVar, Iterator, Optional
+
+from pandas import DataFrame
+from pydantic import Field
+
+from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
+from docling_core.types.doc import DoclingDocument as DLDocument
+from docling_core.types.doc.document import (
+    DocItem,
+    LevelNumber,
+    ListItem,
+    SectionHeaderItem,
+    TableItem,
+    TextItem,
+)
+from docling_core.types.doc.labels import DocItemLabel
+
+_KEY_DOC_ITEMS = "doc_items"
+_KEY_HEADINGS = "headings"
+_KEY_CAPTIONS = "captions"

 _logger = logging.getLogger(__name__)


-class HierarchicalChunker(BaseChunker):
-    """Chunker implementation leveraging the document layout."""
+class DocMeta(BaseMeta):
+    """Data model for Hierarchical Chunker metadata."""

-    heading_as_metadata: bool = Field(
-        default=False,
-        description="Whether heading should be in metadata (instead of text)",
+    doc_items: list[DocItem] = Field(
+        alias=_KEY_DOC_ITEMS,
+        min_length=1,
     )
-    include_metadata: bool = Field(
-        default=True,
-        description="Whether to include extras in the metadata",
+    headings: Optional[list[str]] = Field(
+        default=None,
+        alias=_KEY_HEADINGS,
+        min_length=1,
     )
-    min_chunk_len: PositiveInt = Field(
-        default=64, description="Minimum chunk text length to consider (in chars)"
+    captions: Optional[list[str]] = Field(
+        default=None,
+        alias=_KEY_CAPTIONS,
+        min_length=1,
     )

-    class _NodeType(str, Enum):
-        PARAGRAPH = "paragraph"
-        SUBTITLE_LEVEL_1 = "subtitle-level-1"
-        TABLE = "table"
-        CAPTION = "caption"
-
-    class _NodeName(str, Enum):
-        TITLE = "title"
-        REFERENCE = "reference"
-        LIST_ITEM = "list-item"
-        SUBTITLE_LEVEL_1 = "subtitle-level-1"
-
-    _allowed_types: list[str] = [
-        _NodeType.PARAGRAPH,
-        _NodeType.SUBTITLE_LEVEL_1,
-        _NodeType.TABLE,
-        _NodeType.CAPTION,
-    ]
-    _disallowed_names_by_type: dict[str, list[str]] = {
-        _NodeType.PARAGRAPH: [
-            _NodeName.REFERENCE,
-        ],
-    }
-
-    @classmethod
-    def _norm(cls, text: Optional[str]) -> Optional[str]:
-        return text.lower() if text is not None else None
-
-    @classmethod
-    def _convert_table_to_dataframe(cls, table: Table) -> Optional[pd.DataFrame]:
-        if table.data:
-            table_content = [[cell.text for cell in row] for row in table.data]
-            return pd.DataFrame(table_content)
-        else:
-            return None
-
-    @classmethod
-    def _triplet_serialize(cls, table) -> Optional[str]:
-        output_text: Optional[str] = None
-        table_df = cls._convert_table_to_dataframe(table)
-        if table_df is not None and table_df.shape[0] > 1 and table_df.shape[1] > 1:
-            rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
-            cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
-            nrows = table_df.shape[0]
-            ncols = table_df.shape[1]
-            texts = [
-                f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
-                for i in range(1, nrows)
-                for j in range(1, ncols)
-            ]
-            output_text = ". ".join(texts)
-
-        return output_text
-
-    class _MainTextItemNode(BaseModel):
-        parent: Optional[int] = None
-        children: list[int] = []
-
-    class _TitleInfo(BaseModel):
-        text: str
-        path_in_doc: str
-
-    class _GlobalContext(BaseModel):
-        title: Optional[_HC._TitleInfo] = None
-
-    class _DocContext(BaseModel):
-        dmap: dict[int, _HC._MainTextItemNode]  # main text element context
-        glob: _HC._GlobalContext  # global context
-
-        @classmethod
-        def from_doc(cls, doc: DLDocument) -> _HC._DocContext:
-            dmap: dict[int, _HC._MainTextItemNode] = {}
-            glob: _HC._GlobalContext = _HC._GlobalContext()
-            if doc.description.title:
-                glob.title = _HC._TitleInfo(
-                    text=doc.description.title,
-                    path_in_doc="description.title",
-                )
-
-            parent = None
-            if doc.main_text:
-                idx = 0
-                while idx < len(doc.main_text):
-                    item = doc.main_text[idx]
-                    if (
-                        not glob.title
-                        and isinstance(item, BaseText)
-                        and _HC._norm(item.name) == _HC._NodeName.TITLE
-                    ):
-                        glob.title = _HC._TitleInfo(
-                            text=item.text,
-                            path_in_doc=_HC._create_path(idx),
-                        )
-
-                    # start of a subtitle-level-1 parent
-                    if (
-                        isinstance(item, BaseText)
-                        and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
-                    ):
-                        dmap[idx] = _HC._MainTextItemNode(parent=None)
-                        parent = idx
-                        if not glob.title:
-                            glob.title = _HC._TitleInfo(
-                                text=item.text,
-                                path_in_doc=_HC._create_path(idx),
-                            )
-
-                    # start of a list parent
-                    elif (
-                        isinstance(item, BaseText)
-                        and _HC._norm(item.name) != _HC._NodeName.LIST_ITEM
-                        and idx + 1 < len(doc.main_text)
-                        and _HC._norm(doc.main_text[idx + 1].name)
-                        == _HC._NodeName.LIST_ITEM
-                    ):
-                        if parent is not None:
-                            dmap[parent].children.append(idx)
-                        dmap[idx] = _HC._MainTextItemNode(parent=parent)
+    excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
+    excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]

-                        # have all children register locally
-                        li = idx + 1
-                        while (
-                            li < len(doc.main_text)
-                            and _HC._norm(doc.main_text[li].name)
-                            == _HC._NodeName.LIST_ITEM
-                        ):
-                            dmap[idx].children.append(li)
-                            dmap[li] = _HC._MainTextItemNode(parent=idx)
-                            li += 1
-                        idx = li
-                        continue

-                    # normal case
-                    else:
-                        if parent is not None:
-                            dmap[parent].children.append(idx)
-                        dmap[idx] = _HC._MainTextItemNode(parent=parent)
+class DocChunk(BaseChunk):
+    """Data model for Hierarchical Chunker chunks."""

-                    idx += 1
-            else:
-                pass
-            return cls(
-                dmap=dmap,
-                glob=glob,
-            )
+    meta: DocMeta

-    class _TextEntry(BaseModel):
-        text: str
-        path: str

-    def _build_chunk_impl(
-        self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
-    ) -> tuple[list[_TextEntry], Optional[str]]:
-        if doc.main_text:
-            item = doc.main_text[idx]
-            item_type = _HC._norm(item.obj_type)
-            item_name = _HC._norm(item.name)
-            if (
-                item_type not in self._allowed_types
-                or item_name in self._disallowed_names_by_type.get(item_type, [])
-            ):
-                return [], None
+class HierarchicalChunker(BaseChunker):
+    r"""Chunker implementation leveraging the document layout.

-            c2p = doc_map.dmap
+    Args:
+        merge_list_items (bool): Whether to merge successive list items.
+            Defaults to True.
+        delim (str): Delimiter to use for merging text. Defaults to "\n".
+    """

-            text_entries: list[_HC._TextEntry] = []
-            if (
-                isinstance(item, Ref)
-                and item_type == _HC._NodeType.TABLE
-                and doc.tables
-            ):
-                # resolve table reference
-                ref_nr = int(item.ref.split("/")[2])  # e.g. '#/tables/0'
-                table = doc.tables[ref_nr]
-                ser_out = _HC._triplet_serialize(table)
-                if table.data:
-                    text_entries = (
-                        [
-                            self._TextEntry(
-                                text=ser_out,
-                                path=self._create_path(idx),
-                            )
-                        ]
-                        if ser_out
-                        else []
-                    )
-                else:
-                    return [], None
-            elif isinstance(item, BaseText):
-                text_entries = [
-                    self._TextEntry(
-                        text=item.text,
-                        path=self._create_path(idx),
-                    )
-                ]
+    merge_list_items: bool = True
+    delim: str = "\n"

-            # squash in any children of type list-item
-            if not rec:
-                if (
-                    c2p[idx].children
-                    and _HC._norm(doc.main_text[c2p[idx].children[0]].name)
-                    == _HC._NodeName.LIST_ITEM
-                ):
-                    text_entries = text_entries + [
-                        self._TextEntry(
-                            text=doc.main_text[c].text,  # type: ignore[union-attr]
-                            path=self._create_path(c),
-                        )
-                        for c in c2p[idx].children
-                        if isinstance(doc.main_text[c], BaseText)
-                        and _HC._norm(doc.main_text[c].name) == _HC._NodeName.LIST_ITEM
-                    ]
-                elif item_name in [
-                    _HC._NodeName.LIST_ITEM,
-                    _HC._NodeName.SUBTITLE_LEVEL_1,
-                ]:
-                    return [], None
+    @classmethod
+    def _triplet_serialize(cls, table_df: DataFrame) -> str:

-            if (parent := c2p[idx].parent) is not None:
-                # prepend with ancestors
+        # copy header as first row and shift all rows by one
+        table_df.loc[-1] = table_df.columns  # type: ignore[call-overload]
+        table_df.index = table_df.index + 1
+        table_df = table_df.sort_index()

-                parent_res = self._build_chunk_impl(
-                    doc=doc, doc_map=doc_map, idx=parent, rec=True
-                )
-                return (
-                    parent_res[0] + text_entries,  # expanded text
-                    parent_res[1],  # heading
-                )
-            else:
-                if (
-                    self.heading_as_metadata
-                    and isinstance(item, BaseText)
-                    and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
-                ):
-                    return [], text_entries[0].text
-                else:
-                    return text_entries, None
-        else:
-            return [], None
+        rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
+        cols = [item.strip() for item in table_df.iloc[0, :].to_list()]

-    def _build_chunk(
-        self,
-        doc: DLDocument,
-        doc_map: _DocContext,
-        idx: int,
-        delim: str,
-        rec: bool = False,
-    ) -> Optional[Chunk]:
-        res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
-        texts = res[0]
-        heading = res[1]
-        concat = delim.join([t.text for t in texts if t.text])
-        assert doc.main_text is not None
-        if len(concat) >= self.min_chunk_len:
-            orig_item = doc.main_text[idx]
-            item: Union[BaseText, Table]
-            if isinstance(orig_item, Ref):
-                if _HC._norm(orig_item.obj_type) == _HC._NodeType.TABLE and doc.tables:
-                    pos = int(orig_item.ref.split("/")[2])
-                    item = doc.tables[pos]
-                    path = self._create_path(pos, path_prefix="tables")
-                else:  # currently disregarding non-table references
-                    return None
-            else:
-                item = orig_item
-                path = self._create_path(idx)
+        nrows = table_df.shape[0]
+        ncols = table_df.shape[1]
+        texts = [
+            f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
+            for i in range(1, nrows)
+            for j in range(1, ncols)
+        ]
+        output_text = ". ".join(texts)

-            if self.include_metadata:
-                return ChunkWithMetadata(
-                    text=concat,
-                    path=path,
-                    heading=heading,
-                    page=item.prov[0].page if item.prov else None,
-                    bbox=item.prov[0].bbox if item.prov else None,
-                )
-            else:
-                return Chunk(
-                    text=concat,
-                    path=path,
-                    heading=heading,
-                )
-        else:
-            return None
+        return output_text

-    def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk]:
+    def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
         r"""Chunk the provided document.

         Args:
             dl_doc (DLDocument): document to chunk
-            delim (str, optional): delimiter to use when concatenating sub-items.
-                Defaults to "\n".

         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
-        if dl_doc.main_text:
-            # extract doc structure incl. metadata for
-            # each item (e.g. parent, children)
-            doc_ctx = self._DocContext.from_doc(doc=dl_doc)
-            _logger.debug(f"{doc_ctx.model_dump()=}")
+        heading_by_level: dict[LevelNumber, str] = {}
+        list_items: list[TextItem] = []
+        for item, level in dl_doc.iterate_items():
+            captions = None
+            if isinstance(item, DocItem):
+
+                # first handle any merging needed
+                if self.merge_list_items:
+                    if isinstance(
+                        item, ListItem
+                    ) or (  # TODO remove when all captured as ListItem:
+                        isinstance(item, TextItem)
+                        and item.label == DocItemLabel.LIST_ITEM
+                    ):
+                        list_items.append(item)
+                        continue
+                    elif list_items:  # need to yield
+                        yield DocChunk(
+                            text=self.delim.join([i.text for i in list_items]),
+                            meta=DocMeta(
+                                doc_items=list_items,
+                                headings=[
+                                    heading_by_level[k]
+                                    for k in sorted(heading_by_level)
+                                ]
+                                or None,
+                            ),
+                        )
+                        list_items = []  # reset

-            for i, item in enumerate(dl_doc.main_text):
-                if (
-                    isinstance(item, BaseText)
-                    or _HC._norm(item.obj_type) == _HC._NodeType.TABLE
+                if isinstance(
+                    item, SectionHeaderItem
+                ) or (  # TODO remove when all captured as SectionHeaderItem:
+                    isinstance(item, TextItem)
+                    and item.label == DocItemLabel.SECTION_HEADER
                 ):
-                    chunk = self._build_chunk(
-                        doc=dl_doc, doc_map=doc_ctx, idx=i, delim=delim
-                    )
-                    if chunk:
-                        _logger.info(f"{i=}, {chunk=}")
-                        yield chunk
-
-
-_HC = HierarchicalChunker
+                    # TODO second branch not needed once cleanup above complete:
+                    level = item.level if isinstance(item, SectionHeaderItem) else 1
+                    heading_by_level[level] = item.text
+
+                    # remove headings of higher level as they just went out of scope
+                    keys_to_del = [k for k in heading_by_level if k > level]
+                    for k in keys_to_del:
+                        heading_by_level.pop(k, None)
+                    continue
+
+                if isinstance(item, TextItem) or (
+                    (not self.merge_list_items) and isinstance(item, ListItem)
+                ):
+                    text = item.text
+                elif isinstance(item, TableItem):
+                    table_df = item.export_to_dataframe()
+                    if table_df.shape[0] < 1 or table_df.shape[1] < 2:
+                        # at least two cols needed, as first column contains row headers
+                        continue
+                    text = self._triplet_serialize(table_df=table_df)
+                    captions = [
+                        c.text for c in [r.resolve(dl_doc) for r in item.captions]
+                    ] or None
+                else:
+                    continue
+                c = DocChunk(
+                    text=text,
+                    meta=DocMeta(
+                        doc_items=[item],
+                        headings=[heading_by_level[k] for k in sorted(heading_by_level)]
+                        or None,
+                        captions=captions,
+                    ),
+                )
+                yield c
+
+        if self.merge_list_items and list_items:  # need to yield
+            yield DocChunk(
+                text=self.delim.join([i.text for i in list_items]),
+                meta=DocMeta(
+                    doc_items=list_items,
+                    headings=[heading_by_level[k] for k in sorted(heading_by_level)]
+                    or None,
+                ),
+            )
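
End-to-end, the reworked chunker is driven as follows (a sketch; it assumes a DoclingDocument previously serialized to a doc.json file, e.g. by docling, and model_validate is standard pydantic v2):

    import json

    from docling_core.transforms.chunker import HierarchicalChunker
    from docling_core.types.doc import DoclingDocument

    # assumption: doc.json holds a serialized DoclingDocument
    with open("doc.json") as f:
        doc = DoclingDocument.model_validate(json.load(f))

    chunker = HierarchicalChunker(merge_list_items=True, delim="\n")
    for chunk in chunker.chunk(dl_doc=doc):
        print(chunk.text)
        print(chunk.meta.headings)            # enclosing headings, or None
        print(chunk.meta.export_json_dict())  # JSON-ready dict without None fields
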
docling_core/types/__init__.py
@@ -5,21 +5,6 @@

 """Define the main types."""

-from docling_core.types.doc.base import BoundingBox  # noqa
-from docling_core.types.doc.base import Table  # noqa
-from docling_core.types.doc.base import TableCell  # noqa
-from docling_core.types.doc.base import (  # noqa
-    BaseCell,
-    BaseText,
-    PageDimensions,
-    PageReference,
-    Prov,
-    Ref,
-)
-from docling_core.types.doc.document import (  # noqa
-    CCSDocumentDescription as DocumentDescription,
-)
-from docling_core.types.doc.document import CCSFileInfoObject as FileInfoObject  # noqa
-from docling_core.types.doc.document import ExportedCCSDocument as Document  # noqa
-from docling_core.types.gen.generic import Generic  # noqa
-from docling_core.types.rec.record import Record  # noqa
+from docling_core.types.doc.document import DoclingDocument
+from docling_core.types.gen.generic import Generic
+from docling_core.types.rec.record import Record
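
The top-level namespace thus shrinks to the three core types. The v1 models are no longer re-exported here; based on the doc → legacy_doc moves listed above, they presumably remain importable from the new legacy package (the exact legacy_doc path is an assumption):

    # 2.0.1 top-level surface, per the hunk above
    from docling_core.types import DoclingDocument, Generic, Record

    # assumed new home of the v1 document model (formerly docling_core.types.doc.document)
    from docling_core.types.legacy_doc.document import ExportedCCSDocument
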
docling_core/types/doc/__init__.py
@@ -4,3 +4,28 @@
 #

 """Package for models defined by the Document type."""
+
+from .base import BoundingBox, CoordOrigin, Size
+from .document import (
+    DocItem,
+    DoclingDocument,
+    DocumentOrigin,
+    FloatingItem,
+    GroupItem,
+    ImageRef,
+    KeyValueItem,
+    NodeItem,
+    PageItem,
+    PictureClassificationClass,
+    PictureClassificationData,
+    PictureDataType,
+    PictureItem,
+    ProvenanceItem,
+    RefItem,
+    SectionHeaderItem,
+    TableCell,
+    TableData,
+    TableItem,
+    TextItem,
+)
+from .labels import DocItemLabel, GroupLabel, TableCellLabel
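
With these re-exports in place, imports that previously targeted the deleted experimental package collapse onto docling_core.types.doc (the 1.7.2 line is an assumption inferred from the removed experimental modules listed above):

    # docling-core 1.7.2 (removed in 2.0.1):
    # from docling_core.types.experimental import DoclingDocument

    # docling-core 2.0.1
    from docling_core.types.doc import (
        BoundingBox,
        DocItemLabel,
        DoclingDocument,
        TableItem,
        TextItem,
    )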