docling-core 1.7.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/__init__.py +2 -8
- docling_core/transforms/chunker/base.py +27 -40
- docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
- docling_core/types/__init__.py +12 -8
- docling_core/types/doc/__init__.py +25 -0
- docling_core/types/doc/base.py +136 -451
- docling_core/types/doc/document.py +1288 -559
- docling_core/types/{experimental → doc}/labels.py +4 -1
- docling_core/types/legacy_doc/__init__.py +6 -0
- docling_core/types/legacy_doc/base.py +485 -0
- docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
- docling_core/types/legacy_doc/document.py +715 -0
- docling_core/types/rec/subject.py +1 -1
- docling_core/utils/generate_docs.py +82 -0
- docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
- docling_core/utils/validators.py +3 -3
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/METADATA +10 -10
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
- docling_core-2.0.0.dist-info/entry_points.txt +5 -0
- docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core/transforms/id_generator/base.py +0 -30
- docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core/types/experimental/__init__.py +0 -30
- docling_core/types/experimental/base.py +0 -167
- docling_core/types/experimental/document.py +0 -1192
- docling_core/utils/ds_generate_docs.py +0 -144
- docling_core-1.7.2.dist-info/entry_points.txt +0 -5
- docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
|
@@ -5,11 +5,5 @@
|
|
|
5
5
|
|
|
6
6
|
"""Define the chunker types."""
|
|
7
7
|
|
|
8
|
-
from docling_core.transforms.chunker.base import
|
|
9
|
-
|
|
10
|
-
Chunk,
|
|
11
|
-
ChunkWithMetadata,
|
|
12
|
-
)
|
|
13
|
-
from docling_core.transforms.chunker.hierarchical_chunker import ( # noqa
|
|
14
|
-
HierarchicalChunker,
|
|
15
|
-
)
|
|
8
|
+
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
|
|
9
|
+
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
|
|
@@ -4,71 +4,58 @@
|
|
|
4
4
|
#
|
|
5
5
|
|
|
6
6
|
"""Define base classes for chunking."""
|
|
7
|
-
import re
|
|
8
7
|
from abc import ABC, abstractmethod
|
|
9
|
-
from typing import
|
|
8
|
+
from typing import Any, ClassVar, Iterator
|
|
10
9
|
|
|
11
|
-
from pydantic import BaseModel
|
|
10
|
+
from pydantic import BaseModel
|
|
12
11
|
|
|
13
|
-
from docling_core.types import
|
|
14
|
-
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
12
|
+
from docling_core.types.doc import DoclingDocument as DLDocument
|
|
15
13
|
|
|
16
|
-
# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
|
|
17
|
-
_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")
|
|
18
14
|
|
|
15
|
+
class BaseMeta(BaseModel):
|
|
16
|
+
"""Metadata base class."""
|
|
19
17
|
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
excluded_embed: ClassVar[list[str]] = []
|
|
19
|
+
excluded_llm: ClassVar[list[str]] = []
|
|
22
20
|
|
|
21
|
+
def export_json_dict(self) -> dict[str, Any]:
|
|
22
|
+
"""Helper method for exporting non-None keys to JSON mode.
|
|
23
23
|
|
|
24
|
-
|
|
25
|
-
|
|
24
|
+
Returns:
|
|
25
|
+
dict[str, Any]: The exported dictionary.
|
|
26
|
+
"""
|
|
27
|
+
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
|
|
26
28
|
|
|
27
|
-
path: str = Field(pattern=_JSON_POINTER_REGEX)
|
|
28
|
-
text: str
|
|
29
|
-
heading: Optional[str] = None
|
|
30
29
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def _json_pointer_from_json_path(cls, path: str):
|
|
34
|
-
if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
|
|
35
|
-
groups = match.groups()
|
|
36
|
-
if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
|
|
37
|
-
return _create_path(
|
|
38
|
-
pos=int(groups[1]),
|
|
39
|
-
path_prefix=groups[0],
|
|
40
|
-
)
|
|
41
|
-
return path
|
|
30
|
+
class BaseChunk(BaseModel):
|
|
31
|
+
"""Chunk base class."""
|
|
42
32
|
|
|
33
|
+
text: str
|
|
34
|
+
meta: BaseMeta
|
|
43
35
|
|
|
44
|
-
|
|
45
|
-
|
|
36
|
+
def export_json_dict(self) -> dict[str, Any]:
|
|
37
|
+
"""Helper method for exporting non-None keys to JSON mode.
|
|
46
38
|
|
|
47
|
-
|
|
48
|
-
|
|
39
|
+
Returns:
|
|
40
|
+
dict[str, Any]: The exported dictionary.
|
|
41
|
+
"""
|
|
42
|
+
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
|
|
49
43
|
|
|
50
44
|
|
|
51
45
|
class BaseChunker(BaseModel, ABC):
|
|
52
|
-
"""
|
|
46
|
+
"""Chunker base class."""
|
|
53
47
|
|
|
54
48
|
@abstractmethod
|
|
55
|
-
def chunk(self, dl_doc:
|
|
49
|
+
def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
|
|
56
50
|
"""Chunk the provided document.
|
|
57
51
|
|
|
58
52
|
Args:
|
|
59
|
-
dl_doc (
|
|
53
|
+
dl_doc (DLDocument): document to chunk
|
|
60
54
|
|
|
61
55
|
Raises:
|
|
62
56
|
NotImplementedError: in this abstract implementation
|
|
63
57
|
|
|
64
58
|
Yields:
|
|
65
|
-
Iterator[
|
|
59
|
+
Iterator[BaseChunk]: iterator over extracted chunks
|
|
66
60
|
"""
|
|
67
61
|
raise NotImplementedError()
|
|
68
|
-
|
|
69
|
-
@classmethod
|
|
70
|
-
def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
|
|
71
|
-
return _create_path(
|
|
72
|
-
pos=pos,
|
|
73
|
-
path_prefix=path_prefix,
|
|
74
|
-
)
|
|
@@ -8,347 +8,179 @@
|
|
|
8
8
|
from __future__ import annotations
|
|
9
9
|
|
|
10
10
|
import logging
|
|
11
|
-
from
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
from docling_core.
|
|
18
|
-
from docling_core.types import
|
|
19
|
-
|
|
20
|
-
|
|
11
|
+
from typing import Any, ClassVar, Iterator, Optional
|
|
12
|
+
|
|
13
|
+
from pandas import DataFrame
|
|
14
|
+
from pydantic import Field
|
|
15
|
+
|
|
16
|
+
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
|
|
17
|
+
from docling_core.types.doc import DoclingDocument as DLDocument
|
|
18
|
+
from docling_core.types.doc.document import (
|
|
19
|
+
DocItem,
|
|
20
|
+
LevelNumber,
|
|
21
|
+
ListItem,
|
|
22
|
+
SectionHeaderItem,
|
|
23
|
+
TableItem,
|
|
24
|
+
TextItem,
|
|
25
|
+
)
|
|
26
|
+
from docling_core.types.doc.labels import DocItemLabel
|
|
27
|
+
|
|
28
|
+
_KEY_DOC_ITEMS = "doc_items"
|
|
29
|
+
_KEY_HEADINGS = "headings"
|
|
30
|
+
_KEY_CAPTIONS = "captions"
|
|
21
31
|
|
|
22
32
|
_logger = logging.getLogger(__name__)
|
|
23
33
|
|
|
24
34
|
|
|
25
|
-
class
|
|
26
|
-
"""
|
|
35
|
+
class DocMeta(BaseMeta):
|
|
36
|
+
"""Data model for Hierarchical Chunker metadata."""
|
|
27
37
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
38
|
+
doc_items: list[DocItem] = Field(
|
|
39
|
+
alias=_KEY_DOC_ITEMS,
|
|
40
|
+
min_length=1,
|
|
31
41
|
)
|
|
32
|
-
|
|
33
|
-
default=
|
|
34
|
-
|
|
42
|
+
headings: Optional[list[str]] = Field(
|
|
43
|
+
default=None,
|
|
44
|
+
alias=_KEY_HEADINGS,
|
|
45
|
+
min_length=1,
|
|
35
46
|
)
|
|
36
|
-
|
|
37
|
-
default=
|
|
47
|
+
captions: Optional[list[str]] = Field(
|
|
48
|
+
default=None,
|
|
49
|
+
alias=_KEY_CAPTIONS,
|
|
50
|
+
min_length=1,
|
|
38
51
|
)
|
|
39
52
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
SUBTITLE_LEVEL_1 = "subtitle-level-1"
|
|
43
|
-
TABLE = "table"
|
|
44
|
-
CAPTION = "caption"
|
|
45
|
-
|
|
46
|
-
class _NodeName(str, Enum):
|
|
47
|
-
TITLE = "title"
|
|
48
|
-
REFERENCE = "reference"
|
|
49
|
-
LIST_ITEM = "list-item"
|
|
50
|
-
SUBTITLE_LEVEL_1 = "subtitle-level-1"
|
|
51
|
-
|
|
52
|
-
_allowed_types: list[str] = [
|
|
53
|
-
_NodeType.PARAGRAPH,
|
|
54
|
-
_NodeType.SUBTITLE_LEVEL_1,
|
|
55
|
-
_NodeType.TABLE,
|
|
56
|
-
_NodeType.CAPTION,
|
|
57
|
-
]
|
|
58
|
-
_disallowed_names_by_type: dict[str, list[str]] = {
|
|
59
|
-
_NodeType.PARAGRAPH: [
|
|
60
|
-
_NodeName.REFERENCE,
|
|
61
|
-
],
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
@classmethod
|
|
65
|
-
def _norm(cls, text: Optional[str]) -> Optional[str]:
|
|
66
|
-
return text.lower() if text is not None else None
|
|
67
|
-
|
|
68
|
-
@classmethod
|
|
69
|
-
def _convert_table_to_dataframe(cls, table: Table) -> Optional[pd.DataFrame]:
|
|
70
|
-
if table.data:
|
|
71
|
-
table_content = [[cell.text for cell in row] for row in table.data]
|
|
72
|
-
return pd.DataFrame(table_content)
|
|
73
|
-
else:
|
|
74
|
-
return None
|
|
75
|
-
|
|
76
|
-
@classmethod
|
|
77
|
-
def _triplet_serialize(cls, table) -> Optional[str]:
|
|
78
|
-
output_text: Optional[str] = None
|
|
79
|
-
table_df = cls._convert_table_to_dataframe(table)
|
|
80
|
-
if table_df is not None and table_df.shape[0] > 1 and table_df.shape[1] > 1:
|
|
81
|
-
rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
|
|
82
|
-
cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
|
|
83
|
-
nrows = table_df.shape[0]
|
|
84
|
-
ncols = table_df.shape[1]
|
|
85
|
-
texts = [
|
|
86
|
-
f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
|
|
87
|
-
for i in range(1, nrows)
|
|
88
|
-
for j in range(1, ncols)
|
|
89
|
-
]
|
|
90
|
-
output_text = ". ".join(texts)
|
|
91
|
-
|
|
92
|
-
return output_text
|
|
93
|
-
|
|
94
|
-
class _MainTextItemNode(BaseModel):
|
|
95
|
-
parent: Optional[int] = None
|
|
96
|
-
children: list[int] = []
|
|
97
|
-
|
|
98
|
-
class _TitleInfo(BaseModel):
|
|
99
|
-
text: str
|
|
100
|
-
path_in_doc: str
|
|
101
|
-
|
|
102
|
-
class _GlobalContext(BaseModel):
|
|
103
|
-
title: Optional[_HC._TitleInfo] = None
|
|
104
|
-
|
|
105
|
-
class _DocContext(BaseModel):
|
|
106
|
-
dmap: dict[int, _HC._MainTextItemNode] # main text element context
|
|
107
|
-
glob: _HC._GlobalContext # global context
|
|
108
|
-
|
|
109
|
-
@classmethod
|
|
110
|
-
def from_doc(cls, doc: DLDocument) -> _HC._DocContext:
|
|
111
|
-
dmap: dict[int, _HC._MainTextItemNode] = {}
|
|
112
|
-
glob: _HC._GlobalContext = _HC._GlobalContext()
|
|
113
|
-
if doc.description.title:
|
|
114
|
-
glob.title = _HC._TitleInfo(
|
|
115
|
-
text=doc.description.title,
|
|
116
|
-
path_in_doc="description.title",
|
|
117
|
-
)
|
|
118
|
-
|
|
119
|
-
parent = None
|
|
120
|
-
if doc.main_text:
|
|
121
|
-
idx = 0
|
|
122
|
-
while idx < len(doc.main_text):
|
|
123
|
-
item = doc.main_text[idx]
|
|
124
|
-
if (
|
|
125
|
-
not glob.title
|
|
126
|
-
and isinstance(item, BaseText)
|
|
127
|
-
and _HC._norm(item.name) == _HC._NodeName.TITLE
|
|
128
|
-
):
|
|
129
|
-
glob.title = _HC._TitleInfo(
|
|
130
|
-
text=item.text,
|
|
131
|
-
path_in_doc=_HC._create_path(idx),
|
|
132
|
-
)
|
|
133
|
-
|
|
134
|
-
# start of a subtitle-level-1 parent
|
|
135
|
-
if (
|
|
136
|
-
isinstance(item, BaseText)
|
|
137
|
-
and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
|
|
138
|
-
):
|
|
139
|
-
dmap[idx] = _HC._MainTextItemNode(parent=None)
|
|
140
|
-
parent = idx
|
|
141
|
-
if not glob.title:
|
|
142
|
-
glob.title = _HC._TitleInfo(
|
|
143
|
-
text=item.text,
|
|
144
|
-
path_in_doc=_HC._create_path(idx),
|
|
145
|
-
)
|
|
146
|
-
|
|
147
|
-
# start of a list parent
|
|
148
|
-
elif (
|
|
149
|
-
isinstance(item, BaseText)
|
|
150
|
-
and _HC._norm(item.name) != _HC._NodeName.LIST_ITEM
|
|
151
|
-
and idx + 1 < len(doc.main_text)
|
|
152
|
-
and _HC._norm(doc.main_text[idx + 1].name)
|
|
153
|
-
== _HC._NodeName.LIST_ITEM
|
|
154
|
-
):
|
|
155
|
-
if parent is not None:
|
|
156
|
-
dmap[parent].children.append(idx)
|
|
157
|
-
dmap[idx] = _HC._MainTextItemNode(parent=parent)
|
|
53
|
+
excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
|
|
54
|
+
excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
|
|
158
55
|
|
|
159
|
-
# have all children register locally
|
|
160
|
-
li = idx + 1
|
|
161
|
-
while (
|
|
162
|
-
li < len(doc.main_text)
|
|
163
|
-
and _HC._norm(doc.main_text[li].name)
|
|
164
|
-
== _HC._NodeName.LIST_ITEM
|
|
165
|
-
):
|
|
166
|
-
dmap[idx].children.append(li)
|
|
167
|
-
dmap[li] = _HC._MainTextItemNode(parent=idx)
|
|
168
|
-
li += 1
|
|
169
|
-
idx = li
|
|
170
|
-
continue
|
|
171
56
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
if parent is not None:
|
|
175
|
-
dmap[parent].children.append(idx)
|
|
176
|
-
dmap[idx] = _HC._MainTextItemNode(parent=parent)
|
|
57
|
+
class DocChunk(BaseChunk):
|
|
58
|
+
"""Data model for Hierarchical Chunker chunks."""
|
|
177
59
|
|
|
178
|
-
|
|
179
|
-
else:
|
|
180
|
-
pass
|
|
181
|
-
return cls(
|
|
182
|
-
dmap=dmap,
|
|
183
|
-
glob=glob,
|
|
184
|
-
)
|
|
60
|
+
meta: DocMeta
|
|
185
61
|
|
|
186
|
-
class _TextEntry(BaseModel):
|
|
187
|
-
text: str
|
|
188
|
-
path: str
|
|
189
62
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
) -> tuple[list[_TextEntry], Optional[str]]:
|
|
193
|
-
if doc.main_text:
|
|
194
|
-
item = doc.main_text[idx]
|
|
195
|
-
item_type = _HC._norm(item.obj_type)
|
|
196
|
-
item_name = _HC._norm(item.name)
|
|
197
|
-
if (
|
|
198
|
-
item_type not in self._allowed_types
|
|
199
|
-
or item_name in self._disallowed_names_by_type.get(item_type, [])
|
|
200
|
-
):
|
|
201
|
-
return [], None
|
|
63
|
+
class HierarchicalChunker(BaseChunker):
|
|
64
|
+
r"""Chunker implementation leveraging the document layout.
|
|
202
65
|
|
|
203
|
-
|
|
66
|
+
Args:
|
|
67
|
+
merge_list_items (bool): Whether to merge successive list items.
|
|
68
|
+
Defaults to True.
|
|
69
|
+
delim (str): Delimiter to use for merging text. Defaults to "\n".
|
|
70
|
+
"""
|
|
204
71
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
isinstance(item, Ref)
|
|
208
|
-
and item_type == _HC._NodeType.TABLE
|
|
209
|
-
and doc.tables
|
|
210
|
-
):
|
|
211
|
-
# resolve table reference
|
|
212
|
-
ref_nr = int(item.ref.split("/")[2]) # e.g. '#/tables/0'
|
|
213
|
-
table = doc.tables[ref_nr]
|
|
214
|
-
ser_out = _HC._triplet_serialize(table)
|
|
215
|
-
if table.data:
|
|
216
|
-
text_entries = (
|
|
217
|
-
[
|
|
218
|
-
self._TextEntry(
|
|
219
|
-
text=ser_out,
|
|
220
|
-
path=self._create_path(idx),
|
|
221
|
-
)
|
|
222
|
-
]
|
|
223
|
-
if ser_out
|
|
224
|
-
else []
|
|
225
|
-
)
|
|
226
|
-
else:
|
|
227
|
-
return [], None
|
|
228
|
-
elif isinstance(item, BaseText):
|
|
229
|
-
text_entries = [
|
|
230
|
-
self._TextEntry(
|
|
231
|
-
text=item.text,
|
|
232
|
-
path=self._create_path(idx),
|
|
233
|
-
)
|
|
234
|
-
]
|
|
72
|
+
merge_list_items: bool = True
|
|
73
|
+
delim: str = "\n"
|
|
235
74
|
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
if (
|
|
239
|
-
c2p[idx].children
|
|
240
|
-
and _HC._norm(doc.main_text[c2p[idx].children[0]].name)
|
|
241
|
-
== _HC._NodeName.LIST_ITEM
|
|
242
|
-
):
|
|
243
|
-
text_entries = text_entries + [
|
|
244
|
-
self._TextEntry(
|
|
245
|
-
text=doc.main_text[c].text, # type: ignore[union-attr]
|
|
246
|
-
path=self._create_path(c),
|
|
247
|
-
)
|
|
248
|
-
for c in c2p[idx].children
|
|
249
|
-
if isinstance(doc.main_text[c], BaseText)
|
|
250
|
-
and _HC._norm(doc.main_text[c].name) == _HC._NodeName.LIST_ITEM
|
|
251
|
-
]
|
|
252
|
-
elif item_name in [
|
|
253
|
-
_HC._NodeName.LIST_ITEM,
|
|
254
|
-
_HC._NodeName.SUBTITLE_LEVEL_1,
|
|
255
|
-
]:
|
|
256
|
-
return [], None
|
|
75
|
+
@classmethod
|
|
76
|
+
def _triplet_serialize(cls, table_df: DataFrame) -> str:
|
|
257
77
|
|
|
258
|
-
|
|
259
|
-
|
|
78
|
+
# copy header as first row and shift all rows by one
|
|
79
|
+
table_df.loc[-1] = table_df.columns # type: ignore[call-overload]
|
|
80
|
+
table_df.index = table_df.index + 1
|
|
81
|
+
table_df = table_df.sort_index()
|
|
260
82
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
)
|
|
264
|
-
return (
|
|
265
|
-
parent_res[0] + text_entries, # expanded text
|
|
266
|
-
parent_res[1], # heading
|
|
267
|
-
)
|
|
268
|
-
else:
|
|
269
|
-
if (
|
|
270
|
-
self.heading_as_metadata
|
|
271
|
-
and isinstance(item, BaseText)
|
|
272
|
-
and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
|
|
273
|
-
):
|
|
274
|
-
return [], text_entries[0].text
|
|
275
|
-
else:
|
|
276
|
-
return text_entries, None
|
|
277
|
-
else:
|
|
278
|
-
return [], None
|
|
83
|
+
rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
|
|
84
|
+
cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
|
|
279
85
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
|
|
289
|
-
texts = res[0]
|
|
290
|
-
heading = res[1]
|
|
291
|
-
concat = delim.join([t.text for t in texts if t.text])
|
|
292
|
-
assert doc.main_text is not None
|
|
293
|
-
if len(concat) >= self.min_chunk_len:
|
|
294
|
-
orig_item = doc.main_text[idx]
|
|
295
|
-
item: Union[BaseText, Table]
|
|
296
|
-
if isinstance(orig_item, Ref):
|
|
297
|
-
if _HC._norm(orig_item.obj_type) == _HC._NodeType.TABLE and doc.tables:
|
|
298
|
-
pos = int(orig_item.ref.split("/")[2])
|
|
299
|
-
item = doc.tables[pos]
|
|
300
|
-
path = self._create_path(pos, path_prefix="tables")
|
|
301
|
-
else: # currently disregarding non-table references
|
|
302
|
-
return None
|
|
303
|
-
else:
|
|
304
|
-
item = orig_item
|
|
305
|
-
path = self._create_path(idx)
|
|
86
|
+
nrows = table_df.shape[0]
|
|
87
|
+
ncols = table_df.shape[1]
|
|
88
|
+
texts = [
|
|
89
|
+
f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
|
|
90
|
+
for i in range(1, nrows)
|
|
91
|
+
for j in range(1, ncols)
|
|
92
|
+
]
|
|
93
|
+
output_text = ". ".join(texts)
|
|
306
94
|
|
|
307
|
-
|
|
308
|
-
return ChunkWithMetadata(
|
|
309
|
-
text=concat,
|
|
310
|
-
path=path,
|
|
311
|
-
heading=heading,
|
|
312
|
-
page=item.prov[0].page if item.prov else None,
|
|
313
|
-
bbox=item.prov[0].bbox if item.prov else None,
|
|
314
|
-
)
|
|
315
|
-
else:
|
|
316
|
-
return Chunk(
|
|
317
|
-
text=concat,
|
|
318
|
-
path=path,
|
|
319
|
-
heading=heading,
|
|
320
|
-
)
|
|
321
|
-
else:
|
|
322
|
-
return None
|
|
95
|
+
return output_text
|
|
323
96
|
|
|
324
|
-
def chunk(self, dl_doc: DLDocument,
|
|
97
|
+
def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
|
|
325
98
|
r"""Chunk the provided document.
|
|
326
99
|
|
|
327
100
|
Args:
|
|
328
101
|
dl_doc (DLDocument): document to chunk
|
|
329
|
-
delim (str, optional): delimiter to use when concatenating sub-items.
|
|
330
|
-
Defaults to "\n".
|
|
331
102
|
|
|
332
103
|
Yields:
|
|
333
104
|
Iterator[Chunk]: iterator over extracted chunks
|
|
334
105
|
"""
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
106
|
+
heading_by_level: dict[LevelNumber, str] = {}
|
|
107
|
+
list_items: list[TextItem] = []
|
|
108
|
+
for item, level in dl_doc.iterate_items():
|
|
109
|
+
captions = None
|
|
110
|
+
if isinstance(item, DocItem):
|
|
111
|
+
|
|
112
|
+
# first handle any merging needed
|
|
113
|
+
if self.merge_list_items:
|
|
114
|
+
if isinstance(
|
|
115
|
+
item, ListItem
|
|
116
|
+
) or ( # TODO remove when all captured as ListItem:
|
|
117
|
+
isinstance(item, TextItem)
|
|
118
|
+
and item.label == DocItemLabel.LIST_ITEM
|
|
119
|
+
):
|
|
120
|
+
list_items.append(item)
|
|
121
|
+
continue
|
|
122
|
+
elif list_items: # need to yield
|
|
123
|
+
yield DocChunk(
|
|
124
|
+
text=self.delim.join([i.text for i in list_items]),
|
|
125
|
+
meta=DocMeta(
|
|
126
|
+
doc_items=list_items,
|
|
127
|
+
headings=[
|
|
128
|
+
heading_by_level[k]
|
|
129
|
+
for k in sorted(heading_by_level)
|
|
130
|
+
]
|
|
131
|
+
or None,
|
|
132
|
+
),
|
|
133
|
+
)
|
|
134
|
+
list_items = [] # reset
|
|
340
135
|
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
136
|
+
if isinstance(
|
|
137
|
+
item, SectionHeaderItem
|
|
138
|
+
) or ( # TODO remove when all captured as SectionHeaderItem:
|
|
139
|
+
isinstance(item, TextItem)
|
|
140
|
+
and item.label == DocItemLabel.SECTION_HEADER
|
|
345
141
|
):
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
142
|
+
# TODO second branch not needed once cleanup above complete:
|
|
143
|
+
level = item.level if isinstance(item, SectionHeaderItem) else 1
|
|
144
|
+
heading_by_level[level] = item.text
|
|
145
|
+
|
|
146
|
+
# remove headings of higher level as they just went out of scope
|
|
147
|
+
keys_to_del = [k for k in heading_by_level if k > level]
|
|
148
|
+
for k in keys_to_del:
|
|
149
|
+
heading_by_level.pop(k, None)
|
|
150
|
+
continue
|
|
151
|
+
|
|
152
|
+
if isinstance(item, TextItem) or (
|
|
153
|
+
(not self.merge_list_items) and isinstance(item, ListItem)
|
|
154
|
+
):
|
|
155
|
+
text = item.text
|
|
156
|
+
elif isinstance(item, TableItem):
|
|
157
|
+
table_df = item.export_to_dataframe()
|
|
158
|
+
if table_df.shape[0] < 1 or table_df.shape[1] < 2:
|
|
159
|
+
# at least two cols needed, as first column contains row headers
|
|
160
|
+
continue
|
|
161
|
+
text = self._triplet_serialize(table_df=table_df)
|
|
162
|
+
captions = [
|
|
163
|
+
c.text for c in [r.resolve(dl_doc) for r in item.captions]
|
|
164
|
+
] or None
|
|
165
|
+
else:
|
|
166
|
+
continue
|
|
167
|
+
c = DocChunk(
|
|
168
|
+
text=text,
|
|
169
|
+
meta=DocMeta(
|
|
170
|
+
doc_items=[item],
|
|
171
|
+
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
|
|
172
|
+
or None,
|
|
173
|
+
captions=captions,
|
|
174
|
+
),
|
|
175
|
+
)
|
|
176
|
+
yield c
|
|
177
|
+
|
|
178
|
+
if self.merge_list_items and list_items: # need to yield
|
|
179
|
+
yield DocChunk(
|
|
180
|
+
text=self.delim.join([i.text for i in list_items]),
|
|
181
|
+
meta=DocMeta(
|
|
182
|
+
doc_items=list_items,
|
|
183
|
+
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
|
|
184
|
+
or None,
|
|
185
|
+
),
|
|
186
|
+
)
|
docling_core/types/__init__.py
CHANGED
|
@@ -5,10 +5,11 @@
|
|
|
5
5
|
|
|
6
6
|
"""Define the main types."""
|
|
7
7
|
|
|
8
|
-
from docling_core.types.
|
|
9
|
-
from docling_core.types.
|
|
10
|
-
from docling_core.types.
|
|
11
|
-
from docling_core.types.
|
|
8
|
+
from docling_core.types.gen.generic import Generic # noqa
|
|
9
|
+
from docling_core.types.legacy_doc.base import BoundingBox # noqa
|
|
10
|
+
from docling_core.types.legacy_doc.base import Table # noqa
|
|
11
|
+
from docling_core.types.legacy_doc.base import TableCell # noqa
|
|
12
|
+
from docling_core.types.legacy_doc.base import ( # noqa
|
|
12
13
|
BaseCell,
|
|
13
14
|
BaseText,
|
|
14
15
|
PageDimensions,
|
|
@@ -16,10 +17,13 @@ from docling_core.types.doc.base import ( # noqa
|
|
|
16
17
|
Prov,
|
|
17
18
|
Ref,
|
|
18
19
|
)
|
|
19
|
-
from docling_core.types.
|
|
20
|
+
from docling_core.types.legacy_doc.document import ( # noqa
|
|
20
21
|
CCSDocumentDescription as DocumentDescription,
|
|
21
22
|
)
|
|
22
|
-
from docling_core.types.
|
|
23
|
-
|
|
24
|
-
|
|
23
|
+
from docling_core.types.legacy_doc.document import ( # noqa
|
|
24
|
+
CCSFileInfoObject as FileInfoObject,
|
|
25
|
+
)
|
|
26
|
+
from docling_core.types.legacy_doc.document import ( # noqa
|
|
27
|
+
ExportedCCSDocument as Document,
|
|
28
|
+
)
|
|
25
29
|
from docling_core.types.rec.record import Record # noqa
|
|
@@ -4,3 +4,28 @@
|
|
|
4
4
|
#
|
|
5
5
|
|
|
6
6
|
"""Package for models defined by the Document type."""
|
|
7
|
+
|
|
8
|
+
from .base import BoundingBox, CoordOrigin, Size
|
|
9
|
+
from .document import (
|
|
10
|
+
DocItem,
|
|
11
|
+
DoclingDocument,
|
|
12
|
+
DocumentOrigin,
|
|
13
|
+
FloatingItem,
|
|
14
|
+
GroupItem,
|
|
15
|
+
ImageRef,
|
|
16
|
+
KeyValueItem,
|
|
17
|
+
NodeItem,
|
|
18
|
+
PageItem,
|
|
19
|
+
PictureClassificationClass,
|
|
20
|
+
PictureClassificationData,
|
|
21
|
+
PictureDataType,
|
|
22
|
+
PictureItem,
|
|
23
|
+
ProvenanceItem,
|
|
24
|
+
RefItem,
|
|
25
|
+
SectionHeaderItem,
|
|
26
|
+
TableCell,
|
|
27
|
+
TableData,
|
|
28
|
+
TableItem,
|
|
29
|
+
TextItem,
|
|
30
|
+
)
|
|
31
|
+
from .labels import DocItemLabel, GroupLabel, TableCellLabel
|