docling-core 2.14.0__tar.gz → 2.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.14.0 → docling_core-2.15.0}/PKG-INFO +1 -1
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/transforms/chunker/base.py +1 -1
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/transforms/chunker/hybrid_chunker.py +81 -108
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/doc/__init__.py +1 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/doc/base.py +2 -2
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/doc/document.py +68 -5
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/doc/labels.py +66 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/pyproject.toml +11 -1
- {docling_core-2.14.0 → docling_core-2.15.0}/LICENSE +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/README.md +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/py.typed +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/search/package.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/base.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.14.0 → docling_core-2.15.0}/docling_core/utils/validators.py +0 -0
|
@@ -51,7 +51,7 @@ class BaseChunker(BaseModel, ABC):
|
|
|
51
51
|
delim: str = DFLT_DELIM
|
|
52
52
|
|
|
53
53
|
@abstractmethod
|
|
54
|
-
def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
|
|
54
|
+
def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
|
|
55
55
|
"""Chunk the provided document.
|
|
56
56
|
|
|
57
57
|
Args:
|
{docling_core-2.14.0 → docling_core-2.15.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"""Hybrid chunker implementation leveraging both doc structure & token awareness."""
|
|
7
7
|
|
|
8
8
|
import warnings
|
|
9
|
-
from typing import Iterable, Iterator, Optional, Union
|
|
9
|
+
from typing import Any, Iterable, Iterator, Optional, Union
|
|
10
10
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
|
|
12
12
|
from typing_extensions import Self
|
|
@@ -65,13 +65,13 @@ class HybridChunker(BaseChunker):
|
|
|
65
65
|
)
|
|
66
66
|
return self
|
|
67
67
|
|
|
68
|
-
def _count_tokens(self, text: Optional[Union[str, list[str]]]):
|
|
68
|
+
def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
|
|
69
69
|
if text is None:
|
|
70
70
|
return 0
|
|
71
71
|
elif isinstance(text, list):
|
|
72
72
|
total = 0
|
|
73
73
|
for t in text:
|
|
74
|
-
total += self._count_tokens(t)
|
|
74
|
+
total += self._count_text_tokens(t)
|
|
75
75
|
return total
|
|
76
76
|
return len(self._tokenizer.tokenize(text, max_length=None))
|
|
77
77
|
|
|
@@ -80,11 +80,13 @@ class HybridChunker(BaseChunker):
|
|
|
80
80
|
text_len: int
|
|
81
81
|
other_len: int
|
|
82
82
|
|
|
83
|
+
def _count_chunk_tokens(self, doc_chunk: DocChunk):
|
|
84
|
+
ser_txt = self.serialize(chunk=doc_chunk)
|
|
85
|
+
return len(self._tokenizer.tokenize(text=ser_txt, max_length=None))
|
|
86
|
+
|
|
83
87
|
def _doc_chunk_length(self, doc_chunk: DocChunk):
|
|
84
|
-
text_length = self._count_tokens(doc_chunk.text)
|
|
85
|
-
|
|
86
|
-
captions_length = self._count_tokens(doc_chunk.meta.captions)
|
|
87
|
-
total = text_length + headings_length + captions_length
|
|
88
|
+
text_length = self._count_text_tokens(doc_chunk.text)
|
|
89
|
+
total = self._count_chunk_tokens(doc_chunk=doc_chunk)
|
|
88
90
|
return self._ChunkLengthInfo(
|
|
89
91
|
total_len=total,
|
|
90
92
|
text_len=text_length,
|
|
@@ -92,90 +94,69 @@ class HybridChunker(BaseChunker):
|
|
|
92
94
|
)
|
|
93
95
|
|
|
94
96
|
def _make_chunk_from_doc_items(
|
|
95
|
-
self, doc_chunk: DocChunk, window_text: str, window_start: int, window_end: int
|
|
97
|
+
self, doc_chunk: DocChunk, window_start: int, window_end: int
|
|
96
98
|
):
|
|
99
|
+
doc_items = doc_chunk.meta.doc_items[window_start : window_end + 1]
|
|
97
100
|
meta = DocMeta(
|
|
98
|
-
doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
|
|
101
|
+
doc_items=doc_items,
|
|
99
102
|
headings=doc_chunk.meta.headings,
|
|
100
103
|
captions=doc_chunk.meta.captions,
|
|
101
104
|
origin=doc_chunk.meta.origin,
|
|
102
105
|
)
|
|
106
|
+
window_text = (
|
|
107
|
+
doc_chunk.text
|
|
108
|
+
if len(doc_chunk.meta.doc_items) == 1
|
|
109
|
+
else self.delim.join(
|
|
110
|
+
[
|
|
111
|
+
doc_item.text
|
|
112
|
+
for doc_item in doc_items
|
|
113
|
+
if isinstance(doc_item, TextItem)
|
|
114
|
+
]
|
|
115
|
+
)
|
|
116
|
+
)
|
|
103
117
|
new_chunk = DocChunk(text=window_text, meta=meta)
|
|
104
118
|
return new_chunk
|
|
105
119
|
|
|
106
|
-
def _merge_text(self, t1, t2):
|
|
107
|
-
if t1 == "":
|
|
108
|
-
return t2
|
|
109
|
-
elif t2 == "":
|
|
110
|
-
return t1
|
|
111
|
-
else:
|
|
112
|
-
return f"{t1}{self.delim}{t2}"
|
|
113
|
-
|
|
114
120
|
def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
while window_end < num_items:
|
|
129
|
-
doc_item = doc_chunk.meta.doc_items[window_end]
|
|
130
|
-
if isinstance(doc_item, TextItem):
|
|
131
|
-
text = doc_item.text
|
|
132
|
-
else:
|
|
133
|
-
raise RuntimeError("Non-TextItem split not implemented yet")
|
|
134
|
-
text_length = self._count_tokens(text)
|
|
135
|
-
if (
|
|
136
|
-
text_length + window_text_length + other_length < self.max_tokens
|
|
137
|
-
and window_end < num_items - 1
|
|
138
|
-
):
|
|
121
|
+
chunks = []
|
|
122
|
+
window_start = 0
|
|
123
|
+
window_end = 0 # an inclusive index
|
|
124
|
+
num_items = len(doc_chunk.meta.doc_items)
|
|
125
|
+
while window_end < num_items:
|
|
126
|
+
new_chunk = self._make_chunk_from_doc_items(
|
|
127
|
+
doc_chunk=doc_chunk,
|
|
128
|
+
window_start=window_start,
|
|
129
|
+
window_end=window_end,
|
|
130
|
+
)
|
|
131
|
+
if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens:
|
|
132
|
+
if window_end < num_items - 1:
|
|
133
|
+
window_end += 1
|
|
139
134
|
# Still room left to add more to this chunk AND still at least one
|
|
140
135
|
# item left
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
window_text = self._merge_text(window_text, text)
|
|
144
|
-
elif text_length + window_text_length + other_length < self.max_tokens:
|
|
136
|
+
continue
|
|
137
|
+
else:
|
|
145
138
|
# All the items in the window fit into the chunk and there are no
|
|
146
139
|
# other items left
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
# Multiple items in the window but they don't fit into the chunk.
|
|
168
|
-
# However, the existing items must have fit or we wouldn't have
|
|
169
|
-
# gotten here. So we put everything but the last item into the chunk
|
|
170
|
-
# and then start a new window INCLUDING the current window end.
|
|
171
|
-
new_chunk = self._make_chunk_from_doc_items(
|
|
172
|
-
doc_chunk, window_text, window_start, window_end - 1
|
|
173
|
-
)
|
|
174
|
-
chunks.append(new_chunk)
|
|
175
|
-
window_start = window_end
|
|
176
|
-
window_text = ""
|
|
177
|
-
window_text_length = 0
|
|
178
|
-
return chunks
|
|
140
|
+
window_end = num_items # signalizing the last loop
|
|
141
|
+
elif window_start == window_end:
|
|
142
|
+
# Only one item in the window and it doesn't fit into the chunk. So
|
|
143
|
+
# we'll just make it a chunk for now and it will get split in the
|
|
144
|
+
# plain text splitter.
|
|
145
|
+
window_end += 1
|
|
146
|
+
window_start = window_end
|
|
147
|
+
else:
|
|
148
|
+
# Multiple items in the window but they don't fit into the chunk.
|
|
149
|
+
# However, the existing items must have fit or we wouldn't have
|
|
150
|
+
# gotten here. So we put everything but the last item into the chunk
|
|
151
|
+
# and then start a new window INCLUDING the current window end.
|
|
152
|
+
new_chunk = self._make_chunk_from_doc_items(
|
|
153
|
+
doc_chunk=doc_chunk,
|
|
154
|
+
window_start=window_start,
|
|
155
|
+
window_end=window_end - 1,
|
|
156
|
+
)
|
|
157
|
+
window_start = window_end
|
|
158
|
+
chunks.append(new_chunk)
|
|
159
|
+
return chunks
|
|
179
160
|
|
|
180
161
|
def _split_using_plain_text(
|
|
181
162
|
self,
|
|
@@ -204,36 +185,38 @@ class HybridChunker(BaseChunker):
|
|
|
204
185
|
def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
|
|
205
186
|
output_chunks = []
|
|
206
187
|
window_start = 0
|
|
207
|
-
window_end = 0
|
|
188
|
+
window_end = 0 # an inclusive index
|
|
208
189
|
num_chunks = len(chunks)
|
|
209
190
|
while window_end < num_chunks:
|
|
210
191
|
chunk = chunks[window_end]
|
|
211
|
-
lengths = self._doc_chunk_length(chunk)
|
|
212
192
|
headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
|
|
213
193
|
ready_to_append = False
|
|
214
194
|
if window_start == window_end:
|
|
215
|
-
# starting a new block of chunks to potentially merge
|
|
216
195
|
current_headings_and_captions = headings_and_captions
|
|
217
|
-
window_text = chunk.text
|
|
218
|
-
window_other_length = lengths.other_len
|
|
219
|
-
window_text_length = lengths.text_len
|
|
220
|
-
window_items = chunk.meta.doc_items
|
|
221
196
|
window_end += 1
|
|
222
197
|
first_chunk_of_window = chunk
|
|
223
|
-
elif (
|
|
224
|
-
headings_and_captions == current_headings_and_captions
|
|
225
|
-
and window_text_length + window_other_length + lengths.text_len
|
|
226
|
-
<= self.max_tokens
|
|
227
|
-
):
|
|
228
|
-
# there is room to include the new chunk so add it to the window and
|
|
229
|
-
# continue
|
|
230
|
-
window_text = self._merge_text(window_text, chunk.text)
|
|
231
|
-
window_text_length += lengths.text_len
|
|
232
|
-
window_items = window_items + chunk.meta.doc_items
|
|
233
|
-
window_end += 1
|
|
234
198
|
else:
|
|
235
|
-
|
|
236
|
-
|
|
199
|
+
chks = chunks[window_start : window_end + 1]
|
|
200
|
+
doc_items = [it for chk in chks for it in chk.meta.doc_items]
|
|
201
|
+
candidate = DocChunk(
|
|
202
|
+
text=self.delim.join([chk.text for chk in chks]),
|
|
203
|
+
meta=DocMeta(
|
|
204
|
+
doc_items=doc_items,
|
|
205
|
+
headings=current_headings_and_captions[0],
|
|
206
|
+
captions=current_headings_and_captions[1],
|
|
207
|
+
origin=chunk.meta.origin,
|
|
208
|
+
),
|
|
209
|
+
)
|
|
210
|
+
if (
|
|
211
|
+
headings_and_captions == current_headings_and_captions
|
|
212
|
+
and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens
|
|
213
|
+
):
|
|
214
|
+
# there is room to include the new chunk so add it to the window and
|
|
215
|
+
# continue
|
|
216
|
+
window_end += 1
|
|
217
|
+
new_chunk = candidate
|
|
218
|
+
else:
|
|
219
|
+
ready_to_append = True
|
|
237
220
|
if ready_to_append or window_end == num_chunks:
|
|
238
221
|
# no more room OR the start of new metadata. Either way, end the block
|
|
239
222
|
# and use the current window_end as the start of a new block
|
|
@@ -241,16 +224,6 @@ class HybridChunker(BaseChunker):
|
|
|
241
224
|
# just one chunk so use it as is
|
|
242
225
|
output_chunks.append(first_chunk_of_window)
|
|
243
226
|
else:
|
|
244
|
-
new_meta = DocMeta(
|
|
245
|
-
doc_items=window_items,
|
|
246
|
-
headings=current_headings_and_captions[0],
|
|
247
|
-
captions=current_headings_and_captions[1],
|
|
248
|
-
origin=chunk.meta.origin,
|
|
249
|
-
)
|
|
250
|
-
new_chunk = DocChunk(
|
|
251
|
-
text=window_text,
|
|
252
|
-
meta=new_meta,
|
|
253
|
-
)
|
|
254
227
|
output_chunks.append(new_chunk)
|
|
255
228
|
# no need to reset window_text, etc. because that will be reset in the
|
|
256
229
|
# next iteration in the if window_start == window_end block
|
|
@@ -258,7 +231,7 @@ class HybridChunker(BaseChunker):
|
|
|
258
231
|
|
|
259
232
|
return output_chunks
|
|
260
233
|
|
|
261
|
-
def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:
|
|
234
|
+
def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
|
|
262
235
|
r"""Chunk the provided document.
|
|
263
236
|
|
|
264
237
|
Args:
|
|
@@ -150,7 +150,7 @@ class BoundingBox(BaseModel):
|
|
|
150
150
|
|
|
151
151
|
"""
|
|
152
152
|
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
153
|
-
return self
|
|
153
|
+
return self.model_copy()
|
|
154
154
|
elif self.coord_origin == CoordOrigin.TOPLEFT:
|
|
155
155
|
return BoundingBox(
|
|
156
156
|
l=self.l,
|
|
@@ -167,7 +167,7 @@ class BoundingBox(BaseModel):
|
|
|
167
167
|
|
|
168
168
|
"""
|
|
169
169
|
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
170
|
-
return self
|
|
170
|
+
return self.model_copy()
|
|
171
171
|
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
172
172
|
return BoundingBox(
|
|
173
173
|
l=self.l,
|
|
@@ -36,7 +36,7 @@ from docling_core.search.package import VERSION_PATTERN
|
|
|
36
36
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
37
37
|
from docling_core.types.doc import BoundingBox, Size
|
|
38
38
|
from docling_core.types.doc.base import ImageRefMode
|
|
39
|
-
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
|
|
39
|
+
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
|
|
40
40
|
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
41
41
|
from docling_core.types.doc.utils import relative_path
|
|
42
42
|
|
|
@@ -597,7 +597,6 @@ class TextItem(DocItem):
|
|
|
597
597
|
DocItemLabel.CAPTION,
|
|
598
598
|
DocItemLabel.CHECKBOX_SELECTED,
|
|
599
599
|
DocItemLabel.CHECKBOX_UNSELECTED,
|
|
600
|
-
DocItemLabel.CODE,
|
|
601
600
|
DocItemLabel.FOOTNOTE,
|
|
602
601
|
DocItemLabel.FORMULA,
|
|
603
602
|
DocItemLabel.PAGE_FOOTER,
|
|
@@ -656,6 +655,15 @@ class TextItem(DocItem):
|
|
|
656
655
|
return body
|
|
657
656
|
|
|
658
657
|
|
|
658
|
+
class CodeItem(TextItem):
|
|
659
|
+
"""CodeItem."""
|
|
660
|
+
|
|
661
|
+
label: typing.Literal[DocItemLabel.CODE] = (
|
|
662
|
+
DocItemLabel.CODE # type: ignore[assignment]
|
|
663
|
+
)
|
|
664
|
+
code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
|
|
665
|
+
|
|
666
|
+
|
|
659
667
|
class SectionHeaderItem(TextItem):
|
|
660
668
|
"""SectionItem."""
|
|
661
669
|
|
|
@@ -1302,6 +1310,7 @@ ContentItem = Annotated[
|
|
|
1302
1310
|
TextItem,
|
|
1303
1311
|
SectionHeaderItem,
|
|
1304
1312
|
ListItem,
|
|
1313
|
+
CodeItem,
|
|
1305
1314
|
PictureItem,
|
|
1306
1315
|
TableItem,
|
|
1307
1316
|
KeyValueItem,
|
|
@@ -1397,7 +1406,7 @@ class DoclingDocument(BaseModel):
|
|
|
1397
1406
|
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
1398
1407
|
|
|
1399
1408
|
groups: List[GroupItem] = []
|
|
1400
|
-
texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
|
|
1409
|
+
texts: List[Union[SectionHeaderItem, ListItem, TextItem, CodeItem]] = []
|
|
1401
1410
|
pictures: List[PictureItem] = []
|
|
1402
1411
|
tables: List[TableItem] = []
|
|
1403
1412
|
key_value_items: List[KeyValueItem] = []
|
|
@@ -1643,6 +1652,46 @@ class DoclingDocument(BaseModel):
|
|
|
1643
1652
|
|
|
1644
1653
|
return text_item
|
|
1645
1654
|
|
|
1655
|
+
def add_code(
|
|
1656
|
+
self,
|
|
1657
|
+
text: str,
|
|
1658
|
+
code_language: Optional[CodeLanguageLabel] = None,
|
|
1659
|
+
orig: Optional[str] = None,
|
|
1660
|
+
prov: Optional[ProvenanceItem] = None,
|
|
1661
|
+
parent: Optional[NodeItem] = None,
|
|
1662
|
+
):
|
|
1663
|
+
"""add_code.
|
|
1664
|
+
|
|
1665
|
+
:param text: str:
|
|
1666
|
+
:param code_language: Optional[str]: (Default value = None)
|
|
1667
|
+
:param orig: Optional[str]: (Default value = None)
|
|
1668
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1669
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1670
|
+
"""
|
|
1671
|
+
if not parent:
|
|
1672
|
+
parent = self.body
|
|
1673
|
+
|
|
1674
|
+
if not orig:
|
|
1675
|
+
orig = text
|
|
1676
|
+
|
|
1677
|
+
text_index = len(self.texts)
|
|
1678
|
+
cref = f"#/texts/{text_index}"
|
|
1679
|
+
code_item = CodeItem(
|
|
1680
|
+
text=text,
|
|
1681
|
+
orig=orig,
|
|
1682
|
+
self_ref=cref,
|
|
1683
|
+
parent=parent.get_ref(),
|
|
1684
|
+
)
|
|
1685
|
+
if code_language:
|
|
1686
|
+
code_item.code_language = code_language
|
|
1687
|
+
if prov:
|
|
1688
|
+
code_item.prov.append(prov)
|
|
1689
|
+
|
|
1690
|
+
self.texts.append(code_item)
|
|
1691
|
+
parent.children.append(RefItem(cref=cref))
|
|
1692
|
+
|
|
1693
|
+
return code_item
|
|
1694
|
+
|
|
1646
1695
|
def add_heading(
|
|
1647
1696
|
self,
|
|
1648
1697
|
text: str,
|
|
@@ -2086,7 +2135,7 @@ class DoclingDocument(BaseModel):
|
|
|
2086
2135
|
text = f"{marker} {item.text}\n"
|
|
2087
2136
|
mdtexts.append(text.strip() + "\n")
|
|
2088
2137
|
|
|
2089
|
-
elif isinstance(item,
|
|
2138
|
+
elif isinstance(item, CodeItem) and item.label in labels:
|
|
2090
2139
|
in_list = False
|
|
2091
2140
|
text = f"```\n{item.text}\n```\n"
|
|
2092
2141
|
mdtexts.append(text)
|
|
@@ -2392,11 +2441,14 @@ class DoclingDocument(BaseModel):
|
|
|
2392
2441
|
text = f"<li>{item.text}</li>"
|
|
2393
2442
|
html_texts.append(text)
|
|
2394
2443
|
|
|
2444
|
+
elif isinstance(item, CodeItem) and item.label in labels:
|
|
2445
|
+
text = f"<pre><code>{item.text}</code></pre>"
|
|
2446
|
+
html_texts.append(text.strip())
|
|
2447
|
+
|
|
2395
2448
|
elif isinstance(item, TextItem) and item.label in labels:
|
|
2396
2449
|
|
|
2397
2450
|
text = f"<p>{item.text}</p>"
|
|
2398
2451
|
html_texts.append(text.strip())
|
|
2399
|
-
|
|
2400
2452
|
elif isinstance(item, TableItem):
|
|
2401
2453
|
|
|
2402
2454
|
text = item.export_to_html(doc=self, add_caption=True)
|
|
@@ -2594,6 +2646,17 @@ class DoclingDocument(BaseModel):
|
|
|
2594
2646
|
add_content=add_content,
|
|
2595
2647
|
add_page_index=add_page_index,
|
|
2596
2648
|
)
|
|
2649
|
+
elif isinstance(item, CodeItem) and (item.label in labels):
|
|
2650
|
+
|
|
2651
|
+
result += item.export_to_document_tokens(
|
|
2652
|
+
doc=self,
|
|
2653
|
+
new_line=delim,
|
|
2654
|
+
xsize=xsize,
|
|
2655
|
+
ysize=ysize,
|
|
2656
|
+
add_location=add_location,
|
|
2657
|
+
add_content=add_content,
|
|
2658
|
+
add_page_index=add_page_index,
|
|
2659
|
+
)
|
|
2597
2660
|
|
|
2598
2661
|
elif isinstance(item, TextItem) and (item.label in labels):
|
|
2599
2662
|
|
|
@@ -138,3 +138,69 @@ class TableCellLabel(str, Enum):
|
|
|
138
138
|
def __str__(self):
|
|
139
139
|
"""Get string value."""
|
|
140
140
|
return str(self.value)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class CodeLanguageLabel(str, Enum):
|
|
144
|
+
"""CodeLanguageLabel."""
|
|
145
|
+
|
|
146
|
+
ADA = "Ada"
|
|
147
|
+
AWK = "Awk"
|
|
148
|
+
BASH = "Bash"
|
|
149
|
+
BC = "bc"
|
|
150
|
+
C = "C"
|
|
151
|
+
C_SHARP = "C#"
|
|
152
|
+
C_PLUS_PLUS = "C++"
|
|
153
|
+
CMAKE = "CMake"
|
|
154
|
+
COBOL = "COBOL"
|
|
155
|
+
CSS = "CSS"
|
|
156
|
+
CEYLON = "Ceylon"
|
|
157
|
+
CLOJURE = "Clojure"
|
|
158
|
+
CRYSTAL = "Crystal"
|
|
159
|
+
CUDA = "Cuda"
|
|
160
|
+
CYTHON = "Cython"
|
|
161
|
+
D = "D"
|
|
162
|
+
DART = "Dart"
|
|
163
|
+
DC = "dc"
|
|
164
|
+
DOCKERFILE = "Dockerfile"
|
|
165
|
+
ELIXIR = "Elixir"
|
|
166
|
+
ERLANG = "Erlang"
|
|
167
|
+
FORTRAN = "FORTRAN"
|
|
168
|
+
FORTH = "Forth"
|
|
169
|
+
GO = "Go"
|
|
170
|
+
HTML = "HTML"
|
|
171
|
+
HASKELL = "Haskell"
|
|
172
|
+
HAXE = "Haxe"
|
|
173
|
+
JAVA = "Java"
|
|
174
|
+
JAVASCRIPT = "JavaScript"
|
|
175
|
+
JULIA = "Julia"
|
|
176
|
+
KOTLIN = "Kotlin"
|
|
177
|
+
LISP = "Lisp"
|
|
178
|
+
LUA = "Lua"
|
|
179
|
+
MATLAB = "Matlab"
|
|
180
|
+
MOONSCRIPT = "MoonScript"
|
|
181
|
+
NIM = "Nim"
|
|
182
|
+
OCAML = "OCaml"
|
|
183
|
+
OBJECTIVEC = "ObjectiveC"
|
|
184
|
+
OCTAVE = "Octave"
|
|
185
|
+
PHP = "PHP"
|
|
186
|
+
PASCAL = "Pascal"
|
|
187
|
+
PERL = "Perl"
|
|
188
|
+
PROLOG = "Prolog"
|
|
189
|
+
PYTHON = "Python"
|
|
190
|
+
RACKET = "Racket"
|
|
191
|
+
RUBY = "Ruby"
|
|
192
|
+
RUST = "Rust"
|
|
193
|
+
SML = "SML"
|
|
194
|
+
SQL = "SQL"
|
|
195
|
+
SCALA = "Scala"
|
|
196
|
+
SCHEME = "Scheme"
|
|
197
|
+
SWIFT = "Swift"
|
|
198
|
+
TYPESCRIPT = "TypeScript"
|
|
199
|
+
UNKNOWN = "unknown"
|
|
200
|
+
VISUALBASIC = "VisualBasic"
|
|
201
|
+
XML = "XML"
|
|
202
|
+
YAML = "YAML"
|
|
203
|
+
|
|
204
|
+
def __str__(self):
|
|
205
|
+
"""Get string value."""
|
|
206
|
+
return str(self.value)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.14.0"
|
|
3
|
+
version = "2.15.0"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -79,6 +79,15 @@ types-setuptools = "^70.3.0"
|
|
|
79
79
|
python-semantic-release = "^7.32.2"
|
|
80
80
|
pandas-stubs = "^2.1.4.231227"
|
|
81
81
|
|
|
82
|
+
[tool.poetry.group.constraints]
|
|
83
|
+
optional = true
|
|
84
|
+
|
|
85
|
+
[tool.poetry.group.constraints.dependencies]
|
|
86
|
+
numpy = [
|
|
87
|
+
{ version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
|
|
88
|
+
{ version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
|
|
89
|
+
]
|
|
90
|
+
|
|
82
91
|
[tool.setuptools.packages.find]
|
|
83
92
|
where = ["docling_core/resources/schemas"]
|
|
84
93
|
|
|
@@ -127,6 +136,7 @@ module = [
|
|
|
127
136
|
"jsonref.*",
|
|
128
137
|
"jsonschema.*",
|
|
129
138
|
"requests.*",
|
|
139
|
+
"semchunk.*",
|
|
130
140
|
"tabulate.*",
|
|
131
141
|
"transformers.*",
|
|
132
142
|
"yaml.*",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.14.0 → docling_core-2.15.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.14.0 → docling_core-2.15.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.14.0 → docling_core-2.15.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|