docling-core 2.14.0__tar.gz → 2.15.1__tar.gz

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.


Files changed (62)
  1. {docling_core-2.14.0 → docling_core-2.15.1}/PKG-INFO +1 -1
  2. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/transforms/chunker/base.py +1 -1
  3. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/transforms/chunker/hybrid_chunker.py +81 -108
  4. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/doc/__init__.py +1 -0
  5. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/doc/base.py +2 -2
  6. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/doc/document.py +71 -5
  7. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/doc/labels.py +66 -0
  8. {docling_core-2.14.0 → docling_core-2.15.1}/pyproject.toml +11 -1
  9. {docling_core-2.14.0 → docling_core-2.15.1}/LICENSE +0 -0
  10. {docling_core-2.14.0 → docling_core-2.15.1}/README.md +0 -0
  11. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/__init__.py +0 -0
  12. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/cli/__init__.py +0 -0
  13. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/cli/view.py +0 -0
  14. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/py.typed +0 -0
  15. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  16. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  17. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  18. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  19. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  20. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  21. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  22. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  23. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/search/__init__.py +0 -0
  24. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  25. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/search/mapping.py +0 -0
  26. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/search/meta.py +0 -0
  27. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/search/package.py +0 -0
  28. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/transforms/__init__.py +0 -0
  29. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/transforms/chunker/__init__.py +0 -0
  30. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  31. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/__init__.py +0 -0
  32. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/base.py +0 -0
  33. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/doc/tokens.py +0 -0
  34. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/doc/utils.py +0 -0
  35. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/legacy_doc/base.py +0 -0
  40. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  41. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  42. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  43. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/legacy_doc/document.py +0 -0
  44. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/legacy_doc/tokens.py +0 -0
  45. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/nlp/__init__.py +0 -0
  46. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/nlp/qa.py +0 -0
  47. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/nlp/qa_labels.py +0 -0
  48. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/rec/__init__.py +0 -0
  49. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/rec/attribute.py +0 -0
  50. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/rec/base.py +0 -0
  51. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/rec/predicate.py +0 -0
  52. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/rec/record.py +0 -0
  53. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/rec/statement.py +0 -0
  54. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/rec/subject.py +0 -0
  55. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/utils/__init__.py +0 -0
  56. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/utils/alias.py +0 -0
  57. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/utils/file.py +0 -0
  58. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/utils/generate_docs.py +0 -0
  59. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/utils/generate_jsonschema.py +0 -0
  60. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/utils/legacy.py +0 -0
  61. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/utils/validate.py +0 -0
  62. {docling_core-2.14.0 → docling_core-2.15.1}/docling_core/utils/validators.py +0 -0

{docling_core-2.14.0 → docling_core-2.15.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: docling-core
- Version: 2.14.0
+ Version: 2.15.1
  Summary: A python library to define and validate data types in Docling.
  Home-page: https://ds4sd.github.io/
  License: MIT

{docling_core-2.14.0 → docling_core-2.15.1}/docling_core/transforms/chunker/base.py
@@ -51,7 +51,7 @@ class BaseChunker(BaseModel, ABC):
  delim: str = DFLT_DELIM

  @abstractmethod
- def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
+ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
  """Chunk the provided document.

  Args:
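
The only functional change in base.py is typing the abstract method's `**kwargs` as `Any`, so the signature passes strict type checking. Below is a minimal, hypothetical sketch (not part of this release) of a `BaseChunker` subclass written against the updated signature; the import paths are assumed from the package layout listed above, and the wrapped `HybridChunker` field is purely illustrative.

```python
from typing import Any, Iterator

from docling_core.transforms.chunker.base import BaseChunk, BaseChunker
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc import DoclingDocument


class CountingChunker(BaseChunker):
    """Toy wrapper: delegates to an inner chunker and counts emitted chunks."""

    inner: HybridChunker  # hypothetical wrapped chunker; any BaseChunker would do

    def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
        count = 0
        # Forward any extra keyword arguments unchanged to the inner chunker.
        for chunk in self.inner.chunk(dl_doc=dl_doc, **kwargs):
            count += 1
            yield chunk
        print(f"emitted {count} chunks")
```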

{docling_core-2.14.0 → docling_core-2.15.1}/docling_core/transforms/chunker/hybrid_chunker.py
@@ -6,7 +6,7 @@
  """Hybrid chunker implementation leveraging both doc structure & token awareness."""

  import warnings
- from typing import Iterable, Iterator, Optional, Union
+ from typing import Any, Iterable, Iterator, Optional, Union

  from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
  from typing_extensions import Self
@@ -65,13 +65,13 @@ class HybridChunker(BaseChunker):
  )
  return self

- def _count_tokens(self, text: Optional[Union[str, list[str]]]):
+ def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
  if text is None:
  return 0
  elif isinstance(text, list):
  total = 0
  for t in text:
- total += self._count_tokens(t)
+ total += self._count_text_tokens(t)
  return total
  return len(self._tokenizer.tokenize(text, max_length=None))

@@ -80,11 +80,13 @@ class HybridChunker(BaseChunker):
  text_len: int
  other_len: int

+ def _count_chunk_tokens(self, doc_chunk: DocChunk):
+ ser_txt = self.serialize(chunk=doc_chunk)
+ return len(self._tokenizer.tokenize(text=ser_txt, max_length=None))
+
  def _doc_chunk_length(self, doc_chunk: DocChunk):
- text_length = self._count_tokens(doc_chunk.text)
- headings_length = self._count_tokens(doc_chunk.meta.headings)
- captions_length = self._count_tokens(doc_chunk.meta.captions)
- total = text_length + headings_length + captions_length
+ text_length = self._count_text_tokens(doc_chunk.text)
+ total = self._count_chunk_tokens(doc_chunk=doc_chunk)
  return self._ChunkLengthInfo(
  total_len=total,
  text_len=text_length,
@@ -92,90 +94,69 @@ class HybridChunker(BaseChunker):
  )

  def _make_chunk_from_doc_items(
- self, doc_chunk: DocChunk, window_text: str, window_start: int, window_end: int
+ self, doc_chunk: DocChunk, window_start: int, window_end: int
  ):
+ doc_items = doc_chunk.meta.doc_items[window_start : window_end + 1]
  meta = DocMeta(
- doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
+ doc_items=doc_items,
  headings=doc_chunk.meta.headings,
  captions=doc_chunk.meta.captions,
  origin=doc_chunk.meta.origin,
  )
+ window_text = (
+ doc_chunk.text
+ if len(doc_chunk.meta.doc_items) == 1
+ else self.delim.join(
+ [
+ doc_item.text
+ for doc_item in doc_items
+ if isinstance(doc_item, TextItem)
+ ]
+ )
+ )
  new_chunk = DocChunk(text=window_text, meta=meta)
  return new_chunk

- def _merge_text(self, t1, t2):
- if t1 == "":
- return t2
- elif t2 == "":
- return t1
- else:
- return f"{t1}{self.delim}{t2}"
-
  def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:
- if doc_chunk.meta.doc_items is None or len(doc_chunk.meta.doc_items) <= 1:
- return [doc_chunk]
- length = self._doc_chunk_length(doc_chunk)
- if length.total_len <= self.max_tokens:
- return [doc_chunk]
- else:
- chunks = []
- window_start = 0
- window_end = 0
- window_text = ""
- window_text_length = 0
- other_length = length.other_len
- num_items = len(doc_chunk.meta.doc_items)
- while window_end < num_items:
- doc_item = doc_chunk.meta.doc_items[window_end]
- if isinstance(doc_item, TextItem):
- text = doc_item.text
- else:
- raise RuntimeError("Non-TextItem split not implemented yet")
- text_length = self._count_tokens(text)
- if (
- text_length + window_text_length + other_length < self.max_tokens
- and window_end < num_items - 1
- ):
+ chunks = []
+ window_start = 0
+ window_end = 0 # an inclusive index
+ num_items = len(doc_chunk.meta.doc_items)
+ while window_end < num_items:
+ new_chunk = self._make_chunk_from_doc_items(
+ doc_chunk=doc_chunk,
+ window_start=window_start,
+ window_end=window_end,
+ )
+ if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens:
+ if window_end < num_items - 1:
+ window_end += 1
  # Still room left to add more to this chunk AND still at least one
  # item left
- window_end += 1
- window_text_length += text_length
- window_text = self._merge_text(window_text, text)
- elif text_length + window_text_length + other_length < self.max_tokens:
+ continue
+ else:
  # All the items in the window fit into the chunk and there are no
  # other items left
- window_text = self._merge_text(window_text, text)
- new_chunk = self._make_chunk_from_doc_items(
- doc_chunk, window_text, window_start, window_end
- )
- chunks.append(new_chunk)
- window_end = num_items
- elif window_start == window_end:
- # Only one item in the window and it doesn't fit into the chunk. So
- # we'll just make it a chunk for now and it will get split in the
- # plain text splitter.
- window_text = self._merge_text(window_text, text)
- new_chunk = self._make_chunk_from_doc_items(
- doc_chunk, window_text, window_start, window_end
- )
- chunks.append(new_chunk)
- window_start = window_end + 1
- window_end = window_start
- window_text = ""
- window_text_length = 0
- else:
- # Multiple items in the window but they don't fit into the chunk.
- # However, the existing items must have fit or we wouldn't have
- # gotten here. So we put everything but the last item into the chunk
- # and then start a new window INCLUDING the current window end.
- new_chunk = self._make_chunk_from_doc_items(
- doc_chunk, window_text, window_start, window_end - 1
- )
- chunks.append(new_chunk)
- window_start = window_end
- window_text = ""
- window_text_length = 0
- return chunks
+ window_end = num_items # signalizing the last loop
+ elif window_start == window_end:
+ # Only one item in the window and it doesn't fit into the chunk. So
+ # we'll just make it a chunk for now and it will get split in the
+ # plain text splitter.
+ window_end += 1
+ window_start = window_end
+ else:
+ # Multiple items in the window but they don't fit into the chunk.
+ # However, the existing items must have fit or we wouldn't have
+ # gotten here. So we put everything but the last item into the chunk
+ # and then start a new window INCLUDING the current window end.
+ new_chunk = self._make_chunk_from_doc_items(
+ doc_chunk=doc_chunk,
+ window_start=window_start,
+ window_end=window_end - 1,
+ )
+ window_start = window_end
+ chunks.append(new_chunk)
+ return chunks

  def _split_using_plain_text(
  self,
@@ -204,36 +185,38 @@
  def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
  output_chunks = []
  window_start = 0
- window_end = 0
+ window_end = 0 # an inclusive index
  num_chunks = len(chunks)
  while window_end < num_chunks:
  chunk = chunks[window_end]
- lengths = self._doc_chunk_length(chunk)
  headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
  ready_to_append = False
  if window_start == window_end:
- # starting a new block of chunks to potentially merge
  current_headings_and_captions = headings_and_captions
- window_text = chunk.text
- window_other_length = lengths.other_len
- window_text_length = lengths.text_len
- window_items = chunk.meta.doc_items
  window_end += 1
  first_chunk_of_window = chunk
- elif (
- headings_and_captions == current_headings_and_captions
- and window_text_length + window_other_length + lengths.text_len
- <= self.max_tokens
- ):
- # there is room to include the new chunk so add it to the window and
- # continue
- window_text = self._merge_text(window_text, chunk.text)
- window_text_length += lengths.text_len
- window_items = window_items + chunk.meta.doc_items
- window_end += 1
  else:
- ready_to_append = True
-
+ chks = chunks[window_start : window_end + 1]
+ doc_items = [it for chk in chks for it in chk.meta.doc_items]
+ candidate = DocChunk(
+ text=self.delim.join([chk.text for chk in chks]),
+ meta=DocMeta(
+ doc_items=doc_items,
+ headings=current_headings_and_captions[0],
+ captions=current_headings_and_captions[1],
+ origin=chunk.meta.origin,
+ ),
+ )
+ if (
+ headings_and_captions == current_headings_and_captions
+ and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens
+ ):
+ # there is room to include the new chunk so add it to the window and
+ # continue
+ window_end += 1
+ new_chunk = candidate
+ else:
+ ready_to_append = True
  if ready_to_append or window_end == num_chunks:
  # no more room OR the start of new metadata. Either way, end the block
  # and use the current window_end as the start of a new block
@@ -241,16 +224,6 @@
  # just one chunk so use it as is
  output_chunks.append(first_chunk_of_window)
  else:
- new_meta = DocMeta(
- doc_items=window_items,
- headings=current_headings_and_captions[0],
- captions=current_headings_and_captions[1],
- origin=chunk.meta.origin,
- )
- new_chunk = DocChunk(
- text=window_text,
- meta=new_meta,
- )
  output_chunks.append(new_chunk)
  # no need to reset window_text, etc. because that will be reset in the
  # next iteration in the if window_start == window_end block
@@ -258,7 +231,7 @@

  return output_chunks

- def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:
+ def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
  r"""Chunk the provided document.

  Args:
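
Net effect of the hybrid_chunker.py refactor: instead of summing separate text, heading and caption token counts, the chunker now measures the fully serialized chunk via `_count_chunk_tokens` and compares that against `max_tokens`, so the doc-item windowing and the metadata-merging pass both use the same budget that `serialize()` produces. A hedged usage sketch follows (not taken from the package docs); the tokenizer id, file name, and `max_tokens` value are illustrative assumptions.

```python
from pathlib import Path

from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc import DoclingDocument

# Load a previously exported DoclingDocument (file name is hypothetical).
doc = DoclingDocument.model_validate_json(Path("sample.docling.json").read_text())

chunker = HybridChunker(
    tokenizer="sentence-transformers/all-MiniLM-L6-v2",  # assumed HF tokenizer id
    max_tokens=512,
)
for chunk in chunker.chunk(dl_doc=doc):
    # serialize() yields the contextualized text (headings/captions + body);
    # this is the string whose token count is kept under max_tokens above.
    contextualized = chunker.serialize(chunk=chunk)
    print(len(contextualized), "chars |", chunk.text[:60].replace("\n", " "))
```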

{docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/doc/__init__.py
@@ -7,6 +7,7 @@

  from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
  from .document import (
+ CodeItem,
  DocItem,
  DoclingDocument,
  DocumentOrigin,

{docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/doc/base.py
@@ -150,7 +150,7 @@ class BoundingBox(BaseModel):

  """
  if self.coord_origin == CoordOrigin.BOTTOMLEFT:
- return self
+ return self.model_copy()
  elif self.coord_origin == CoordOrigin.TOPLEFT:
  return BoundingBox(
  l=self.l,
@@ -167,7 +167,7 @@

  """
  if self.coord_origin == CoordOrigin.TOPLEFT:
- return self
+ return self.model_copy()
  elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
  return BoundingBox(
  l=self.l,
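
The base.py change makes the origin-conversion helpers return `self.model_copy()` instead of `self` when the box is already in the requested coordinate origin, so callers never receive an alias of the original box. A small sketch of the aliasing problem this avoids, using only the field names visible in the diff above:

```python
from docling_core.types.doc import BoundingBox, CoordOrigin

box = BoundingBox(l=10.0, t=20.0, r=110.0, b=40.0, coord_origin=CoordOrigin.TOPLEFT)

# With `return self`, a "converted" box already in the target origin would
# alias `box`, and edits to it would silently modify the original.
converted = box.model_copy()  # what the methods now return in the no-op case
converted.l = 0.0
assert box.l == 10.0  # the original bounding box is left untouched
```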

{docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/doc/document.py
@@ -36,7 +36,7 @@ from docling_core.search.package import VERSION_PATTERN
  from docling_core.types.base import _JSON_POINTER_REGEX
  from docling_core.types.doc import BoundingBox, Size
  from docling_core.types.doc.base import ImageRefMode
- from docling_core.types.doc.labels import DocItemLabel, GroupLabel
+ from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
  from docling_core.types.doc.tokens import DocumentToken, TableToken
  from docling_core.types.doc.utils import relative_path

@@ -597,7 +597,6 @@ class TextItem(DocItem):
  DocItemLabel.CAPTION,
  DocItemLabel.CHECKBOX_SELECTED,
  DocItemLabel.CHECKBOX_UNSELECTED,
- DocItemLabel.CODE,
  DocItemLabel.FOOTNOTE,
  DocItemLabel.FORMULA,
  DocItemLabel.PAGE_FOOTER,
@@ -656,6 +655,15 @@ class TextItem(DocItem):
  return body


+ class CodeItem(TextItem):
+ """CodeItem."""
+
+ label: typing.Literal[DocItemLabel.CODE] = (
+ DocItemLabel.CODE # type: ignore[assignment]
+ )
+ code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
+
+
  class SectionHeaderItem(TextItem):
  """SectionItem."""

@@ -1302,6 +1310,7 @@ ContentItem = Annotated[
  TextItem,
  SectionHeaderItem,
  ListItem,
+ CodeItem,
  PictureItem,
  TableItem,
  KeyValueItem,
@@ -1397,7 +1406,7 @@ class DoclingDocument(BaseModel):
  body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []

  groups: List[GroupItem] = []
- texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
+ texts: List[Union[SectionHeaderItem, ListItem, TextItem, CodeItem]] = []
  pictures: List[PictureItem] = []
  tables: List[TableItem] = []
  key_value_items: List[KeyValueItem] = []
@@ -1506,6 +1515,9 @@
  elif label in [DocItemLabel.SECTION_HEADER]:
  return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)

+ elif label in [DocItemLabel.CODE]:
+ return self.add_code(text=text, orig=orig, prov=prov, parent=parent)
+
  else:

  if not parent:
@@ -1643,6 +1655,46 @@

  return text_item

+ def add_code(
+ self,
+ text: str,
+ code_language: Optional[CodeLanguageLabel] = None,
+ orig: Optional[str] = None,
+ prov: Optional[ProvenanceItem] = None,
+ parent: Optional[NodeItem] = None,
+ ):
+ """add_code.
+
+ :param text: str:
+ :param code_language: Optional[str]: (Default value = None)
+ :param orig: Optional[str]: (Default value = None)
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
+ :param parent: Optional[NodeItem]: (Default value = None)
+ """
+ if not parent:
+ parent = self.body
+
+ if not orig:
+ orig = text
+
+ text_index = len(self.texts)
+ cref = f"#/texts/{text_index}"
+ code_item = CodeItem(
+ text=text,
+ orig=orig,
+ self_ref=cref,
+ parent=parent.get_ref(),
+ )
+ if code_language:
+ code_item.code_language = code_language
+ if prov:
+ code_item.prov.append(prov)
+
+ self.texts.append(code_item)
+ parent.children.append(RefItem(cref=cref))
+
+ return code_item
+
  def add_heading(
  self,
  text: str,
@@ -2086,7 +2138,7 @@
  text = f"{marker} {item.text}\n"
  mdtexts.append(text.strip() + "\n")

- elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
+ elif isinstance(item, CodeItem) and item.label in labels:
  in_list = False
  text = f"```\n{item.text}\n```\n"
  mdtexts.append(text)
@@ -2392,11 +2444,14 @@
  text = f"<li>{item.text}</li>"
  html_texts.append(text)

+ elif isinstance(item, CodeItem) and item.label in labels:
+ text = f"<pre><code>{item.text}</code></pre>"
+ html_texts.append(text.strip())
+
  elif isinstance(item, TextItem) and item.label in labels:

  text = f"<p>{item.text}</p>"
  html_texts.append(text.strip())
-
  elif isinstance(item, TableItem):

  text = item.export_to_html(doc=self, add_caption=True)
@@ -2594,6 +2649,17 @@
  add_content=add_content,
  add_page_index=add_page_index,
  )
+ elif isinstance(item, CodeItem) and (item.label in labels):
+
+ result += item.export_to_document_tokens(
+ doc=self,
+ new_line=delim,
+ xsize=xsize,
+ ysize=ysize,
+ add_location=add_location,
+ add_content=add_content,
+ add_page_index=add_page_index,
+ )

  elif isinstance(item, TextItem) and (item.label in labels):


{docling_core-2.14.0 → docling_core-2.15.1}/docling_core/types/doc/labels.py
@@ -138,3 +138,69 @@ class TableCellLabel(str, Enum):
  def __str__(self):
  """Get string value."""
  return str(self.value)
+
+
+ class CodeLanguageLabel(str, Enum):
+ """CodeLanguageLabel."""
+
+ ADA = "Ada"
+ AWK = "Awk"
+ BASH = "Bash"
+ BC = "bc"
+ C = "C"
+ C_SHARP = "C#"
+ C_PLUS_PLUS = "C++"
+ CMAKE = "CMake"
+ COBOL = "COBOL"
+ CSS = "CSS"
+ CEYLON = "Ceylon"
+ CLOJURE = "Clojure"
+ CRYSTAL = "Crystal"
+ CUDA = "Cuda"
+ CYTHON = "Cython"
+ D = "D"
+ DART = "Dart"
+ DC = "dc"
+ DOCKERFILE = "Dockerfile"
+ ELIXIR = "Elixir"
+ ERLANG = "Erlang"
+ FORTRAN = "FORTRAN"
+ FORTH = "Forth"
+ GO = "Go"
+ HTML = "HTML"
+ HASKELL = "Haskell"
+ HAXE = "Haxe"
+ JAVA = "Java"
+ JAVASCRIPT = "JavaScript"
+ JULIA = "Julia"
+ KOTLIN = "Kotlin"
+ LISP = "Lisp"
+ LUA = "Lua"
+ MATLAB = "Matlab"
+ MOONSCRIPT = "MoonScript"
+ NIM = "Nim"
+ OCAML = "OCaml"
+ OBJECTIVEC = "ObjectiveC"
+ OCTAVE = "Octave"
+ PHP = "PHP"
+ PASCAL = "Pascal"
+ PERL = "Perl"
+ PROLOG = "Prolog"
+ PYTHON = "Python"
+ RACKET = "Racket"
+ RUBY = "Ruby"
+ RUST = "Rust"
+ SML = "SML"
+ SQL = "SQL"
+ SCALA = "Scala"
+ SCHEME = "Scheme"
+ SWIFT = "Swift"
+ TYPESCRIPT = "TypeScript"
+ UNKNOWN = "unknown"
+ VISUALBASIC = "VisualBasic"
+ XML = "XML"
+ YAML = "YAML"
+
+ def __str__(self):
+ """Get string value."""
+ return str(self.value)
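
Taken together, the document.py and labels.py changes introduce a dedicated `CodeItem` (carrying a `code_language` that defaults to `CodeLanguageLabel.UNKNOWN`), a `DoclingDocument.add_code(...)` builder, and code-aware branches in the Markdown, HTML and document-token exporters. A hedged end-to-end sketch follows (not from the release notes); the document name is arbitrary and the final call assumes the existing `export_to_markdown()` API.

```python
from docling_core.types.doc import CodeItem, DoclingDocument
from docling_core.types.doc.labels import CodeLanguageLabel

doc = DoclingDocument(name="code-demo")
item = doc.add_code(
    text='print("hello docling")',
    code_language=CodeLanguageLabel.PYTHON,
)

assert isinstance(item, CodeItem)
assert item.code_language == CodeLanguageLabel.PYTHON
assert str(CodeLanguageLabel.PYTHON) == "Python"  # __str__ returns the raw value

# When CODE is among the exported labels, the Markdown branch above wraps the
# item text in a fenced block and the HTML branch emits <pre><code>...</code></pre>.
print(doc.export_to_markdown())
```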

{docling_core-2.14.0 → docling_core-2.15.1}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "docling-core"
- version = "2.14.0"
+ version = "2.15.1"
  description = "A python library to define and validate data types in Docling."
  license = "MIT"
  authors = [
@@ -79,6 +79,15 @@ types-setuptools = "^70.3.0"
  python-semantic-release = "^7.32.2"
  pandas-stubs = "^2.1.4.231227"

+ [tool.poetry.group.constraints]
+ optional = true
+
+ [tool.poetry.group.constraints.dependencies]
+ numpy = [
+ { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
+ { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
+ ]
+
  [tool.setuptools.packages.find]
  where = ["docling_core/resources/schemas"]

@@ -127,6 +136,7 @@ module = [
  "jsonref.*",
  "jsonschema.*",
  "requests.*",
+ "semchunk.*",
  "tabulate.*",
  "transformers.*",
  "yaml.*",