docling-core 1.1.4__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic; consult the registry's release details page for more information.

@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Data transformations package."""
@@ -0,0 +1,15 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the chunker types."""
7
+
8
+ from docling_core.transforms.chunker.base import ( # noqa
9
+ BaseChunker,
10
+ Chunk,
11
+ ChunkWithMetadata,
12
+ )
13
+ from docling_core.transforms.chunker.hierarchical_chunker import ( # noqa
14
+ HierarchicalChunker,
15
+ )
@@ -0,0 +1,45 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define base classes for chunking."""
7
+ from abc import ABC, abstractmethod
8
+ from typing import Iterator, Optional
9
+
10
+ from pydantic import BaseModel
11
+
12
+ from docling_core.types import BoundingBox, Document
13
+
14
+
15
class Chunk(BaseModel):
    """Data model for Chunk."""

    # JSON-path of the originating item within the document,
    # e.g. "$.main-text[3]" (see HierarchicalChunker._create_path)
    path: str
    # the chunk's text payload
    text: str
20
+
21
+
22
class ChunkWithMetadata(Chunk):
    """Data model for Chunk including metadata.

    `page` and `bbox` carry provenance information when the source item
    has any, and are None otherwise. The defaults are stated explicitly
    because pydantic v2 treats an `Optional[...]` annotation *without* a
    default as a required field — without `= None` these "optional"
    metadata fields would have to be passed on every construction.
    """

    # 1-based page number of the source item, if provenance is available
    page: Optional[int] = None
    # bounding box of the source item, if provenance is available
    bbox: Optional[BoundingBox] = None
27
+
28
+
29
class BaseChunker(BaseModel, ABC):
    """Base class for Chunker.

    Concrete chunkers (e.g. HierarchicalChunker) subclass this and
    implement `chunk`. Being a pydantic model, configuration options are
    declared as model fields on the subclass.
    """

    @abstractmethod
    def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
        """Chunk the provided document.

        Args:
            dl_doc (Document): document to chunk

        Raises:
            NotImplementedError: in this abstract implementation

        Yields:
            Iterator[Chunk]: iterator over extracted chunks
        """
        raise NotImplementedError()
@@ -0,0 +1,337 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Chunker implementation leveraging the document structure."""
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from enum import Enum
12
+ from typing import Any, Iterator, Optional, Union
13
+
14
+ import pandas as pd
15
+ from pydantic import BaseModel, PositiveInt
16
+
17
+ from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
18
+ from docling_core.types import BaseText
19
+ from docling_core.types import Document as DLDocument
20
+ from docling_core.types import Ref, Table
21
+
22
+ _logger = logging.getLogger(__name__)
23
+
24
+
25
class HierarchicalChunker(BaseChunker):
    """Chunker implementation leveraging the document layout."""

    # whether to emit ChunkWithMetadata (page/bbox) instead of plain Chunk
    include_metadata: bool = True
    # chunks whose concatenated text is shorter than this are dropped
    min_chunk_len: PositiveInt = 64

    class _NodeType(str, Enum):
        # normalized values of main-text item `obj_type`
        PARAGRAPH = "paragraph"
        SUBTITLE_LEVEL_1 = "subtitle-level-1"
        TABLE = "table"
        CAPTION = "caption"

    class _NodeName(str, Enum):
        # normalized values of main-text item `name`
        TITLE = "title"
        REFERENCE = "reference"
        LIST_ITEM = "list-item"
        SUBTITLE_LEVEL_1 = "subtitle-level-1"

    # item types eligible for chunking
    _allowed_types: list[str] = [
        _NodeType.PARAGRAPH,
        _NodeType.SUBTITLE_LEVEL_1,
        _NodeType.TABLE,
        _NodeType.CAPTION,
    ]
    # per-type item names excluded from chunking (e.g. bibliography entries)
    _disallowed_names_by_type: dict[str, list[str]] = {
        _NodeType.PARAGRAPH: [
            _NodeName.REFERENCE,
        ],
    }

    @classmethod
    def _norm(cls, text: Optional[str]) -> Optional[str]:
        """Lowercase *text*, passing None through unchanged."""
        return text.lower() if text is not None else None

    @classmethod
    def _convert_table_to_dataframe(cls, table: Table) -> Optional[pd.DataFrame]:
        """Convert a Table's cell texts to a DataFrame, or None if it has no data."""
        if table.data:
            table_content = [[cell.text for cell in row] for row in table.data]
            return pd.DataFrame(table_content)
        else:
            return None

    @classmethod
    def _triplet_serialize(cls, table) -> Optional[str]:
        """Serialize *table* as "row, col = value" triplets joined by ". ".

        Row 0 is taken as the column headers and column 0 as the row
        headers; tables smaller than 2x2 (or without data) yield None.
        """
        output_text: Optional[str] = None
        table_df = cls._convert_table_to_dataframe(table)
        if table_df is not None and table_df.shape[0] > 1 and table_df.shape[1] > 1:
            rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
            cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
            nrows = table_df.shape[0]
            ncols = table_df.shape[1]
            # one triplet per body cell (headers excluded)
            texts = [
                f"{rows[i]}, {cols[j]} = {table_df.iloc[i, j].strip()}"
                for i in range(1, nrows)
                for j in range(1, ncols)
            ]
            output_text = ". ".join(texts)

        return output_text

    @classmethod
    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
        """Build the JSON-path string for position *pos* under *path_prefix*."""
        return f"$.{path_prefix}[{pos}]"

    class _MainTextItemNode(BaseModel):
        # parent/children are indexes into doc.main_text
        parent: Optional[int] = None
        children: list[int] = []

    class _TitleInfo(BaseModel):
        # title text and the JSON-path where it was found
        text: str
        path_in_doc: str

    class _GlobalContext(BaseModel):
        title: Optional[_HC._TitleInfo] = None

    class _DocContext(BaseModel):
        dmap: dict[int, _HC._MainTextItemNode]  # main text element context
        glob: _HC._GlobalContext  # global context

        @classmethod
        def from_doc(cls, doc: DLDocument) -> _HC._DocContext:
            """Build the parent/child map and global context for *doc*.

            Section headers (subtitle-level-1) become parents of the items
            following them; an item immediately followed by list-items
            becomes the parent of that run of list-items. The document
            title is taken from the description if present, otherwise from
            the first title-named (or first header) item.
            """
            dmap: dict[int, _HC._MainTextItemNode] = {}
            glob: _HC._GlobalContext = _HC._GlobalContext()
            if doc.description.title:
                glob.title = _HC._TitleInfo(
                    text=doc.description.title,
                    path_in_doc="description.title",
                )

            parent = None
            if doc.main_text:
                idx = 0
                while idx < len(doc.main_text):
                    item = doc.main_text[idx]
                    # first item named "title" wins, if no title known yet
                    if (
                        not glob.title
                        and isinstance(item, BaseText)
                        and _HC._norm(item.name) == _HC._NodeName.TITLE
                    ):
                        glob.title = _HC._TitleInfo(
                            text=item.text,
                            path_in_doc=_HC._create_path(idx),
                        )

                    # start of a subtitle-level-1 parent
                    if (
                        isinstance(item, BaseText)
                        and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
                    ):
                        dmap[idx] = _HC._MainTextItemNode(parent=None)
                        parent = idx
                        # fall back to the first header as document title
                        if not glob.title:
                            glob.title = _HC._TitleInfo(
                                text=item.text,
                                path_in_doc=_HC._create_path(idx),
                            )

                    # start of a list parent
                    elif (
                        isinstance(item, BaseText)
                        and _HC._norm(item.name) != _HC._NodeName.LIST_ITEM
                        and idx + 1 < len(doc.main_text)
                        and _HC._norm(doc.main_text[idx + 1].name)
                        == _HC._NodeName.LIST_ITEM
                    ):
                        if parent is not None:
                            dmap[parent].children.append(idx)
                        dmap[idx] = _HC._MainTextItemNode(parent=parent)

                        # have all children register locally
                        li = idx + 1
                        while (
                            li < len(doc.main_text)
                            and _HC._norm(doc.main_text[li].name)
                            == _HC._NodeName.LIST_ITEM
                        ):
                            dmap[idx].children.append(li)
                            dmap[li] = _HC._MainTextItemNode(parent=idx)
                            li += 1
                        # resume scanning after the consumed list-items
                        idx = li
                        continue

                    # normal case
                    else:
                        if parent is not None:
                            dmap[parent].children.append(idx)
                        dmap[idx] = _HC._MainTextItemNode(parent=parent)

                    idx += 1
            else:
                # no main_text: nothing to map
                pass
            return cls(
                dmap=dmap,
                glob=glob,
            )

    class _TextEntry(BaseModel):
        # one text fragment plus its JSON-path of origin
        text: str
        path: str

    def _build_chunk_impl(
        self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
    ) -> list[_TextEntry]:
        """Collect the text entries making up the chunk rooted at *idx*.

        Ancestor texts are prepended via recursion (rec=True marks the
        recursive calls); list-item children are appended to their parent.
        Returns [] for items that must not form a chunk of their own.
        """
        if doc.main_text:
            item = doc.main_text[idx]
            item_type = _HC._norm(item.obj_type)
            item_name = _HC._norm(item.name)
            if (
                item_type not in self._allowed_types
                or item_name in self._disallowed_names_by_type.get(item_type, [])
            ):
                return []

            c2p = doc_map.dmap

            text_entries: list[_HC._TextEntry] = []
            if (
                isinstance(item, Ref)
                and item_type == _HC._NodeType.TABLE
                and doc.tables
            ):
                # resolve table reference
                ref_nr = int(item.ref.split("/")[2])  # e.g. '#/tables/0'
                table = doc.tables[ref_nr]
                ser_out = _HC._triplet_serialize(table)
                if table.data:
                    text_entries = (
                        [
                            self._TextEntry(
                                text=ser_out,
                                path=self._create_path(idx),
                            )
                        ]
                        if ser_out
                        else []
                    )
                else:
                    return []
            elif isinstance(item, BaseText):
                text_entries = [
                    self._TextEntry(
                        text=item.text,
                        path=self._create_path(idx),
                    )
                ]

            # squash in any children of type list-item
            if not rec:
                if (
                    c2p[idx].children
                    and _HC._norm(doc.main_text[c2p[idx].children[0]].name)
                    == _HC._NodeName.LIST_ITEM
                ):
                    text_entries = text_entries + [
                        self._TextEntry(
                            text=doc.main_text[c].text,  # type: ignore[union-attr]
                            path=self._create_path(c),
                        )
                        for c in c2p[idx].children
                        if isinstance(doc.main_text[c], BaseText)
                        and _HC._norm(doc.main_text[c].name) == _HC._NodeName.LIST_ITEM
                    ]
                # list-items and headers are emitted via their parent, not alone
                elif item_name in [
                    _HC._NodeName.LIST_ITEM,
                    _HC._NodeName.SUBTITLE_LEVEL_1,
                ]:
                    return []

            if (parent := c2p[idx].parent) is not None:
                # prepend with ancestors
                return (
                    self._build_chunk_impl(
                        doc=doc, doc_map=doc_map, idx=parent, rec=True
                    )
                    + text_entries
                )
            else:
                # if root, augment with title (if available and different)
                return text_entries
        else:
            return []

    def _build_chunk(
        self,
        doc: DLDocument,
        doc_map: _DocContext,
        idx: int,
        delim: str,
        rec: bool = False,
    ) -> Optional[Chunk]:
        """Assemble the chunk for main-text index *idx*, or None.

        Returns None when the concatenated text is shorter than
        min_chunk_len or the item is a non-table reference.
        """
        texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
        concat = delim.join([t.text for t in texts if t.text])
        assert doc.main_text is not None
        if len(concat) >= self.min_chunk_len:
            orig_item = doc.main_text[idx]
            item: Union[BaseText, Table]
            if isinstance(orig_item, Ref):
                if _HC._norm(orig_item.obj_type) == _HC._NodeType.TABLE and doc.tables:
                    pos = int(orig_item.ref.split("/")[2])
                    item = doc.tables[pos]
                    # path points at the resolved table, not the reference
                    path = self._create_path(pos, path_prefix="tables")
                else:  # currently disregarding non-table references
                    return None
            else:
                item = orig_item
                path = self._create_path(idx)

            if self.include_metadata:
                return ChunkWithMetadata(
                    text=concat,
                    path=path,
                    # provenance of the first prov entry, if any
                    page=item.prov[0].page if item.prov else None,
                    bbox=item.prov[0].bbox if item.prov else None,
                )
            else:
                return Chunk(
                    text=concat,
                    path=path,
                )
        else:
            return None

    def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk]:
        r"""Chunk the provided document.

        Args:
            dl_doc (DLDocument): document to chunk
            delim (str, optional): delimiter to use when concatenating sub-items.
                Defaults to "\n".

        Yields:
            Iterator[Chunk]: iterator over extracted chunks
        """
        if dl_doc.main_text:
            # extract doc structure incl. metadata for
            # each item (e.g. parent, children)
            doc_ctx = self._DocContext.from_doc(doc=dl_doc)
            _logger.debug(f"{doc_ctx.model_dump()=}")

            for i, item in enumerate(dl_doc.main_text):
                # only text items and table references can seed a chunk
                if (
                    isinstance(item, BaseText)
                    or _HC._norm(item.obj_type) == _HC._NodeType.TABLE
                ):
                    chunk = self._build_chunk(
                        doc=dl_doc, doc_map=doc_ctx, idx=i, delim=delim
                    )
                    if chunk:
                        _logger.info(f"{i=}, {chunk=}")
                        yield chunk


# short alias used inside the class body; nested classes cannot reference
# the (not yet bound) enclosing class name directly in annotations
_HC = HierarchicalChunker
@@ -131,6 +131,7 @@ class GlmTableCell(TableCell):
131
131
  class BaseCell(AliasModel):
132
132
  """Base cell."""
133
133
 
134
+ # FIXME: we need to check why we have bounding_box (this should be in prov)
134
135
  bounding_box: Optional[BoundingBoxContainer] = Field(
135
136
  default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
136
137
  )
@@ -152,6 +153,11 @@ class Table(BaseCell):
152
153
  model: Optional[str] = None
153
154
 
154
155
 
156
# FIXME: let's add some figure specific data-types later
class Figure(BaseCell):
    """Figure.

    Currently a plain BaseCell; exists as a distinct type so figures can
    be told apart from other cells (e.g. in MinimalDocument.figures).
    """
159
+
160
+
155
161
  class BaseText(AliasModel):
156
162
  """Base model for text objects."""
157
163
 
@@ -6,7 +6,8 @@
6
6
  """Models for the Docling Document data type."""
7
7
 
8
8
  from datetime import datetime
9
- from typing import Generic, Optional, Union
9
+ from enum import Enum
10
+ from typing import Generic, Optional, Tuple, Union
10
11
 
11
12
  from pydantic import (
12
13
  AnyHttpUrl,
@@ -35,6 +36,7 @@ from docling_core.types.doc.base import (
35
36
  BaseCell,
36
37
  BaseText,
37
38
  BitmapObject,
39
+ Figure,
38
40
  PageDimensions,
39
41
  PageReference,
40
42
  Ref,
@@ -275,7 +277,7 @@ class MinimalDocument(
275
277
  main_text: Optional[list[Union[Ref, BaseText]]] = Field(
276
278
  default=None, alias="main-text"
277
279
  )
278
- figures: Optional[list[BaseCell]] = None
280
+ figures: Optional[list[Figure]] = None
279
281
  tables: Optional[list[Table]] = None
280
282
 
281
283
 
@@ -345,6 +347,107 @@ class CCSDocument(
345
347
  return data
346
348
 
347
349
 
350
+ class DocumentToken(Enum):
351
+ """Class to represent an LLM friendly representation of a Document."""
352
+
353
+ BEG_DOCUMENT = "<document>"
354
+ END_DOCUMENT = "</document>"
355
+
356
+ BEG_TITLE = "<title>"
357
+ END_TITLE = "</title>"
358
+
359
+ BEG_ABSTRACT = "<abstract>"
360
+ END_ABSTRACT = "</abstract>"
361
+
362
+ BEG_DOI = "<doi>"
363
+ END_DOI = "</doi>"
364
+ BEG_DATE = "<date>"
365
+ END_DATE = "</date>"
366
+
367
+ BEG_AUTHORS = "<authors>"
368
+ END_AUTHORS = "</authors>"
369
+ BEG_AUTHOR = "<author>"
370
+ END_AUTHOR = "</author>"
371
+
372
+ BEG_AFFILIATIONS = "<affiliations>"
373
+ END_AFFILIATIONS = "</affiliations>"
374
+ BEG_AFFILIATION = "<affiliation>"
375
+ END_AFFILIATION = "</affiliation>"
376
+
377
+ BEG_HEADER = "<section-header>"
378
+ END_HEADER = "</section-header>"
379
+ BEG_TEXT = "<text>"
380
+ END_TEXT = "</text>"
381
+ BEG_PARAGRAPH = "<paragraph>"
382
+ END_PARAGRAPH = "</paragraph>"
383
+ BEG_TABLE = "<table>"
384
+ END_TABLE = "</table>"
385
+ BEG_FIGURE = "<figure>"
386
+ END_FIGURE = "</figure>"
387
+ BEG_CAPTION = "<caption>"
388
+ END_CAPTION = "</caption>"
389
+ BEG_EQUATION = "<equation>"
390
+ END_EQUATION = "</equation>"
391
+ BEG_LIST = "<list>"
392
+ END_LIST = "</list>"
393
+ BEG_LISTITEM = "<list-item>"
394
+ END_LISTITEM = "</list-item>"
395
+
396
+ BEG_LOCATION = "<location>"
397
+ END_LOCATION = "</location>"
398
+ BEG_GROUP = "<group>"
399
+ END_GROUP = "</group>"
400
+
401
+ @classmethod
402
+ def get_special_tokens(
403
+ cls,
404
+ max_rows: int = 100,
405
+ max_cols: int = 100,
406
+ max_pages: int = 1000,
407
+ page_dimension: Tuple[int, int] = (100, 100),
408
+ ):
409
+ """Function to get all special document tokens."""
410
+ special_tokens = [token.value for token in cls]
411
+
412
+ # Adding dynamically generated row and col tokens
413
+ for i in range(0, max_rows):
414
+ special_tokens += [f"<row_{i}>", f"</row_{i}>"]
415
+
416
+ for i in range(0, max_cols):
417
+ special_tokens += [f"<col_{i}>", f"</col_{i}>"]
418
+
419
+ for i in range(6):
420
+ special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
421
+
422
+ # Adding dynamically generated page-tokens
423
+ for i in range(0, max_pages):
424
+ special_tokens.append(f"<page_{i}>")
425
+
426
+ # Adding dynamically generated location-tokens
427
+ for i in range(0, max(page_dimension[0], page_dimension[1])):
428
+ special_tokens.append(f"<loc_{i}>")
429
+
430
+ return special_tokens
431
+
432
+ @staticmethod
433
+ def get_page_token(page: int):
434
+ """Function to get page tokens."""
435
+ return f"<page_{page}>"
436
+
437
+ @staticmethod
438
+ def get_location_token(val: float, rnorm: int = 100):
439
+ """Function to get location tokens."""
440
+ val_ = round(rnorm * val)
441
+
442
+ if val_ < 0:
443
+ return "<loc_0>"
444
+
445
+ if val_ > rnorm:
446
+ return f"<loc_{rnorm}>"
447
+
448
+ return f"<loc_{val_}>"
449
+
450
+
348
451
  class ExportedCCSDocument(
349
452
  MinimalDocument,
350
453
  Generic[
@@ -427,6 +530,14 @@ class ExportedCCSDocument(
427
530
  delim: str = "\n\n",
428
531
  main_text_start: int = 0,
429
532
  main_text_stop: Optional[int] = None,
533
+ main_text_labels: list[str] = [
534
+ "title",
535
+ "subtitle-level-1",
536
+ "paragraph",
537
+ "caption",
538
+ "table",
539
+ ],
540
+ strict_text: bool = False,
430
541
  ) -> str:
431
542
  r"""Serialize to Markdown.
432
543
 
@@ -461,12 +572,7 @@ class ExportedCCSDocument(
461
572
  continue
462
573
 
463
574
  item_type = item.obj_type
464
- if isinstance(item, BaseText) and item_type in {
465
- "title",
466
- "subtitle-level-1",
467
- "paragraph",
468
- "caption",
469
- }:
575
+ if isinstance(item, BaseText) and item_type in main_text_labels:
470
576
  text = item.text
471
577
 
472
578
  # ignore repeated text
@@ -477,20 +583,31 @@ class ExportedCCSDocument(
477
583
 
478
584
  # first title match
479
585
  if item_type == "title" and not has_title:
480
- markdown_text = f"# {text}"
586
+ if strict_text:
587
+ markdown_text = f"{text}"
588
+ else:
589
+ markdown_text = f"# {text}"
481
590
  has_title = True
482
591
 
483
592
  # secondary titles
484
593
  elif item_type in {"title", "subtitle-level-1"} or (
485
594
  has_title and item_type == "title"
486
595
  ):
487
- markdown_text = f"## {text}"
596
+ if strict_text:
597
+ markdown_text = f"{text}"
598
+ else:
599
+ markdown_text = f"## {text}"
488
600
 
489
601
  # normal text
490
602
  else:
491
603
  markdown_text = text
492
604
 
493
- elif isinstance(item, Table) and item.data:
605
+ elif (
606
+ isinstance(item, Table)
607
+ and item.data
608
+ and item_type in main_text_labels
609
+ and not strict_text
610
+ ):
494
611
  table = []
495
612
  for row in item.data:
496
613
  tmp = []
@@ -518,3 +635,157 @@ class ExportedCCSDocument(
518
635
 
519
636
  result = delim.join(md_texts)
520
637
  return result
638
+
639
    def export_to_document_tokens(
        self,
        delim: str = "\n\n",
        main_text_start: int = 0,
        main_text_stop: Optional[int] = None,
        main_text_labels: list[str] = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ],
        page_tagging: bool = True,
        location_tagging: bool = True,
        location_dimensions: Tuple[int, int] = (100, 100),
        add_new_line: bool = True,
    ) -> str:
        r"""Exports the document content to an DocumentToken format.

        Operates on a slice of the document's main_text as defined through arguments
        main_text_start and main_text_stop; defaulting to the whole main_text.

        Args:
            delim (str, optional): The delimiter used to separate text blocks in the
                exported XML. Default is two newline characters ("\n\n").
            main_text_start (int, optional): The starting index of the main text to
                be included in the XML. Default is 0 (the beginning of the text).
            main_text_stop (Optional[int], optional): The stopping index of the main
                text. If set to None, the export includes text up to the end.
                Default is None.
            main_text_labels (list[str], optional): A list of text labels that
                categorize the different sections of the document (e.g., "title",
                "subtitle-level-1", "paragraph", "caption"). Default labels are
                "title", "subtitle-level-1", "paragraph", and "caption".
            page_tagging (bool, optional): Whether to embed a page token inside
                each location tag. Only effective when location_tagging is on.
                Default is True.
            location_tagging (bool, optional): Determines whether to include
                location-based tagging in the XML. If True, the exported XML will
                contain information about the locations of the text elements.
                Default is True.
            location_dimensions (Tuple[int, int], optional): Specifies the dimensions
                (width and height) for the location tagging, if enabled.
                Default is [100, 100].
            add_new_line (bool, optional): Whether to add new line characters after
                each text block. If True, a new line is added after each block of
                text in the XML. Default is True.

        Returns:
            str: The content of the document formatted as an XML string.
        """
        xml_str = DocumentToken.BEG_DOCUMENT.value

        new_line = ""
        if add_new_line:
            new_line = "\n"

        if self.main_text is not None:
            for orig_item in self.main_text[main_text_start:main_text_stop]:

                # references are resolved to their target item first
                item = (
                    self._resolve_ref(orig_item)
                    if isinstance(orig_item, Ref)
                    else orig_item
                )

                if item is None:
                    continue

                prov = item.prov

                loc_str = ""  # default is zero
                if (
                    location_tagging
                    and self.page_dimensions is not None
                    and prov is not None
                    and len(prov) > 0
                ):

                    # NOTE(review): only prov[0] is used; additional prov
                    # entries are ignored for location tagging
                    page = prov[0].page
                    page_dim = self.page_dimensions[page - 1]

                    page_w = float(page_dim.width)
                    page_h = float(page_dim.height)

                    # normalize bbox corners to [0, 1] page coordinates
                    x0 = float(prov[0].bbox[0]) / float(page_w)
                    y0 = float(prov[0].bbox[1]) / float(page_h)
                    x1 = float(prov[0].bbox[2]) / float(page_w)
                    y1 = float(prov[0].bbox[3]) / float(page_h)

                    page_tok = ""
                    if page_tagging:
                        page_tok = DocumentToken.get_page_token(page=page)

                    # min/max ordering makes the tokens orientation-independent
                    x0_tok = DocumentToken.get_location_token(
                        val=min(x0, x1), rnorm=location_dimensions[0]
                    )
                    y0_tok = DocumentToken.get_location_token(
                        val=min(y0, y1), rnorm=location_dimensions[1]
                    )
                    x1_tok = DocumentToken.get_location_token(
                        val=max(x0, x1), rnorm=location_dimensions[0]
                    )
                    y1_tok = DocumentToken.get_location_token(
                        val=max(y0, y1), rnorm=location_dimensions[1]
                    )

                    # update
                    loc_str = f"{DocumentToken.BEG_LOCATION.value}"
                    loc_str += f"{page_tok}"
                    loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
                    loc_str += f"{DocumentToken.END_LOCATION.value}"

                item_type = item.obj_type
                if isinstance(item, BaseText) and (item_type in main_text_labels):
                    text = item.text

                    xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"

                elif isinstance(item, Table) and (item_type in main_text_labels):

                    xml_str += f"<{item_type}>{loc_str}"

                    # table caption, if present
                    if item.text is not None and len(item.text) > 0:
                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
                        xml_str += (
                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
                        )

                    # table body as <row_i><col_j>...</col_j></row_i>
                    if item.data is not None and len(item.data) > 0:
                        for i, row in enumerate(item.data):
                            xml_str += f"<row_{i}>"
                            for j, col in enumerate(row):
                                text = col.text
                                xml_str += f"<col_{j}>{text}</col_{j}>"

                            xml_str += f"</row_{i}>{new_line}"

                    xml_str += f"</{item_type}>{new_line}"

                elif isinstance(item, Figure) and (item_type in main_text_labels):

                    xml_str += f"<{item_type}>{loc_str}"

                    # figure caption, if present
                    if item.text is not None and len(item.text) > 0:
                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
                        xml_str += (
                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
                        )

                    xml_str += f"</{item_type}>{new_line}"

        xml_str += DocumentToken.END_DOCUMENT.value

        return xml_str
@@ -4,6 +4,7 @@
4
4
  #
5
5
 
6
6
  """Define the model Statement."""
7
+ from enum import Enum
7
8
  from typing import Generic
8
9
 
9
10
  from pydantic import Field
@@ -21,6 +22,39 @@ from docling_core.types.rec.attribute import Attribute
21
22
  from docling_core.types.rec.subject import Subject
22
23
 
23
24
 
25
class StatementToken(Enum):
    """Class to represent an LLM friendly representation of statements.

    Members are paired begin/end tags; declaration order is the order in
    which `get_special_tokens` returns the values.
    """

    # statements envelope and single statement
    BEG_STATEMENTS = "<statements>"
    END_STATEMENTS = "</statements>"

    BEG_STATEMENT = "<statement>"
    END_STATEMENT = "</statement>"

    # provenance
    BEG_PROV = "<prov>"
    END_PROV = "</prov>"

    # subject / predicate structure
    BEG_SUBJECT = "<subject>"
    END_SUBJECT = "</subject>"

    BEG_PREDICATE = "<predicate>"
    END_PREDICATE = "</predicate>"

    # predicate components
    BEG_PROPERTY = "<property>"
    END_PROPERTY = "</property>"

    BEG_VALUE = "<value>"
    END_VALUE = "</value>"

    BEG_UNIT = "<unit>"
    END_UNIT = "</unit>"

    @classmethod
    def get_special_tokens(cls):
        """Function to get all special statements tokens."""
        return [member.value for member in cls]
+
57
+
24
58
  class Statement(
25
59
  Attribute,
26
60
  Generic[
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.1.4
3
+ Version: 1.3.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -28,6 +28,7 @@ Classifier: Typing :: Typed
28
28
  Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
29
29
  Requires-Dist: jsonref (>=1.1.0,<2.0.0)
30
30
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
31
32
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
32
33
  Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
33
34
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
@@ -13,14 +13,18 @@ docling_core/search/json_schema_to_search_mapper.py,sha256=9crSFuSbcXrJej7j1rYWK
13
13
  docling_core/search/mapping.py,sha256=6rqG7LgYSeWmooKNEcRa5gFDLp1ZdzPqDGlwTA5gpOk,724
14
14
  docling_core/search/meta.py,sha256=wSurrsqdP1N3gQKx027fVdzVmc33a7Y6rPl-FClQvtA,3318
15
15
  docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75A,1841
16
+ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
17
+ docling_core/transforms/chunker/__init__.py,sha256=xZ5ELOB8tbCoJY1dKUvOrFqxYyoHmmCNUSHxrrRi8a4,317
18
+ docling_core/transforms/chunker/base.py,sha256=y1YswRxkdIaNX3Ek7asa1D__KuErRgRKcB8CZ_fQ1uM,970
19
+ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=lAeHgJ4relA1EU0YVOKeuX6mLASmA-SZ5_ChgliSCKk,11996
16
20
  docling_core/types/__init__.py,sha256=6mrAEKRW85uHJwNQBufwjPcMWCjm3oocA6MaO4_NLgg,805
17
21
  docling_core/types/base.py,sha256=fNtfQ20NKa_RBNBWbq0DfO8o0zC1Cec8UAMu0Znsltk,8170
18
22
  docling_core/types/doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
19
- docling_core/types/doc/base.py,sha256=Vwh-8Q8n9meFxbrbMUx2zNzt1JnUo3Y3Hpwmmf82IlM,5206
23
+ docling_core/types/doc/base.py,sha256=Jqw5vqiJSJPseq4TUXsARGtH5h095VnA5IJsxvcobns,5387
20
24
  docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
21
25
  docling_core/types/doc/doc_ocr.py,sha256=6PC0C-OczF-MyfgRxEI1xs3PWgNOzi7i2yEQbTqZz0I,1387
22
26
  docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
23
- docling_core/types/doc/document.py,sha256=kpnBa3cjhH0SKdDaZDUuNIFX7VnPZOHhoB2FlDhwq2g,17187
27
+ docling_core/types/doc/document.py,sha256=6puIPc3aK3kecklCFqVgJXgU4gvGPqjuXePx82poFYE,26934
24
28
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
25
29
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
26
30
  docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
@@ -31,7 +35,7 @@ docling_core/types/rec/attribute.py,sha256=PzPdaPhP5NWbFo8rYOoBl3Vfyx4zJUxN6ZpXl
31
35
  docling_core/types/rec/base.py,sha256=jhTfInNGyB9NUw7o33PElrFGL80TqhU8MLcLZNZYj3E,3222
32
36
  docling_core/types/rec/predicate.py,sha256=4iDwXl9c4jzHTDIlRNE88yvDzKA9_od0xjPUUUP5IjI,3959
33
37
  docling_core/types/rec/record.py,sha256=r1QgPepwH3YjmMHlwwmeK00ZHEJnAsvyOMeXFY_D9_Q,2750
34
- docling_core/types/rec/statement.py,sha256=BXkuKBz0BL7eiowL_aaYxsz_WBLfR4hfgiqTby4TRnk,920
38
+ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiKRdCZ5o,1701
35
39
  docling_core/types/rec/subject.py,sha256=wX9qsihwDbR7ZNSzY3vQymxi0eN1nxxsonrhSZzsMhA,2565
36
40
  docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
37
41
  docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
@@ -39,8 +43,8 @@ docling_core/utils/ds_generate_docs.py,sha256=0xGBagdC_PGjyeHXYZo90VnVrSTMZgHb0S
39
43
  docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvGqg54qBIvUMHTHdA,1647
40
44
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
41
45
  docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
42
- docling_core-1.1.4.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
43
- docling_core-1.1.4.dist-info/METADATA,sha256=nrVfDBk66tXsL8wbyBiE3XcGJcpc0TT5lnRoB41qH5Y,5393
44
- docling_core-1.1.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
45
- docling_core-1.1.4.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
46
- docling_core-1.1.4.dist-info/RECORD,,
46
+ docling_core-1.3.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
47
+ docling_core-1.3.0.dist-info/METADATA,sha256=gD3LDYHPJeRhUO7-OA21cU6EV4PKwvxrZzOjAdUcWB0,5432
48
+ docling_core-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
49
+ docling_core-1.3.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
50
+ docling_core-1.3.0.dist-info/RECORD,,