docling-core 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release.

@@ -8,7 +8,7 @@ import re
 from copy import deepcopy
 from typing import Any, Optional, Pattern, Tuple, TypedDict

-from jsonref import JsonRef
+from jsonref import replace_refs


 class SearchIndexDefinition(TypedDict):
@@ -95,7 +95,11 @@ class JsonSchemaToSearchMapper:
         which define the fields, their data types, and other specifications to index
         JSON documents into a Lucene index.
         """
-        mapping = JsonRef.replace_refs(schema)
+        mapping = deepcopy(schema)
+
+        mapping = self._suppress(mapping, self._suppress_key)
+
+        mapping = replace_refs(mapping)

         mapping = self._merge_unions(mapping)

@@ -105,8 +109,6 @@ class JsonSchemaToSearchMapper:

         mapping = self._remove_keys(mapping, self._rm_keys)

-        mapping = self._suppress(mapping, self._suppress_key)
-
         mapping = self._translate_keys_re(mapping)

         mapping = self._clean(mapping)
@@ -22,8 +22,9 @@ class Chunk(BaseModel):
 class ChunkWithMetadata(Chunk):
     """Data model for Chunk including metadata."""

-    page: Optional[int]
-    bbox: Optional[BoundingBox]
+    page: Optional[int] = None
+    bbox: Optional[BoundingBox] = None
+    heading: Optional[str] = None


 class BaseChunker(BaseModel, ABC):
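With the added "= None" defaults, page and bbox become truly optional under pydantic v2 (which, unlike v1, treats Optional fields without a default as required), and the new heading field defaults to unset. A small sketch, assuming the chunker package re-exports ChunkWithMetadata and that Chunk's required fields are path and text, as used later in this diff:

from docling_core.transforms.chunker import ChunkWithMetadata

chunk = ChunkWithMetadata(path="#/main-text/0", text="Lorem ipsum")
assert chunk.page is None and chunk.bbox is None and chunk.heading is None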
@@ -26,6 +26,7 @@ class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""

     include_metadata: bool = True
+    heading_as_metadata: bool = False
     min_chunk_len: PositiveInt = 64

     class _NodeType(str, Enum):
@@ -184,7 +185,7 @@ class HierarchicalChunker(BaseChunker):

     def _build_chunk_impl(
         self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
-    ) -> list[_TextEntry]:
+    ) -> tuple[list[_TextEntry], Optional[str]]:
         if doc.main_text:
             item = doc.main_text[idx]
             item_type = _HC._norm(item.obj_type)
@@ -193,7 +194,7 @@ class HierarchicalChunker(BaseChunker):
                 item_type not in self._allowed_types
                 or item_name in self._disallowed_names_by_type.get(item_type, [])
             ):
-                return []
+                return [], None

             c2p = doc_map.dmap

@@ -219,7 +220,7 @@ class HierarchicalChunker(BaseChunker):
                        else []
                    )
                else:
-                    return []
+                    return [], None
            elif isinstance(item, BaseText):
                text_entries = [
                    self._TextEntry(
@@ -248,21 +249,29 @@ class HierarchicalChunker(BaseChunker):
                    _HC._NodeName.LIST_ITEM,
                    _HC._NodeName.SUBTITLE_LEVEL_1,
                ]:
-                    return []
+                    return [], None

                if (parent := c2p[idx].parent) is not None:
                    # prepend with ancestors
+
+                    parent_res = self._build_chunk_impl(
+                        doc=doc, doc_map=doc_map, idx=parent, rec=True
+                    )
                    return (
-                        self._build_chunk_impl(
-                            doc=doc, doc_map=doc_map, idx=parent, rec=True
-                        )
-                        + text_entries
+                        parent_res[0] + text_entries,  # expanded text
+                        parent_res[1],  # heading
                    )
                else:
-                    # if root, augment with title (if available and different)
-                    return text_entries
+                    if (
+                        self.heading_as_metadata
+                        and isinstance(item, BaseText)
+                        and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
+                    ):
+                        return [], text_entries[0].text
+                    else:
+                        return text_entries, None
        else:
-            return []
+            return [], None

    def _build_chunk(
        self,
@@ -272,7 +281,9 @@ class HierarchicalChunker(BaseChunker):
        delim: str,
        rec: bool = False,
    ) -> Optional[Chunk]:
-        texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        texts = res[0]
+        heading = res[1]
        concat = delim.join([t.text for t in texts if t.text])
        assert doc.main_text is not None
        if len(concat) >= self.min_chunk_len:
@@ -295,6 +306,7 @@ class HierarchicalChunker(BaseChunker):
                    path=path,
                    page=item.prov[0].page if item.prov else None,
                    bbox=item.prov[0].bbox if item.prov else None,
+                    heading=heading,
                )
            else:
                return Chunk(
@@ -315,6 +327,11 @@ class HierarchicalChunker(BaseChunker):
        Yields:
            Iterator[Chunk]: iterator over extracted chunks
        """
+        if (not self.include_metadata) and self.heading_as_metadata:
+            raise RuntimeError(
+                "To enable `heading_as_metadata`, also `include_metadata` must be True."
+            )
+
        if dl_doc.main_text:
            # extract doc structure incl. metadata for
            # each item (e.g. parent, children)
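Putting the chunker changes together: _build_chunk_impl now carries the root heading up the ancestor chain and surfaces it on the chunk when heading_as_metadata is enabled, which in turn requires include_metadata (per the RuntimeError above). A usage sketch, assuming doc is a docling Document loaded elsewhere and that the chunker package exports HierarchicalChunker:

from docling_core.transforms.chunker import HierarchicalChunker

chunker = HierarchicalChunker(
    include_metadata=True,     # must stay True when heading_as_metadata is set
    heading_as_metadata=True,  # section headings land on chunk.heading
)
for chunk in chunker.chunk(dl_doc=doc):
    print(chunk.heading, "->", chunk.text[:40])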
@@ -0,0 +1,12 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define the ID generator types."""
+
+from docling_core.transforms.id_generator.base import BaseIDGenerator  # noqa
+from docling_core.transforms.id_generator.doc_hash_id_generator import (  # noqa
+    DocHashIDGenerator,
+)
+from docling_core.transforms.id_generator.uuid_generator import UUIDGenerator  # noqa
@@ -0,0 +1,30 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Base document ID generator module."""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from docling_core.types import Document as DLDocument
+
+
+class BaseIDGenerator(ABC):
+    """Document ID generator base class."""
+
+    @abstractmethod
+    def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
+        """Generate an ID for the given document.
+
+        Args:
+            doc (DLDocument): document to generate ID for
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            str: the generated ID
+        """
+        raise NotImplementedError()
@@ -0,0 +1,27 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Doc-hash-based ID generator module."""
+
+
+from typing import Any
+
+from docling_core.transforms.id_generator import BaseIDGenerator
+from docling_core.types import Document as DLDocument
+
+
+class DocHashIDGenerator(BaseIDGenerator):
+    """Doc-hash-based ID generator class."""
+
+    def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
+        """Generate an ID for the given document.
+
+        Args:
+            doc (DLDocument): document to generate ID for
+
+        Returns:
+            str: the generated ID
+        """
+        return doc.file_info.document_hash
@@ -0,0 +1,34 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""UUID-based ID generator module."""
+
+from random import Random
+from typing import Annotated, Any, Optional
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+from docling_core.transforms.id_generator import BaseIDGenerator
+from docling_core.types import Document as DLDocument
+
+
+class UUIDGenerator(BaseModel, BaseIDGenerator):
+    """UUID-based ID generator class."""
+
+    seed: Optional[int] = None
+    uuid_version: Annotated[int, Field(strict=True, ge=1, le=5)] = 4
+
+    def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
+        """Generate an ID for the given document.
+
+        Args:
+            doc (DLDocument): document to generate ID for
+
+        Returns:
+            str: the generated ID
+        """
+        rd = Random(x=self.seed)
+        return str(UUID(int=rd.getrandbits(128), version=self.uuid_version))
@@ -0,0 +1,13 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define the metadata extractor types."""
+
+from docling_core.transforms.metadata_extractor.base import (  # noqa
+    BaseMetadataExtractor,
+)
+from docling_core.transforms.metadata_extractor.simple_metadata_extractor import (  # noqa
+    SimpleMetadataExtractor,
+)
@@ -0,0 +1,59 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Base metadata extractor module."""
+
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from pydantic import BaseModel
+
+from docling_core.types import Document as DLDocument
+
+
+class BaseMetadataExtractor(BaseModel, ABC):
+    """Metadata extractor base class."""
+
+    @abstractmethod
+    def get_metadata(
+        self, doc: DLDocument, *args: Any, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Extract metadata for the given document.
+
+        Args:
+            doc (DLDocument): document to extract metadata for
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            dict[str, Any]: the extracted metadata
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_excluded_embed_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from embedding.
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_excluded_llm_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from LLM generation.
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        raise NotImplementedError()
@@ -0,0 +1,61 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Simple metadata extractor module."""
+
+
+from enum import Enum
+from typing import Any
+
+from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
+from docling_core.types import Document as DLDocument
+
+
+class SimpleMetadataExtractor(BaseMetadataExtractor):
+    """Simple metadata extractor class."""
+
+    class _Keys(str, Enum):
+        DL_DOC_HASH = "dl_doc_hash"
+        ORIGIN = "origin"
+
+    include_origin: bool = False
+
+    def get_metadata(
+        self, doc: DLDocument, origin: str, *args: Any, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Extract metadata for the given document.
+
+        Args:
+            doc (DLDocument): document to extract metadata for
+            origin (str): the document origin
+
+        Returns:
+            dict[str, Any]: the extracted metadata
+        """
+        meta: dict[str, Any] = {
+            self._Keys.DL_DOC_HASH: doc.file_info.document_hash,
+        }
+        if self.include_origin:
+            meta[self._Keys.ORIGIN] = origin
+        return meta
+
+    def get_excluded_embed_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from embedding.
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        excl_keys: list[str] = [self._Keys.DL_DOC_HASH]
+        if self.include_origin:
+            excl_keys.append(self._Keys.ORIGIN)
+        return excl_keys
+
+    def get_excluded_llm_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from LLM generation.
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        return self.get_excluded_embed_metadata_keys()
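SimpleMetadataExtractor pairs with the chunking flow above: it emits the document hash (plus, optionally, the origin) and reports the same keys as excluded from both embedding and LLM prompts. A sketch, assuming doc is a loaded docling Document:

from docling_core.transforms.metadata_extractor import SimpleMetadataExtractor

extractor = SimpleMetadataExtractor(include_origin=True)
meta = extractor.get_metadata(doc, origin="https://example.com/report.pdf")
# keys compare equal to "dl_doc_hash" and "origin" (_Keys is a str-based Enum)
print(extractor.get_excluded_embed_metadata_keys())
print(extractor.get_excluded_llm_metadata_keys())  # identical by delegation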
@@ -440,7 +440,6 @@ class BaseText(BaseCell):
    ):
        """Export text element to document tokens format."""
        body = f"<{self.obj_type}>"
-        # body = f"<{self.name}>"

        assert DocumentToken.is_known_token(
            body
@@ -0,0 +1,30 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Package for models defined by the Document type."""
+
+from .base import BoundingBox, CoordOrigin, Size
+from .document import (
+    BasePictureData,
+    BaseTableData,
+    DescriptionItem,
+    DocItem,
+    DoclingDocument,
+    DocumentOrigin,
+    FloatingItem,
+    GroupItem,
+    ImageRef,
+    KeyValueItem,
+    NodeItem,
+    PageItem,
+    PictureItem,
+    ProvenanceItem,
+    RefItem,
+    SectionHeaderItem,
+    TableCell,
+    TableItem,
+    TextItem,
+)
+from .labels import DocItemLabel, GroupLabel, TableCellLabel
@@ -0,0 +1,167 @@
+"""Models for the base data types."""
+
+import copy
+from enum import Enum
+from typing import Tuple
+
+from pydantic import BaseModel
+
+
+class CoordOrigin(str, Enum):
+    """CoordOrigin."""
+
+    TOPLEFT = "TOPLEFT"
+    BOTTOMLEFT = "BOTTOMLEFT"
+
+
+class Size(BaseModel):
+    """Size."""
+
+    width: float = 0.0
+    height: float = 0.0
+
+    def as_tuple(self):
+        """as_tuple."""
+        return (self.width, self.height)
+
+
+class BoundingBox(BaseModel):
+    """BoundingBox."""
+
+    l: float  # left
+    t: float  # top
+    r: float  # right
+    b: float  # bottom
+
+    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
+
+    @property
+    def width(self):
+        """width."""
+        return self.r - self.l
+
+    @property
+    def height(self):
+        """height."""
+        return abs(self.t - self.b)
+
+    def scaled(self, scale: float) -> "BoundingBox":
+        """scaled.
+
+        :param scale: float:
+
+        """
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l *= scale
+        out_bbox.r *= scale
+        out_bbox.t *= scale
+        out_bbox.b *= scale
+
+        return out_bbox
+
+    def normalized(self, page_size: Size) -> "BoundingBox":
+        """normalized.
+
+        :param page_size: Size:
+
+        """
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+
+        return out_bbox
+
+    def as_tuple(self):
+        """as_tuple."""
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return (self.l, self.t, self.r, self.b)
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return (self.l, self.b, self.r, self.t)
+
+    @classmethod
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
+        """from_tuple.
+
+        :param coord: Tuple[float:
+        :param ...]:
+        :param origin: CoordOrigin:
+
+        """
+        if origin == CoordOrigin.TOPLEFT:
+            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b < t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+        elif origin == CoordOrigin.BOTTOMLEFT:
+            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b > t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+
+    def area(self) -> float:
+        """area."""
+        return (self.r - self.l) * (self.b - self.t)
+
+    def intersection_area_with(self, other: "BoundingBox") -> float:
+        """intersection_area_with.
+
+        :param other: "BoundingBox":
+
+        """
+        # Calculate intersection coordinates
+        left = max(self.l, other.l)
+        top = max(self.t, other.t)
+        right = min(self.r, other.r)
+        bottom = min(self.b, other.b)
+
+        # Calculate intersection dimensions
+        width = right - left
+        height = bottom - top
+
+        # If the bounding boxes do not overlap, width or height will be negative
+        if width <= 0 or height <= 0:
+            return 0.0
+
+        return width * height
+
+    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
+        """to_bottom_left_origin.
+
+        :param page_height:
+
+        """
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.TOPLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,
+                b=page_height - self.b,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            )
+
+    def to_top_left_origin(self, page_height):
+        """to_top_left_origin.
+
+        :param page_height:
+
+        """
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,  # self.b
+                b=page_height - self.b,  # self.t
+                coord_origin=CoordOrigin.TOPLEFT,
+            )