docling-core 1.6.3__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Note: this version of docling-core has been flagged as potentially problematic.

@@ -4,26 +4,48 @@
 #
 
 """Define base classes for chunking."""
+import re
 from abc import ABC, abstractmethod
-from typing import Iterator, Optional
+from typing import Final, Iterator, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field, field_validator
 
 from docling_core.types import BoundingBox, Document
+from docling_core.types.base import _JSON_POINTER_REGEX
+
+# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
+_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")
+
+
+def _create_path(pos: int, path_prefix: str = "main-text") -> str:
+    return f"#/{path_prefix}/{pos}"
 
 
 class Chunk(BaseModel):
     """Data model for Chunk."""
 
-    path: str
+    path: str = Field(pattern=_JSON_POINTER_REGEX)
     text: str
+    heading: Optional[str] = None
+
+    @field_validator("path", mode="before")
+    @classmethod
+    def _json_pointer_from_json_path(cls, path: str):
+        if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
+            groups = match.groups()
+            if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
+                return _create_path(
+                    pos=int(groups[1]),
+                    path_prefix=groups[0],
+                )
+        return path
 
 
 class ChunkWithMetadata(Chunk):
     """Data model for Chunk including metadata."""
 
-    page: Optional[int]
-    bbox: Optional[BoundingBox]
+    page: Optional[int] = None
+    bbox: Optional[BoundingBox] = None
 
 
 class BaseChunker(BaseModel, ABC):
@@ -43,3 +65,10 @@ class BaseChunker(BaseModel, ABC):
             Iterator[Chunk]: iterator over extracted chunks
         """
         raise NotImplementedError()
+
+    @classmethod
+    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
+        return _create_path(
+            pos=pos,
+            path_prefix=path_prefix,
+        )
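The `field_validator` above transparently migrates chunk paths from the deprecated JSONPath form to the new JSON Pointer fragment form. A minimal sketch of the resulting behavior (the import is the one the chunker hunks below use; the path and text values are illustrative):

    from docling_core.transforms.chunker import Chunk

    # Legacy JSONPath input is rewritten by the "before" validator:
    chunk = Chunk(path="$.main-text[84]", text="some text")
    assert chunk.path == "#/main-text/84"

    # New-style JSON Pointer input passes the Field pattern check unchanged:
    chunk = Chunk(path="#/main-text/84", text="some text")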
@@ -12,7 +12,7 @@ from enum import Enum
 from typing import Any, Iterator, Optional, Union
 
 import pandas as pd
-from pydantic import BaseModel, PositiveInt
+from pydantic import BaseModel, Field, PositiveInt
 
 from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
 from docling_core.types import BaseText
@@ -25,8 +25,17 @@ _logger = logging.getLogger(__name__)
 class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""
 
-    include_metadata: bool = True
-    min_chunk_len: PositiveInt = 64
+    heading_as_metadata: bool = Field(
+        default=False,
+        description="Whether heading should be in metadata (instead of text)",
+    )
+    include_metadata: bool = Field(
+        default=True,
+        description="Whether to include extras in the metadata",
+    )
+    min_chunk_len: PositiveInt = Field(
+        default=64, description="Minimum chunk text length to consider (in chars)"
+    )
 
     class _NodeType(str, Enum):
         PARAGRAPH = "paragraph"
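As a hedged usage sketch: only the fields above and the `BaseChunker` interface are confirmed by this diff, so the exact `chunk()` call signature is an assumption. The new `heading_as_metadata` flag moves a level-1 subtitle out of the chunk text and into the new `heading` field:

    from docling_core.transforms.chunker import HierarchicalChunker

    chunker = HierarchicalChunker(heading_as_metadata=True, min_chunk_len=64)
    for chunk in chunker.chunk(doc):  # doc: an already-parsed docling Document
        print(chunk.path, chunk.heading, chunk.text[:40])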
@@ -82,10 +91,6 @@ class HierarchicalChunker(BaseChunker):
 
         return output_text
 
-    @classmethod
-    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
-        return f"$.{path_prefix}[{pos}]"
-
     class _MainTextItemNode(BaseModel):
         parent: Optional[int] = None
         children: list[int] = []
@@ -184,7 +189,7 @@
 
     def _build_chunk_impl(
         self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
-    ) -> list[_TextEntry]:
+    ) -> tuple[list[_TextEntry], Optional[str]]:
         if doc.main_text:
             item = doc.main_text[idx]
             item_type = _HC._norm(item.obj_type)
@@ -193,7 +198,7 @@
                 item_type not in self._allowed_types
                 or item_name in self._disallowed_names_by_type.get(item_type, [])
             ):
-                return []
+                return [], None
 
             c2p = doc_map.dmap
 
@@ -219,7 +224,7 @@
                     else []
                 )
             else:
-                return []
+                return [], None
         elif isinstance(item, BaseText):
             text_entries = [
                 self._TextEntry(
@@ -248,21 +253,29 @@
                 _HC._NodeName.LIST_ITEM,
                 _HC._NodeName.SUBTITLE_LEVEL_1,
             ]:
-                return []
+                return [], None
 
             if (parent := c2p[idx].parent) is not None:
                 # prepend with ancestors
+
+                parent_res = self._build_chunk_impl(
+                    doc=doc, doc_map=doc_map, idx=parent, rec=True
+                )
                 return (
-                    self._build_chunk_impl(
-                        doc=doc, doc_map=doc_map, idx=parent, rec=True
-                    )
-                    + text_entries
+                    parent_res[0] + text_entries,  # expanded text
+                    parent_res[1],  # heading
                 )
             else:
-                # if root, augment with title (if available and different)
-                return text_entries
+                if (
+                    self.heading_as_metadata
+                    and isinstance(item, BaseText)
+                    and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
+                ):
+                    return [], text_entries[0].text
+                else:
+                    return text_entries, None
         else:
-            return []
+            return [], None
 
     def _build_chunk(
         self,
@@ -272,7 +285,9 @@
         delim: str,
         rec: bool = False,
     ) -> Optional[Chunk]:
-        texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        texts = res[0]
+        heading = res[1]
         concat = delim.join([t.text for t in texts if t.text])
         assert doc.main_text is not None
         if len(concat) >= self.min_chunk_len:
@@ -293,6 +308,7 @@
             return ChunkWithMetadata(
                 text=concat,
                 path=path,
+                heading=heading,
                 page=item.prov[0].page if item.prov else None,
                 bbox=item.prov[0].bbox if item.prov else None,
             )
@@ -300,6 +316,7 @@
             return Chunk(
                 text=concat,
                 path=path,
+                heading=heading,
             )
         else:
             return None
@@ -0,0 +1,12 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define the ID generator types."""
+
+from docling_core.transforms.id_generator.base import BaseIDGenerator  # noqa
+from docling_core.transforms.id_generator.doc_hash_id_generator import (  # noqa
+    DocHashIDGenerator,
+)
+from docling_core.transforms.id_generator.uuid_generator import UUIDGenerator  # noqa
@@ -0,0 +1,30 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Base document ID generator module."""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from docling_core.types import Document as DLDocument
+
+
+class BaseIDGenerator(ABC):
+    """Document ID generator base class."""
+
+    @abstractmethod
+    def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
+        """Generate an ID for the given document.
+
+        Args:
+            doc (DLDocument): document to generate ID for
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            str: the generated ID
+        """
+        raise NotImplementedError()
@@ -0,0 +1,27 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Doc-hash-based ID generator module."""
+
+
+from typing import Any
+
+from docling_core.transforms.id_generator import BaseIDGenerator
+from docling_core.types import Document as DLDocument
+
+
+class DocHashIDGenerator(BaseIDGenerator):
+    """Doc-hash-based ID generator class."""
+
+    def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
+        """Generate an ID for the given document.
+
+        Args:
+            doc (DLDocument): document to generate ID for
+
+        Returns:
+            str: the generated ID
+        """
+        return doc.file_info.document_hash
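Usage is straightforward: the generator simply reuses the document's own hash as its ID. A small sketch (`doc` is assumed to be an already-loaded `Document`):

    from docling_core.transforms.id_generator import DocHashIDGenerator

    id_gen = DocHashIDGenerator()
    doc_id = id_gen.generate_id(doc=doc)  # equals doc.file_info.document_hash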
@@ -0,0 +1,34 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""UUID-based ID generator module."""
+
+from random import Random
+from typing import Annotated, Any, Optional
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+from docling_core.transforms.id_generator import BaseIDGenerator
+from docling_core.types import Document as DLDocument
+
+
+class UUIDGenerator(BaseModel, BaseIDGenerator):
+    """UUID-based ID generator class."""
+
+    seed: Optional[int] = None
+    uuid_version: Annotated[int, Field(strict=True, ge=1, le=5)] = 4
+
+    def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
+        """Generate an ID for the given document.
+
+        Args:
+            doc (DLDocument): document to generate ID for
+
+        Returns:
+            str: the generated ID
+        """
+        rd = Random(x=self.seed)
+        return str(UUID(int=rd.getrandbits(128), version=self.uuid_version))
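Because a fresh `Random(x=self.seed)` is constructed inside every `generate_id()` call, a fixed seed makes the generator fully deterministic, which is handy for reproducible tests; without a seed, each call draws new random bits. A sketch (`doc` again assumed to be a loaded `Document`):

    from docling_core.transforms.id_generator import UUIDGenerator

    seeded = UUIDGenerator(seed=42)
    assert seeded.generate_id(doc=doc) == seeded.generate_id(doc=doc)  # stable

    unseeded = UUIDGenerator()
    print(unseeded.generate_id(doc=doc))  # varies per call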
@@ -0,0 +1,13 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define the metadata extractor types."""
+
+from docling_core.transforms.metadata_extractor.base import (  # noqa
+    BaseMetadataExtractor,
+)
+from docling_core.transforms.metadata_extractor.simple_metadata_extractor import (  # noqa
+    SimpleMetadataExtractor,
+)
@@ -0,0 +1,59 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Base metadata extractor module."""
+
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from pydantic import BaseModel
+
+from docling_core.types import Document as DLDocument
+
+
+class BaseMetadataExtractor(BaseModel, ABC):
+    """Metadata extractor base class."""
+
+    @abstractmethod
+    def get_metadata(
+        self, doc: DLDocument, *args: Any, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Extract metadata for the given document.
+
+        Args:
+            doc (DLDocument): document to extract metadata for
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            dict[str, Any]: the extracted metadata
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_excluded_embed_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from embedding.
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_excluded_llm_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from LLM generation.
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        raise NotImplementedError()
@@ -0,0 +1,59 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Simple metadata extractor module."""
+
+
+from typing import Any, Final
+
+from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
+from docling_core.types import Document as DLDocument
+
+_DL_DOC_HASH: Final[str] = "dl_doc_hash"
+_ORIGIN: Final[str] = "origin"
+
+
+class SimpleMetadataExtractor(BaseMetadataExtractor):
+    """Simple metadata extractor class."""
+
+    include_origin: bool = False
+
+    def get_metadata(
+        self, doc: DLDocument, origin: str, *args: Any, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Extract metadata for the given document.
+
+        Args:
+            doc (DLDocument): document to extract metadata for
+            origin (str): the document origin
+
+        Returns:
+            dict[str, Any]: the extracted metadata
+        """
+        meta: dict[str, Any] = {
+            _DL_DOC_HASH: doc.file_info.document_hash,
+        }
+        if self.include_origin:
+            meta[_ORIGIN] = origin
+        return meta
+
+    def get_excluded_embed_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from embedding.
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        excl_keys: list[str] = [_DL_DOC_HASH]
+        if self.include_origin:
+            excl_keys.append(_ORIGIN)
+        return excl_keys
+
+    def get_excluded_llm_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from LLM generation.
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        return self.get_excluded_embed_metadata_keys()
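A sketch of the extractor in use (`doc` is assumed to be a loaded `Document`; the origin string is illustrative):

    from docling_core.transforms.metadata_extractor import SimpleMetadataExtractor

    extractor = SimpleMetadataExtractor(include_origin=True)
    meta = extractor.get_metadata(doc=doc, origin="https://example.com/report.pdf")
    # meta == {"dl_doc_hash": <document hash>, "origin": "https://example.com/report.pdf"}

    # Both keys are excluded from embedding and LLM generation alike:
    assert extractor.get_excluded_embed_metadata_keys() == ["dl_doc_hash", "origin"]
    assert extractor.get_excluded_llm_metadata_keys() == ["dl_doc_hash", "origin"]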
@@ -6,7 +6,7 @@
 """Define common models across types."""
 from datetime import datetime, timezone
 from enum import Enum
-from typing import Generic, Hashable, List, Literal, Optional, TypeVar
+from typing import Final, Generic, Hashable, List, Literal, Optional, TypeVar
 
 from pydantic import (
     AfterValidator,
@@ -28,6 +28,9 @@ from docling_core.search.package import VERSION_PATTERN
 from docling_core.utils.alias import AliasModel
 from docling_core.utils.validators import validate_datetime, validate_unique_list
 
+# (subset of) JSON Pointer URI fragment id format, e.g. "#/main-text/84":
+_JSON_POINTER_REGEX: Final[str] = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"
+
 LanguageT = TypeVar("LanguageT", bound=str)
 IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
 DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)
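The regex accepts the empty fragment, a collection, or a collection plus item index, and rejects the deprecated JSONPath form (which `Chunk` migrates via its validator instead). Illustrated directly:

    import re

    ptr = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"
    assert re.match(ptr, "#/main-text/84").groups() == ("main-text", "84")
    assert re.match(ptr, "#/main-text") is not None
    assert re.match(ptr, "#") is not None
    assert re.match(ptr, "$.main-text[84]") is None  # deprecated JSONPath form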
@@ -440,7 +440,6 @@ class BaseText(BaseCell):
     ):
         """Export text element to document tokens format."""
         body = f"<{self.obj_type}>"
-        # body = f"<{self.name}>"
 
         assert DocumentToken.is_known_token(
             body
@@ -0,0 +1,30 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Package for models defined by the Document type."""
+
+from .base import BoundingBox, CoordOrigin, Size
+from .document import (
+    BasePictureData,
+    BaseTableData,
+    DescriptionItem,
+    DocItem,
+    DoclingDocument,
+    DocumentOrigin,
+    FloatingItem,
+    GroupItem,
+    ImageRef,
+    KeyValueItem,
+    NodeItem,
+    PageItem,
+    PictureItem,
+    ProvenanceItem,
+    RefItem,
+    SectionHeaderItem,
+    TableCell,
+    TableItem,
+    TextItem,
+)
+from .labels import DocItemLabel, GroupLabel, TableCellLabel
@@ -0,0 +1,167 @@
+"""Models for the base data types."""
+
+import copy
+from enum import Enum
+from typing import Tuple
+
+from pydantic import BaseModel
+
+
+class CoordOrigin(str, Enum):
+    """CoordOrigin."""
+
+    TOPLEFT = "TOPLEFT"
+    BOTTOMLEFT = "BOTTOMLEFT"
+
+
+class Size(BaseModel):
+    """Size."""
+
+    width: float = 0.0
+    height: float = 0.0
+
+    def as_tuple(self):
+        """as_tuple."""
+        return (self.width, self.height)
+
+
+class BoundingBox(BaseModel):
+    """BoundingBox."""
+
+    l: float  # left
+    t: float  # top
+    r: float  # right
+    b: float  # bottom
+
+    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
+
+    @property
+    def width(self):
+        """width."""
+        return self.r - self.l
+
+    @property
+    def height(self):
+        """height."""
+        return abs(self.t - self.b)
+
+    def scaled(self, scale: float) -> "BoundingBox":
+        """scaled.
+
+        :param scale: float:
+
+        """
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l *= scale
+        out_bbox.r *= scale
+        out_bbox.t *= scale
+        out_bbox.b *= scale
+
+        return out_bbox
+
+    def normalized(self, page_size: Size) -> "BoundingBox":
+        """normalized.
+
+        :param page_size: Size:
+
+        """
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+
+        return out_bbox
+
+    def as_tuple(self):
+        """as_tuple."""
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return (self.l, self.t, self.r, self.b)
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return (self.l, self.b, self.r, self.t)
+
+    @classmethod
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
+        """from_tuple.
+
+        :param coord: Tuple[float:
+        :param ...]:
+        :param origin: CoordOrigin:
+
+        """
+        if origin == CoordOrigin.TOPLEFT:
+            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b < t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+        elif origin == CoordOrigin.BOTTOMLEFT:
+            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b > t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+
+    def area(self) -> float:
+        """area."""
+        return (self.r - self.l) * (self.b - self.t)
+
+    def intersection_area_with(self, other: "BoundingBox") -> float:
+        """intersection_area_with.
+
+        :param other: "BoundingBox":
+
+        """
+        # Calculate intersection coordinates
+        left = max(self.l, other.l)
+        top = max(self.t, other.t)
+        right = min(self.r, other.r)
+        bottom = min(self.b, other.b)
+
+        # Calculate intersection dimensions
+        width = right - left
+        height = bottom - top
+
+        # If the bounding boxes do not overlap, width or height will be negative
+        if width <= 0 or height <= 0:
+            return 0.0
+
+        return width * height
+
+    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
+        """to_bottom_left_origin.
+
+        :param page_height:
+
+        """
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.TOPLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,
+                b=page_height - self.b,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            )
+
+    def to_top_left_origin(self, page_height):
+        """to_top_left_origin.
+
+        :param page_height:
+
+        """
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,  # self.b
+                b=page_height - self.b,  # self.t
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
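A quick sketch of the new `BoundingBox` helpers (the `docling_core.types.doc` import path is an assumption based on the package `__init__` shown above):

    from docling_core.types.doc import BoundingBox

    # Two overlapping boxes in the default TOPLEFT origin (y grows downwards):
    a = BoundingBox(l=0, t=0, r=10, b=10)
    b = BoundingBox(l=5, t=5, r=15, b=15)
    assert a.area() == 100.0
    assert a.intersection_area_with(b) == 25.0

    # Flipping to a bottom-left origin on a page of height 100:
    flipped = a.to_bottom_left_origin(page_height=100)
    assert flipped.as_tuple() == (0, 90, 10, 100)  # (l, b, r, t) ordering

Note that `area()` assumes top-left ordering (`b >= t`); for a BOTTOMLEFT box the same formula would come out negative.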