docling-core 1.6.3 → 1.7.1 (docling_core-1.6.3-py3-none-any.whl → docling_core-1.7.1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling-core might be problematic. (The source page links to more details here; the link target was not captured in this extract.)
- docling_core/transforms/chunker/base.py +34 -5
- docling_core/transforms/chunker/hierarchical_chunker.py +36 -19
- docling_core/transforms/id_generator/__init__.py +12 -0
- docling_core/transforms/id_generator/base.py +30 -0
- docling_core/transforms/id_generator/doc_hash_id_generator.py +27 -0
- docling_core/transforms/id_generator/uuid_generator.py +34 -0
- docling_core/transforms/metadata_extractor/__init__.py +13 -0
- docling_core/transforms/metadata_extractor/base.py +59 -0
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +59 -0
- docling_core/types/base.py +4 -1
- docling_core/types/doc/base.py +0 -1
- docling_core/types/experimental/__init__.py +30 -0
- docling_core/types/experimental/base.py +167 -0
- docling_core/types/experimental/document.py +1192 -0
- docling_core/types/experimental/labels.py +50 -0
- {docling_core-1.6.3.dist-info → docling_core-1.7.1.dist-info}/METADATA +1 -1
- {docling_core-1.6.3.dist-info → docling_core-1.7.1.dist-info}/RECORD +20 -9
- {docling_core-1.6.3.dist-info → docling_core-1.7.1.dist-info}/LICENSE +0 -0
- {docling_core-1.6.3.dist-info → docling_core-1.7.1.dist-info}/WHEEL +0 -0
- {docling_core-1.6.3.dist-info → docling_core-1.7.1.dist-info}/entry_points.txt +0 -0
docling_core/transforms/chunker/base.py
CHANGED
|
@@ -4,26 +4,48 @@
|
|
|
4
4
|
#
|
|
5
5
|
|
|
6
6
|
"""Define base classes for chunking."""
|
|
7
|
+
import re
|
|
7
8
|
from abc import ABC, abstractmethod
|
|
8
|
-
from typing import Iterator, Optional
|
|
9
|
+
from typing import Final, Iterator, Optional
|
|
9
10
|
|
|
10
|
-
from pydantic import BaseModel
|
|
11
|
+
from pydantic import BaseModel, Field, field_validator
|
|
11
12
|
|
|
12
13
|
from docling_core.types import BoundingBox, Document
|
|
14
|
+
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
15
|
+
|
|
16
|
+
# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
|
|
17
|
+
_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _create_path(pos: int, path_prefix: str = "main-text") -> str:
|
|
21
|
+
return f"#/{path_prefix}/{pos}"
|
|
13
22
|
|
|
14
23
|
|
|
15
24
|
class Chunk(BaseModel):
|
|
16
25
|
"""Data model for Chunk."""
|
|
17
26
|
|
|
18
|
-
path: str
|
|
27
|
+
path: str = Field(pattern=_JSON_POINTER_REGEX)
|
|
19
28
|
text: str
|
|
29
|
+
heading: Optional[str] = None
|
|
30
|
+
|
|
31
|
+
@field_validator("path", mode="before")
|
|
32
|
+
@classmethod
|
|
33
|
+
def _json_pointer_from_json_path(cls, path: str):
|
|
34
|
+
if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
|
|
35
|
+
groups = match.groups()
|
|
36
|
+
if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
|
|
37
|
+
return _create_path(
|
|
38
|
+
pos=int(groups[1]),
|
|
39
|
+
path_prefix=groups[0],
|
|
40
|
+
)
|
|
41
|
+
return path
|
|
20
42
|
|
|
21
43
|
|
|
22
44
|
class ChunkWithMetadata(Chunk):
|
|
23
45
|
"""Data model for Chunk including metadata."""
|
|
24
46
|
|
|
25
|
-
page: Optional[int]
|
|
26
|
-
bbox: Optional[BoundingBox]
|
|
47
|
+
page: Optional[int] = None
|
|
48
|
+
bbox: Optional[BoundingBox] = None
|
|
27
49
|
|
|
28
50
|
|
|
29
51
|
class BaseChunker(BaseModel, ABC):
|
|
@@ -43,3 +65,10 @@ class BaseChunker(BaseModel, ABC):
|
|
|
43
65
|
Iterator[Chunk]: iterator over extracted chunks
|
|
44
66
|
"""
|
|
45
67
|
raise NotImplementedError()
|
|
68
|
+
|
|
69
|
+
@classmethod
|
|
70
|
+
def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
|
|
71
|
+
return _create_path(
|
|
72
|
+
pos=pos,
|
|
73
|
+
path_prefix=path_prefix,
|
|
74
|
+
)
|
docling_core/transforms/chunker/hierarchical_chunker.py
CHANGED
|
@@ -12,7 +12,7 @@ from enum import Enum
|
|
|
12
12
|
from typing import Any, Iterator, Optional, Union
|
|
13
13
|
|
|
14
14
|
import pandas as pd
|
|
15
|
-
from pydantic import BaseModel, PositiveInt
|
|
15
|
+
from pydantic import BaseModel, Field, PositiveInt
|
|
16
16
|
|
|
17
17
|
from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
|
|
18
18
|
from docling_core.types import BaseText
|
|
@@ -25,8 +25,17 @@ _logger = logging.getLogger(__name__)
|
|
|
25
25
|
class HierarchicalChunker(BaseChunker):
|
|
26
26
|
"""Chunker implementation leveraging the document layout."""
|
|
27
27
|
|
|
28
|
-
|
|
29
|
-
|
|
28
|
+
heading_as_metadata: bool = Field(
|
|
29
|
+
default=False,
|
|
30
|
+
description="Whether heading should be in metadata (instead of text)",
|
|
31
|
+
)
|
|
32
|
+
include_metadata: bool = Field(
|
|
33
|
+
default=True,
|
|
34
|
+
description="Whether to include extras in the metadata",
|
|
35
|
+
)
|
|
36
|
+
min_chunk_len: PositiveInt = Field(
|
|
37
|
+
default=64, description="Minimum chunk text length to consider (in chars)"
|
|
38
|
+
)
|
|
30
39
|
|
|
31
40
|
class _NodeType(str, Enum):
|
|
32
41
|
PARAGRAPH = "paragraph"
|
|
@@ -82,10 +91,6 @@ class HierarchicalChunker(BaseChunker):
|
|
|
82
91
|
|
|
83
92
|
return output_text
|
|
84
93
|
|
|
85
|
-
@classmethod
|
|
86
|
-
def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
|
|
87
|
-
return f"$.{path_prefix}[{pos}]"
|
|
88
|
-
|
|
89
94
|
class _MainTextItemNode(BaseModel):
|
|
90
95
|
parent: Optional[int] = None
|
|
91
96
|
children: list[int] = []
|
|
@@ -184,7 +189,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
184
189
|
|
|
185
190
|
def _build_chunk_impl(
|
|
186
191
|
self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
|
|
187
|
-
) -> list[_TextEntry]:
|
|
192
|
+
) -> tuple[list[_TextEntry], Optional[str]]:
|
|
188
193
|
if doc.main_text:
|
|
189
194
|
item = doc.main_text[idx]
|
|
190
195
|
item_type = _HC._norm(item.obj_type)
|
|
@@ -193,7 +198,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
193
198
|
item_type not in self._allowed_types
|
|
194
199
|
or item_name in self._disallowed_names_by_type.get(item_type, [])
|
|
195
200
|
):
|
|
196
|
-
return []
|
|
201
|
+
return [], None
|
|
197
202
|
|
|
198
203
|
c2p = doc_map.dmap
|
|
199
204
|
|
|
@@ -219,7 +224,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
219
224
|
else []
|
|
220
225
|
)
|
|
221
226
|
else:
|
|
222
|
-
return []
|
|
227
|
+
return [], None
|
|
223
228
|
elif isinstance(item, BaseText):
|
|
224
229
|
text_entries = [
|
|
225
230
|
self._TextEntry(
|
|
@@ -248,21 +253,29 @@ class HierarchicalChunker(BaseChunker):
|
|
|
248
253
|
_HC._NodeName.LIST_ITEM,
|
|
249
254
|
_HC._NodeName.SUBTITLE_LEVEL_1,
|
|
250
255
|
]:
|
|
251
|
-
return []
|
|
256
|
+
return [], None
|
|
252
257
|
|
|
253
258
|
if (parent := c2p[idx].parent) is not None:
|
|
254
259
|
# prepend with ancestors
|
|
260
|
+
|
|
261
|
+
parent_res = self._build_chunk_impl(
|
|
262
|
+
doc=doc, doc_map=doc_map, idx=parent, rec=True
|
|
263
|
+
)
|
|
255
264
|
return (
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
)
|
|
259
|
-
+ text_entries
|
|
265
|
+
parent_res[0] + text_entries, # expanded text
|
|
266
|
+
parent_res[1], # heading
|
|
260
267
|
)
|
|
261
268
|
else:
|
|
262
|
-
|
|
263
|
-
|
|
269
|
+
if (
|
|
270
|
+
self.heading_as_metadata
|
|
271
|
+
and isinstance(item, BaseText)
|
|
272
|
+
and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
|
|
273
|
+
):
|
|
274
|
+
return [], text_entries[0].text
|
|
275
|
+
else:
|
|
276
|
+
return text_entries, None
|
|
264
277
|
else:
|
|
265
|
-
return []
|
|
278
|
+
return [], None
|
|
266
279
|
|
|
267
280
|
def _build_chunk(
|
|
268
281
|
self,
|
|
@@ -272,7 +285,9 @@ class HierarchicalChunker(BaseChunker):
|
|
|
272
285
|
delim: str,
|
|
273
286
|
rec: bool = False,
|
|
274
287
|
) -> Optional[Chunk]:
|
|
275
|
-
|
|
288
|
+
res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
|
|
289
|
+
texts = res[0]
|
|
290
|
+
heading = res[1]
|
|
276
291
|
concat = delim.join([t.text for t in texts if t.text])
|
|
277
292
|
assert doc.main_text is not None
|
|
278
293
|
if len(concat) >= self.min_chunk_len:
|
|
@@ -293,6 +308,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
293
308
|
return ChunkWithMetadata(
|
|
294
309
|
text=concat,
|
|
295
310
|
path=path,
|
|
311
|
+
heading=heading,
|
|
296
312
|
page=item.prov[0].page if item.prov else None,
|
|
297
313
|
bbox=item.prov[0].bbox if item.prov else None,
|
|
298
314
|
)
|
|
@@ -300,6 +316,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
300
316
|
return Chunk(
|
|
301
317
|
text=concat,
|
|
302
318
|
path=path,
|
|
319
|
+
heading=heading,
|
|
303
320
|
)
|
|
304
321
|
else:
|
|
305
322
|
return None
|
docling_core/transforms/id_generator/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the ID generator types."""
|
|
7
|
+
|
|
8
|
+
from docling_core.transforms.id_generator.base import BaseIDGenerator # noqa
|
|
9
|
+
from docling_core.transforms.id_generator.doc_hash_id_generator import ( # noqa
|
|
10
|
+
DocHashIDGenerator,
|
|
11
|
+
)
|
|
12
|
+
from docling_core.transforms.id_generator.uuid_generator import UUIDGenerator # noqa
|
docling_core/transforms/id_generator/base.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Base document ID generator module."""
|
|
7
|
+
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from docling_core.types import Document as DLDocument
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BaseIDGenerator(ABC):
|
|
15
|
+
"""Document ID generator base class."""
|
|
16
|
+
|
|
17
|
+
@abstractmethod
|
|
18
|
+
def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
|
|
19
|
+
"""Generate an ID for the given document.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
doc (DLDocument): document to generate ID for
|
|
23
|
+
|
|
24
|
+
Raises:
|
|
25
|
+
NotImplementedError: in this abstract implementation
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
str: the generated ID
|
|
29
|
+
"""
|
|
30
|
+
raise NotImplementedError()
|
docling_core/transforms/id_generator/doc_hash_id_generator.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Doc-hash-based ID generator module."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from docling_core.transforms.id_generator import BaseIDGenerator
|
|
12
|
+
from docling_core.types import Document as DLDocument
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DocHashIDGenerator(BaseIDGenerator):
|
|
16
|
+
"""Doc-hash-based ID generator class."""
|
|
17
|
+
|
|
18
|
+
def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
|
|
19
|
+
"""Generate an ID for the given document.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
doc (DLDocument): document to generate ID for
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
str: the generated ID
|
|
26
|
+
"""
|
|
27
|
+
return doc.file_info.document_hash
|
docling_core/transforms/id_generator/uuid_generator.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""UUID-based ID generator module."""
|
|
7
|
+
|
|
8
|
+
from random import Random
|
|
9
|
+
from typing import Annotated, Any, Optional
|
|
10
|
+
from uuid import UUID
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, Field
|
|
13
|
+
|
|
14
|
+
from docling_core.transforms.id_generator import BaseIDGenerator
|
|
15
|
+
from docling_core.types import Document as DLDocument
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class UUIDGenerator(BaseModel, BaseIDGenerator):
|
|
19
|
+
"""UUID-based ID generator class."""
|
|
20
|
+
|
|
21
|
+
seed: Optional[int] = None
|
|
22
|
+
uuid_version: Annotated[int, Field(strict=True, ge=1, le=5)] = 4
|
|
23
|
+
|
|
24
|
+
def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
|
|
25
|
+
"""Generate an ID for the given document.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
doc (DLDocument): document to generate ID for
|
|
29
|
+
|
|
30
|
+
Returns:
|
|
31
|
+
str: the generated ID
|
|
32
|
+
"""
|
|
33
|
+
rd = Random(x=self.seed)
|
|
34
|
+
return str(UUID(int=rd.getrandbits(128), version=self.uuid_version))
|
docling_core/transforms/metadata_extractor/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the metadata extractor types."""
|
|
7
|
+
|
|
8
|
+
from docling_core.transforms.metadata_extractor.base import ( # noqa
|
|
9
|
+
BaseMetadataExtractor,
|
|
10
|
+
)
|
|
11
|
+
from docling_core.transforms.metadata_extractor.simple_metadata_extractor import ( # noqa
|
|
12
|
+
SimpleMetadataExtractor,
|
|
13
|
+
)
|
docling_core/transforms/metadata_extractor/base.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Base metadata extractor module."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
|
|
14
|
+
from docling_core.types import Document as DLDocument
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BaseMetadataExtractor(BaseModel, ABC):
|
|
18
|
+
"""Metadata extractor base class."""
|
|
19
|
+
|
|
20
|
+
@abstractmethod
|
|
21
|
+
def get_metadata(
|
|
22
|
+
self, doc: DLDocument, *args: Any, **kwargs: Any
|
|
23
|
+
) -> dict[str, Any]:
|
|
24
|
+
"""Extract metadata for the given document.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
doc (DLDocument): document to extract metadata for
|
|
28
|
+
|
|
29
|
+
Raises:
|
|
30
|
+
NotImplementedError: in this abstract implementation
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
dict[str, Any]: the extracted metadata
|
|
34
|
+
"""
|
|
35
|
+
raise NotImplementedError()
|
|
36
|
+
|
|
37
|
+
@abstractmethod
|
|
38
|
+
def get_excluded_embed_metadata_keys(self) -> list[str]:
|
|
39
|
+
"""Get metadata keys to exclude from embedding.
|
|
40
|
+
|
|
41
|
+
Raises:
|
|
42
|
+
NotImplementedError: in this abstract implementation
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
list[str]: the metadata to exclude
|
|
46
|
+
"""
|
|
47
|
+
raise NotImplementedError()
|
|
48
|
+
|
|
49
|
+
@abstractmethod
|
|
50
|
+
def get_excluded_llm_metadata_keys(self) -> list[str]:
|
|
51
|
+
"""Get metadata keys to exclude from LLM generation.
|
|
52
|
+
|
|
53
|
+
Raises:
|
|
54
|
+
NotImplementedError: in this abstract implementation
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
list[str]: the metadata to exclude
|
|
58
|
+
"""
|
|
59
|
+
raise NotImplementedError()
|
docling_core/transforms/metadata_extractor/simple_metadata_extractor.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Simple metadata extractor module."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
from typing import Any, Final
|
|
10
|
+
|
|
11
|
+
from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
|
|
12
|
+
from docling_core.types import Document as DLDocument
|
|
13
|
+
|
|
14
|
+
_DL_DOC_HASH: Final[str] = "dl_doc_hash"
|
|
15
|
+
_ORIGIN: Final[str] = "origin"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SimpleMetadataExtractor(BaseMetadataExtractor):
|
|
19
|
+
"""Simple metadata extractor class."""
|
|
20
|
+
|
|
21
|
+
include_origin: bool = False
|
|
22
|
+
|
|
23
|
+
def get_metadata(
|
|
24
|
+
self, doc: DLDocument, origin: str, *args: Any, **kwargs: Any
|
|
25
|
+
) -> dict[str, Any]:
|
|
26
|
+
"""Extract metadata for the given document.
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
doc (DLDocument): document to extract metadata for
|
|
30
|
+
origin (str): the document origin
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
dict[str, Any]: the extracted metadata
|
|
34
|
+
"""
|
|
35
|
+
meta: dict[str, Any] = {
|
|
36
|
+
_DL_DOC_HASH: doc.file_info.document_hash,
|
|
37
|
+
}
|
|
38
|
+
if self.include_origin:
|
|
39
|
+
meta[_ORIGIN] = origin
|
|
40
|
+
return meta
|
|
41
|
+
|
|
42
|
+
def get_excluded_embed_metadata_keys(self) -> list[str]:
|
|
43
|
+
"""Get metadata keys to exclude from embedding.
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
list[str]: the metadata to exclude
|
|
47
|
+
"""
|
|
48
|
+
excl_keys: list[str] = [_DL_DOC_HASH]
|
|
49
|
+
if self.include_origin:
|
|
50
|
+
excl_keys.append(_ORIGIN)
|
|
51
|
+
return excl_keys
|
|
52
|
+
|
|
53
|
+
def get_excluded_llm_metadata_keys(self) -> list[str]:
|
|
54
|
+
"""Get metadata keys to exclude from LLM generation.
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
list[str]: the metadata to exclude
|
|
58
|
+
"""
|
|
59
|
+
return self.get_excluded_embed_metadata_keys()
|
docling_core/types/base.py
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"""Define common models across types."""
|
|
7
7
|
from datetime import datetime, timezone
|
|
8
8
|
from enum import Enum
|
|
9
|
-
from typing import Generic, Hashable, List, Literal, Optional, TypeVar
|
|
9
|
+
from typing import Final, Generic, Hashable, List, Literal, Optional, TypeVar
|
|
10
10
|
|
|
11
11
|
from pydantic import (
|
|
12
12
|
AfterValidator,
|
|
@@ -28,6 +28,9 @@ from docling_core.search.package import VERSION_PATTERN
|
|
|
28
28
|
from docling_core.utils.alias import AliasModel
|
|
29
29
|
from docling_core.utils.validators import validate_datetime, validate_unique_list
|
|
30
30
|
|
|
31
|
+
# (subset of) JSON Pointer URI fragment id format, e.g. "#/main-text/84":
|
|
32
|
+
_JSON_POINTER_REGEX: Final[str] = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"
|
|
33
|
+
|
|
31
34
|
LanguageT = TypeVar("LanguageT", bound=str)
|
|
32
35
|
IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
|
|
33
36
|
DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)
|
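docling_core/types/doc/base.py
CHANGED
(hunk not captured in this extract; the file list above records +0 -1 for this file)
docling_core/types/experimental/__init__.py
ADDED
|
@@ -0,0 +1,30 @@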
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Package for models defined by the Document type."""
|
|
7
|
+
|
|
8
|
+
from .base import BoundingBox, CoordOrigin, Size
|
|
9
|
+
from .document import (
|
|
10
|
+
BasePictureData,
|
|
11
|
+
BaseTableData,
|
|
12
|
+
DescriptionItem,
|
|
13
|
+
DocItem,
|
|
14
|
+
DoclingDocument,
|
|
15
|
+
DocumentOrigin,
|
|
16
|
+
FloatingItem,
|
|
17
|
+
GroupItem,
|
|
18
|
+
ImageRef,
|
|
19
|
+
KeyValueItem,
|
|
20
|
+
NodeItem,
|
|
21
|
+
PageItem,
|
|
22
|
+
PictureItem,
|
|
23
|
+
ProvenanceItem,
|
|
24
|
+
RefItem,
|
|
25
|
+
SectionHeaderItem,
|
|
26
|
+
TableCell,
|
|
27
|
+
TableItem,
|
|
28
|
+
TextItem,
|
|
29
|
+
)
|
|
30
|
+
from .labels import DocItemLabel, GroupLabel, TableCellLabel
|
docling_core/types/experimental/base.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Models for the base data types."""
|
|
2
|
+
|
|
3
|
+
import copy
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Tuple
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CoordOrigin(str, Enum):
|
|
11
|
+
"""CoordOrigin."""
|
|
12
|
+
|
|
13
|
+
TOPLEFT = "TOPLEFT"
|
|
14
|
+
BOTTOMLEFT = "BOTTOMLEFT"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Size(BaseModel):
|
|
18
|
+
"""Size."""
|
|
19
|
+
|
|
20
|
+
width: float = 0.0
|
|
21
|
+
height: float = 0.0
|
|
22
|
+
|
|
23
|
+
def as_tuple(self):
|
|
24
|
+
"""as_tuple."""
|
|
25
|
+
return (self.width, self.height)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BoundingBox(BaseModel):
|
|
29
|
+
"""BoundingBox."""
|
|
30
|
+
|
|
31
|
+
l: float # left
|
|
32
|
+
t: float # top
|
|
33
|
+
r: float # right
|
|
34
|
+
b: float # bottom
|
|
35
|
+
|
|
36
|
+
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
|
|
37
|
+
|
|
38
|
+
@property
|
|
39
|
+
def width(self):
|
|
40
|
+
"""width."""
|
|
41
|
+
return self.r - self.l
|
|
42
|
+
|
|
43
|
+
@property
|
|
44
|
+
def height(self):
|
|
45
|
+
"""height."""
|
|
46
|
+
return abs(self.t - self.b)
|
|
47
|
+
|
|
48
|
+
def scaled(self, scale: float) -> "BoundingBox":
|
|
49
|
+
"""scaled.
|
|
50
|
+
|
|
51
|
+
:param scale: float:
|
|
52
|
+
|
|
53
|
+
"""
|
|
54
|
+
out_bbox = copy.deepcopy(self)
|
|
55
|
+
out_bbox.l *= scale
|
|
56
|
+
out_bbox.r *= scale
|
|
57
|
+
out_bbox.t *= scale
|
|
58
|
+
out_bbox.b *= scale
|
|
59
|
+
|
|
60
|
+
return out_bbox
|
|
61
|
+
|
|
62
|
+
def normalized(self, page_size: Size) -> "BoundingBox":
|
|
63
|
+
"""normalized.
|
|
64
|
+
|
|
65
|
+
:param page_size: Size:
|
|
66
|
+
|
|
67
|
+
"""
|
|
68
|
+
out_bbox = copy.deepcopy(self)
|
|
69
|
+
out_bbox.l /= page_size.width
|
|
70
|
+
out_bbox.r /= page_size.width
|
|
71
|
+
out_bbox.t /= page_size.height
|
|
72
|
+
out_bbox.b /= page_size.height
|
|
73
|
+
|
|
74
|
+
return out_bbox
|
|
75
|
+
|
|
76
|
+
def as_tuple(self):
|
|
77
|
+
"""as_tuple."""
|
|
78
|
+
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
79
|
+
return (self.l, self.t, self.r, self.b)
|
|
80
|
+
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
81
|
+
return (self.l, self.b, self.r, self.t)
|
|
82
|
+
|
|
83
|
+
@classmethod
|
|
84
|
+
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
|
|
85
|
+
"""from_tuple.
|
|
86
|
+
|
|
87
|
+
:param coord: Tuple[float:
|
|
88
|
+
:param ...]:
|
|
89
|
+
:param origin: CoordOrigin:
|
|
90
|
+
|
|
91
|
+
"""
|
|
92
|
+
if origin == CoordOrigin.TOPLEFT:
|
|
93
|
+
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
|
94
|
+
if r < l:
|
|
95
|
+
l, r = r, l
|
|
96
|
+
if b < t:
|
|
97
|
+
b, t = t, b
|
|
98
|
+
|
|
99
|
+
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
|
100
|
+
elif origin == CoordOrigin.BOTTOMLEFT:
|
|
101
|
+
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
|
|
102
|
+
if r < l:
|
|
103
|
+
l, r = r, l
|
|
104
|
+
if b > t:
|
|
105
|
+
b, t = t, b
|
|
106
|
+
|
|
107
|
+
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
|
108
|
+
|
|
109
|
+
def area(self) -> float:
|
|
110
|
+
"""area."""
|
|
111
|
+
return (self.r - self.l) * (self.b - self.t)
|
|
112
|
+
|
|
113
|
+
def intersection_area_with(self, other: "BoundingBox") -> float:
|
|
114
|
+
"""intersection_area_with.
|
|
115
|
+
|
|
116
|
+
:param other: "BoundingBox":
|
|
117
|
+
|
|
118
|
+
"""
|
|
119
|
+
# Calculate intersection coordinates
|
|
120
|
+
left = max(self.l, other.l)
|
|
121
|
+
top = max(self.t, other.t)
|
|
122
|
+
right = min(self.r, other.r)
|
|
123
|
+
bottom = min(self.b, other.b)
|
|
124
|
+
|
|
125
|
+
# Calculate intersection dimensions
|
|
126
|
+
width = right - left
|
|
127
|
+
height = bottom - top
|
|
128
|
+
|
|
129
|
+
# If the bounding boxes do not overlap, width or height will be negative
|
|
130
|
+
if width <= 0 or height <= 0:
|
|
131
|
+
return 0.0
|
|
132
|
+
|
|
133
|
+
return width * height
|
|
134
|
+
|
|
135
|
+
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
|
|
136
|
+
"""to_bottom_left_origin.
|
|
137
|
+
|
|
138
|
+
:param page_height:
|
|
139
|
+
|
|
140
|
+
"""
|
|
141
|
+
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
142
|
+
return self
|
|
143
|
+
elif self.coord_origin == CoordOrigin.TOPLEFT:
|
|
144
|
+
return BoundingBox(
|
|
145
|
+
l=self.l,
|
|
146
|
+
r=self.r,
|
|
147
|
+
t=page_height - self.t,
|
|
148
|
+
b=page_height - self.b,
|
|
149
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
def to_top_left_origin(self, page_height):
|
|
153
|
+
"""to_top_left_origin.
|
|
154
|
+
|
|
155
|
+
:param page_height:
|
|
156
|
+
|
|
157
|
+
"""
|
|
158
|
+
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
159
|
+
return self
|
|
160
|
+
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
161
|
+
return BoundingBox(
|
|
162
|
+
l=self.l,
|
|
163
|
+
r=self.r,
|
|
164
|
+
t=page_height - self.t, # self.b
|
|
165
|
+
b=page_height - self.b, # self.t
|
|
166
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
|
167
|
+
)
|