docling-core 1.6.2__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-1.6.2 → docling_core-1.7.0}/PKG-INFO +1 -1
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/search/json_schema_to_search_mapper.py +6 -4
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/transforms/chunker/base.py +3 -2
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/transforms/chunker/hierarchical_chunker.py +29 -12
- docling_core-1.7.0/docling_core/transforms/id_generator/__init__.py +12 -0
- docling_core-1.7.0/docling_core/transforms/id_generator/base.py +30 -0
- docling_core-1.7.0/docling_core/transforms/id_generator/doc_hash_id_generator.py +27 -0
- docling_core-1.7.0/docling_core/transforms/id_generator/uuid_generator.py +34 -0
- docling_core-1.7.0/docling_core/transforms/metadata_extractor/__init__.py +13 -0
- docling_core-1.7.0/docling_core/transforms/metadata_extractor/base.py +59 -0
- docling_core-1.7.0/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +61 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/doc/base.py +0 -1
- docling_core-1.7.0/docling_core/types/experimental/__init__.py +30 -0
- docling_core-1.7.0/docling_core/types/experimental/base.py +167 -0
- docling_core-1.7.0/docling_core/types/experimental/document.py +1194 -0
- docling_core-1.7.0/docling_core/types/experimental/labels.py +50 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/pyproject.toml +3 -1
- {docling_core-1.6.2 → docling_core-1.7.0}/LICENSE +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/README.md +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/__init__.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/py.typed +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/search/__init__.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/search/mapping.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/search/meta.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/search/package.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/__init__.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/base.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/doc/doc_ann.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/doc/doc_ocr.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/doc/doc_raw.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/doc/document.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/utils/alias.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/utils/ds_generate_docs.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/utils/ds_generate_jsonschema.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/utils/file.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/utils/validate.py +0 -0
- {docling_core-1.6.2 → docling_core-1.7.0}/docling_core/utils/validators.py +0 -0
{docling_core-1.6.2 → docling_core-1.7.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
@@ -8,7 +8,7 @@ import re
|
|
|
8
8
|
from copy import deepcopy
|
|
9
9
|
from typing import Any, Optional, Pattern, Tuple, TypedDict
|
|
10
10
|
|
|
11
|
-
from jsonref import
|
|
11
|
+
from jsonref import replace_refs
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class SearchIndexDefinition(TypedDict):
|
|
@@ -95,7 +95,11 @@ class JsonSchemaToSearchMapper:
|
|
|
95
95
|
which define the fields, their data types, and other specifications to index
|
|
96
96
|
JSON documents into a Lucene index.
|
|
97
97
|
"""
|
|
98
|
-
mapping =
|
|
98
|
+
mapping = deepcopy(schema)
|
|
99
|
+
|
|
100
|
+
mapping = self._suppress(mapping, self._suppress_key)
|
|
101
|
+
|
|
102
|
+
mapping = replace_refs(mapping)
|
|
99
103
|
|
|
100
104
|
mapping = self._merge_unions(mapping)
|
|
101
105
|
|
|
@@ -105,8 +109,6 @@ class JsonSchemaToSearchMapper:
|
|
|
105
109
|
|
|
106
110
|
mapping = self._remove_keys(mapping, self._rm_keys)
|
|
107
111
|
|
|
108
|
-
mapping = self._suppress(mapping, self._suppress_key)
|
|
109
|
-
|
|
110
112
|
mapping = self._translate_keys_re(mapping)
|
|
111
113
|
|
|
112
114
|
mapping = self._clean(mapping)
|
|
@@ -22,8 +22,9 @@ class Chunk(BaseModel):
|
|
|
22
22
|
class ChunkWithMetadata(Chunk):
|
|
23
23
|
"""Data model for Chunk including metadata."""
|
|
24
24
|
|
|
25
|
-
page: Optional[int]
|
|
26
|
-
bbox: Optional[BoundingBox]
|
|
25
|
+
page: Optional[int] = None
|
|
26
|
+
bbox: Optional[BoundingBox] = None
|
|
27
|
+
heading: Optional[str] = None
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
class BaseChunker(BaseModel, ABC):
|
{docling_core-1.6.2 → docling_core-1.7.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
@@ -26,6 +26,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
26
26
|
"""Chunker implementation leveraging the document layout."""
|
|
27
27
|
|
|
28
28
|
include_metadata: bool = True
|
|
29
|
+
heading_as_metadata: bool = False
|
|
29
30
|
min_chunk_len: PositiveInt = 64
|
|
30
31
|
|
|
31
32
|
class _NodeType(str, Enum):
|
|
@@ -184,7 +185,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
184
185
|
|
|
185
186
|
def _build_chunk_impl(
|
|
186
187
|
self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
|
|
187
|
-
) -> list[_TextEntry]:
|
|
188
|
+
) -> tuple[list[_TextEntry], Optional[str]]:
|
|
188
189
|
if doc.main_text:
|
|
189
190
|
item = doc.main_text[idx]
|
|
190
191
|
item_type = _HC._norm(item.obj_type)
|
|
@@ -193,7 +194,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
193
194
|
item_type not in self._allowed_types
|
|
194
195
|
or item_name in self._disallowed_names_by_type.get(item_type, [])
|
|
195
196
|
):
|
|
196
|
-
return []
|
|
197
|
+
return [], None
|
|
197
198
|
|
|
198
199
|
c2p = doc_map.dmap
|
|
199
200
|
|
|
@@ -219,7 +220,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
219
220
|
else []
|
|
220
221
|
)
|
|
221
222
|
else:
|
|
222
|
-
return []
|
|
223
|
+
return [], None
|
|
223
224
|
elif isinstance(item, BaseText):
|
|
224
225
|
text_entries = [
|
|
225
226
|
self._TextEntry(
|
|
@@ -248,21 +249,29 @@ class HierarchicalChunker(BaseChunker):
|
|
|
248
249
|
_HC._NodeName.LIST_ITEM,
|
|
249
250
|
_HC._NodeName.SUBTITLE_LEVEL_1,
|
|
250
251
|
]:
|
|
251
|
-
return []
|
|
252
|
+
return [], None
|
|
252
253
|
|
|
253
254
|
if (parent := c2p[idx].parent) is not None:
|
|
254
255
|
# prepend with ancestors
|
|
256
|
+
|
|
257
|
+
parent_res = self._build_chunk_impl(
|
|
258
|
+
doc=doc, doc_map=doc_map, idx=parent, rec=True
|
|
259
|
+
)
|
|
255
260
|
return (
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
)
|
|
259
|
-
+ text_entries
|
|
261
|
+
parent_res[0] + text_entries, # expanded text
|
|
262
|
+
parent_res[1], # heading
|
|
260
263
|
)
|
|
261
264
|
else:
|
|
262
|
-
|
|
263
|
-
|
|
265
|
+
if (
|
|
266
|
+
self.heading_as_metadata
|
|
267
|
+
and isinstance(item, BaseText)
|
|
268
|
+
and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
|
|
269
|
+
):
|
|
270
|
+
return [], text_entries[0].text
|
|
271
|
+
else:
|
|
272
|
+
return text_entries, None
|
|
264
273
|
else:
|
|
265
|
-
return []
|
|
274
|
+
return [], None
|
|
266
275
|
|
|
267
276
|
def _build_chunk(
|
|
268
277
|
self,
|
|
@@ -272,7 +281,9 @@ class HierarchicalChunker(BaseChunker):
|
|
|
272
281
|
delim: str,
|
|
273
282
|
rec: bool = False,
|
|
274
283
|
) -> Optional[Chunk]:
|
|
275
|
-
|
|
284
|
+
res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
|
|
285
|
+
texts = res[0]
|
|
286
|
+
heading = res[1]
|
|
276
287
|
concat = delim.join([t.text for t in texts if t.text])
|
|
277
288
|
assert doc.main_text is not None
|
|
278
289
|
if len(concat) >= self.min_chunk_len:
|
|
@@ -295,6 +306,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
295
306
|
path=path,
|
|
296
307
|
page=item.prov[0].page if item.prov else None,
|
|
297
308
|
bbox=item.prov[0].bbox if item.prov else None,
|
|
309
|
+
heading=heading,
|
|
298
310
|
)
|
|
299
311
|
else:
|
|
300
312
|
return Chunk(
|
|
@@ -315,6 +327,11 @@ class HierarchicalChunker(BaseChunker):
|
|
|
315
327
|
Yields:
|
|
316
328
|
Iterator[Chunk]: iterator over extracted chunks
|
|
317
329
|
"""
|
|
330
|
+
if (not self.include_metadata) and self.heading_as_metadata:
|
|
331
|
+
raise RuntimeError(
|
|
332
|
+
"To enable `heading_as_metadata`, also `include_metadata` must be True."
|
|
333
|
+
)
|
|
334
|
+
|
|
318
335
|
if dl_doc.main_text:
|
|
319
336
|
# extract doc structure incl. metadata for
|
|
320
337
|
# each item (e.g. parent, children)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the ID generator types."""
|
|
7
|
+
|
|
8
|
+
from docling_core.transforms.id_generator.base import BaseIDGenerator # noqa
|
|
9
|
+
from docling_core.transforms.id_generator.doc_hash_id_generator import ( # noqa
|
|
10
|
+
DocHashIDGenerator,
|
|
11
|
+
)
|
|
12
|
+
from docling_core.transforms.id_generator.uuid_generator import UUIDGenerator # noqa
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Base document ID generator module."""
|
|
7
|
+
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from docling_core.types import Document as DLDocument
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BaseIDGenerator(ABC):
|
|
15
|
+
"""Document ID generator base class."""
|
|
16
|
+
|
|
17
|
+
@abstractmethod
|
|
18
|
+
def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
|
|
19
|
+
"""Generate an ID for the given document.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
doc (DLDocument): document to generate ID for
|
|
23
|
+
|
|
24
|
+
Raises:
|
|
25
|
+
NotImplementedError: in this abstract implementation
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
str: the generated ID
|
|
29
|
+
"""
|
|
30
|
+
raise NotImplementedError()
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Doc-hash-based ID generator module."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from docling_core.transforms.id_generator import BaseIDGenerator
|
|
12
|
+
from docling_core.types import Document as DLDocument
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DocHashIDGenerator(BaseIDGenerator):
|
|
16
|
+
"""Doc-hash-based ID generator class."""
|
|
17
|
+
|
|
18
|
+
def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
|
|
19
|
+
"""Generate an ID for the given document.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
doc (DLDocument): document to generate ID for
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
str: the generated ID
|
|
26
|
+
"""
|
|
27
|
+
return doc.file_info.document_hash
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""UUID-based ID generator module."""
|
|
7
|
+
|
|
8
|
+
from random import Random
|
|
9
|
+
from typing import Annotated, Any, Optional
|
|
10
|
+
from uuid import UUID
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, Field
|
|
13
|
+
|
|
14
|
+
from docling_core.transforms.id_generator import BaseIDGenerator
|
|
15
|
+
from docling_core.types import Document as DLDocument
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class UUIDGenerator(BaseModel, BaseIDGenerator):
|
|
19
|
+
"""UUID-based ID generator class."""
|
|
20
|
+
|
|
21
|
+
seed: Optional[int] = None
|
|
22
|
+
uuid_version: Annotated[int, Field(strict=True, ge=1, le=5)] = 4
|
|
23
|
+
|
|
24
|
+
def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
|
|
25
|
+
"""Generate an ID for the given document.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
doc (DLDocument): document to generate ID for
|
|
29
|
+
|
|
30
|
+
Returns:
|
|
31
|
+
str: the generated ID
|
|
32
|
+
"""
|
|
33
|
+
rd = Random(x=self.seed)
|
|
34
|
+
return str(UUID(int=rd.getrandbits(128), version=self.uuid_version))
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the metadata extractor types."""
|
|
7
|
+
|
|
8
|
+
from docling_core.transforms.metadata_extractor.base import ( # noqa
|
|
9
|
+
BaseMetadataExtractor,
|
|
10
|
+
)
|
|
11
|
+
from docling_core.transforms.metadata_extractor.simple_metadata_extractor import ( # noqa
|
|
12
|
+
SimpleMetadataExtractor,
|
|
13
|
+
)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Base metadata extractor module."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
|
|
14
|
+
from docling_core.types import Document as DLDocument
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BaseMetadataExtractor(BaseModel, ABC):
|
|
18
|
+
"""Metadata extractor base class."""
|
|
19
|
+
|
|
20
|
+
@abstractmethod
|
|
21
|
+
def get_metadata(
|
|
22
|
+
self, doc: DLDocument, *args: Any, **kwargs: Any
|
|
23
|
+
) -> dict[str, Any]:
|
|
24
|
+
"""Extract metadata for the given document.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
doc (DLDocument): document to extract metadata for
|
|
28
|
+
|
|
29
|
+
Raises:
|
|
30
|
+
NotImplementedError: in this abstract implementation
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
dict[str, Any]: the extracted metadata
|
|
34
|
+
"""
|
|
35
|
+
raise NotImplementedError()
|
|
36
|
+
|
|
37
|
+
@abstractmethod
|
|
38
|
+
def get_excluded_embed_metadata_keys(self) -> list[str]:
|
|
39
|
+
"""Get metadata keys to exclude from embedding.
|
|
40
|
+
|
|
41
|
+
Raises:
|
|
42
|
+
NotImplementedError: in this abstract implementation
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
list[str]: the metadata to exclude
|
|
46
|
+
"""
|
|
47
|
+
raise NotImplementedError()
|
|
48
|
+
|
|
49
|
+
@abstractmethod
|
|
50
|
+
def get_excluded_llm_metadata_keys(self) -> list[str]:
|
|
51
|
+
"""Get metadata keys to exclude from LLM generation.
|
|
52
|
+
|
|
53
|
+
Raises:
|
|
54
|
+
NotImplementedError: in this abstract implementation
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
list[str]: the metadata to exclude
|
|
58
|
+
"""
|
|
59
|
+
raise NotImplementedError()
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Simple metadata extractor module."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
|
|
13
|
+
from docling_core.types import Document as DLDocument
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SimpleMetadataExtractor(BaseMetadataExtractor):
|
|
17
|
+
"""Simple metadata extractor class."""
|
|
18
|
+
|
|
19
|
+
class _Keys(str, Enum):
|
|
20
|
+
DL_DOC_HASH = "dl_doc_hash"
|
|
21
|
+
ORIGIN = "origin"
|
|
22
|
+
|
|
23
|
+
include_origin: bool = False
|
|
24
|
+
|
|
25
|
+
def get_metadata(
|
|
26
|
+
self, doc: DLDocument, origin: str, *args: Any, **kwargs: Any
|
|
27
|
+
) -> dict[str, Any]:
|
|
28
|
+
"""Extract metadata for the given document.
|
|
29
|
+
|
|
30
|
+
Args:
|
|
31
|
+
doc (DLDocument): document to extract metadata for
|
|
32
|
+
origin (str): the document origin
|
|
33
|
+
|
|
34
|
+
Returns:
|
|
35
|
+
dict[str, Any]: the extracted metadata
|
|
36
|
+
"""
|
|
37
|
+
meta: dict[str, Any] = {
|
|
38
|
+
self._Keys.DL_DOC_HASH: doc.file_info.document_hash,
|
|
39
|
+
}
|
|
40
|
+
if self.include_origin:
|
|
41
|
+
meta[self._Keys.ORIGIN] = origin
|
|
42
|
+
return meta
|
|
43
|
+
|
|
44
|
+
def get_excluded_embed_metadata_keys(self) -> list[str]:
|
|
45
|
+
"""Get metadata keys to exclude from embedding.
|
|
46
|
+
|
|
47
|
+
Returns:
|
|
48
|
+
list[str]: the metadata to exclude
|
|
49
|
+
"""
|
|
50
|
+
excl_keys: list[str] = [self._Keys.DL_DOC_HASH]
|
|
51
|
+
if self.include_origin:
|
|
52
|
+
excl_keys.append(self._Keys.ORIGIN)
|
|
53
|
+
return excl_keys
|
|
54
|
+
|
|
55
|
+
def get_excluded_llm_metadata_keys(self) -> list[str]:
|
|
56
|
+
"""Get metadata keys to exclude from LLM generation.
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
list[str]: the metadata to exclude
|
|
60
|
+
"""
|
|
61
|
+
return self.get_excluded_embed_metadata_keys()
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Package for models defined by the Document type."""
|
|
7
|
+
|
|
8
|
+
from .base import BoundingBox, CoordOrigin, Size
|
|
9
|
+
from .document import (
|
|
10
|
+
BasePictureData,
|
|
11
|
+
BaseTableData,
|
|
12
|
+
DescriptionItem,
|
|
13
|
+
DocItem,
|
|
14
|
+
DoclingDocument,
|
|
15
|
+
DocumentOrigin,
|
|
16
|
+
FloatingItem,
|
|
17
|
+
GroupItem,
|
|
18
|
+
ImageRef,
|
|
19
|
+
KeyValueItem,
|
|
20
|
+
NodeItem,
|
|
21
|
+
PageItem,
|
|
22
|
+
PictureItem,
|
|
23
|
+
ProvenanceItem,
|
|
24
|
+
RefItem,
|
|
25
|
+
SectionHeaderItem,
|
|
26
|
+
TableCell,
|
|
27
|
+
TableItem,
|
|
28
|
+
TextItem,
|
|
29
|
+
)
|
|
30
|
+
from .labels import DocItemLabel, GroupLabel, TableCellLabel
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Models for the base data types."""
|
|
2
|
+
|
|
3
|
+
import copy
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Tuple
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CoordOrigin(str, Enum):
|
|
11
|
+
"""CoordOrigin."""
|
|
12
|
+
|
|
13
|
+
TOPLEFT = "TOPLEFT"
|
|
14
|
+
BOTTOMLEFT = "BOTTOMLEFT"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Size(BaseModel):
|
|
18
|
+
"""Size."""
|
|
19
|
+
|
|
20
|
+
width: float = 0.0
|
|
21
|
+
height: float = 0.0
|
|
22
|
+
|
|
23
|
+
def as_tuple(self):
|
|
24
|
+
"""as_tuple."""
|
|
25
|
+
return (self.width, self.height)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BoundingBox(BaseModel):
|
|
29
|
+
"""BoundingBox."""
|
|
30
|
+
|
|
31
|
+
l: float # left
|
|
32
|
+
t: float # top
|
|
33
|
+
r: float # right
|
|
34
|
+
b: float # bottom
|
|
35
|
+
|
|
36
|
+
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
|
|
37
|
+
|
|
38
|
+
@property
|
|
39
|
+
def width(self):
|
|
40
|
+
"""width."""
|
|
41
|
+
return self.r - self.l
|
|
42
|
+
|
|
43
|
+
@property
|
|
44
|
+
def height(self):
|
|
45
|
+
"""height."""
|
|
46
|
+
return abs(self.t - self.b)
|
|
47
|
+
|
|
48
|
+
def scaled(self, scale: float) -> "BoundingBox":
|
|
49
|
+
"""scaled.
|
|
50
|
+
|
|
51
|
+
:param scale: float:
|
|
52
|
+
|
|
53
|
+
"""
|
|
54
|
+
out_bbox = copy.deepcopy(self)
|
|
55
|
+
out_bbox.l *= scale
|
|
56
|
+
out_bbox.r *= scale
|
|
57
|
+
out_bbox.t *= scale
|
|
58
|
+
out_bbox.b *= scale
|
|
59
|
+
|
|
60
|
+
return out_bbox
|
|
61
|
+
|
|
62
|
+
def normalized(self, page_size: Size) -> "BoundingBox":
|
|
63
|
+
"""normalized.
|
|
64
|
+
|
|
65
|
+
:param page_size: Size:
|
|
66
|
+
|
|
67
|
+
"""
|
|
68
|
+
out_bbox = copy.deepcopy(self)
|
|
69
|
+
out_bbox.l /= page_size.width
|
|
70
|
+
out_bbox.r /= page_size.width
|
|
71
|
+
out_bbox.t /= page_size.height
|
|
72
|
+
out_bbox.b /= page_size.height
|
|
73
|
+
|
|
74
|
+
return out_bbox
|
|
75
|
+
|
|
76
|
+
def as_tuple(self):
|
|
77
|
+
"""as_tuple."""
|
|
78
|
+
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
79
|
+
return (self.l, self.t, self.r, self.b)
|
|
80
|
+
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
81
|
+
return (self.l, self.b, self.r, self.t)
|
|
82
|
+
|
|
83
|
+
@classmethod
|
|
84
|
+
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
|
|
85
|
+
"""from_tuple.
|
|
86
|
+
|
|
87
|
+
:param coord: Tuple[float:
|
|
88
|
+
:param ...]:
|
|
89
|
+
:param origin: CoordOrigin:
|
|
90
|
+
|
|
91
|
+
"""
|
|
92
|
+
if origin == CoordOrigin.TOPLEFT:
|
|
93
|
+
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
|
94
|
+
if r < l:
|
|
95
|
+
l, r = r, l
|
|
96
|
+
if b < t:
|
|
97
|
+
b, t = t, b
|
|
98
|
+
|
|
99
|
+
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
|
100
|
+
elif origin == CoordOrigin.BOTTOMLEFT:
|
|
101
|
+
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
|
|
102
|
+
if r < l:
|
|
103
|
+
l, r = r, l
|
|
104
|
+
if b > t:
|
|
105
|
+
b, t = t, b
|
|
106
|
+
|
|
107
|
+
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
|
108
|
+
|
|
109
|
+
def area(self) -> float:
|
|
110
|
+
"""area."""
|
|
111
|
+
return (self.r - self.l) * (self.b - self.t)
|
|
112
|
+
|
|
113
|
+
def intersection_area_with(self, other: "BoundingBox") -> float:
|
|
114
|
+
"""intersection_area_with.
|
|
115
|
+
|
|
116
|
+
:param other: "BoundingBox":
|
|
117
|
+
|
|
118
|
+
"""
|
|
119
|
+
# Calculate intersection coordinates
|
|
120
|
+
left = max(self.l, other.l)
|
|
121
|
+
top = max(self.t, other.t)
|
|
122
|
+
right = min(self.r, other.r)
|
|
123
|
+
bottom = min(self.b, other.b)
|
|
124
|
+
|
|
125
|
+
# Calculate intersection dimensions
|
|
126
|
+
width = right - left
|
|
127
|
+
height = bottom - top
|
|
128
|
+
|
|
129
|
+
# If the bounding boxes do not overlap, width or height will be negative
|
|
130
|
+
if width <= 0 or height <= 0:
|
|
131
|
+
return 0.0
|
|
132
|
+
|
|
133
|
+
return width * height
|
|
134
|
+
|
|
135
|
+
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
|
|
136
|
+
"""to_bottom_left_origin.
|
|
137
|
+
|
|
138
|
+
:param page_height:
|
|
139
|
+
|
|
140
|
+
"""
|
|
141
|
+
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
142
|
+
return self
|
|
143
|
+
elif self.coord_origin == CoordOrigin.TOPLEFT:
|
|
144
|
+
return BoundingBox(
|
|
145
|
+
l=self.l,
|
|
146
|
+
r=self.r,
|
|
147
|
+
t=page_height - self.t,
|
|
148
|
+
b=page_height - self.b,
|
|
149
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
def to_top_left_origin(self, page_height):
|
|
153
|
+
"""to_top_left_origin.
|
|
154
|
+
|
|
155
|
+
:param page_height:
|
|
156
|
+
|
|
157
|
+
"""
|
|
158
|
+
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
159
|
+
return self
|
|
160
|
+
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
161
|
+
return BoundingBox(
|
|
162
|
+
l=self.l,
|
|
163
|
+
r=self.r,
|
|
164
|
+
t=page_height - self.t, # self.b
|
|
165
|
+
b=page_height - self.b, # self.t
|
|
166
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
|
167
|
+
)
|