docling-core 1.7.0__tar.gz → 1.7.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-1.7.0 → docling_core-1.7.1}/PKG-INFO +1 -1
- docling_core-1.7.1/docling_core/transforms/chunker/base.py +74 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/chunker/hierarchical_chunker.py +14 -14
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +8 -10
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/base.py +4 -1
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/experimental/document.py +1 -3
- {docling_core-1.7.0 → docling_core-1.7.1}/pyproject.toml +1 -1
- docling_core-1.7.0/docling_core/transforms/chunker/base.py +0 -46
- {docling_core-1.7.0 → docling_core-1.7.1}/LICENSE +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/README.md +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/py.typed +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/mapping.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/meta.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/package.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/id_generator/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/id_generator/base.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/id_generator/uuid_generator.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/metadata_extractor/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/metadata_extractor/base.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/doc_ann.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/doc_ocr.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/doc_raw.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/document.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/experimental/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/experimental/base.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/experimental/labels.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/alias.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/ds_generate_docs.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/ds_generate_jsonschema.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/file.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/validate.py +0 -0
- {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/validators.py +0 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define base classes for chunking."""
|
|
7
|
+
import re
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import Final, Iterator, Optional
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, Field, field_validator
|
|
12
|
+
|
|
13
|
+
from docling_core.types import BoundingBox, Document
|
|
14
|
+
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
15
|
+
|
|
16
|
+
# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
|
|
17
|
+
_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _create_path(pos: int, path_prefix: str = "main-text") -> str:
|
|
21
|
+
return f"#/{path_prefix}/{pos}"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Chunk(BaseModel):
|
|
25
|
+
"""Data model for Chunk."""
|
|
26
|
+
|
|
27
|
+
path: str = Field(pattern=_JSON_POINTER_REGEX)
|
|
28
|
+
text: str
|
|
29
|
+
heading: Optional[str] = None
|
|
30
|
+
|
|
31
|
+
@field_validator("path", mode="before")
|
|
32
|
+
@classmethod
|
|
33
|
+
def _json_pointer_from_json_path(cls, path: str):
|
|
34
|
+
if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
|
|
35
|
+
groups = match.groups()
|
|
36
|
+
if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
|
|
37
|
+
return _create_path(
|
|
38
|
+
pos=int(groups[1]),
|
|
39
|
+
path_prefix=groups[0],
|
|
40
|
+
)
|
|
41
|
+
return path
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class ChunkWithMetadata(Chunk):
|
|
45
|
+
"""Data model for Chunk including metadata."""
|
|
46
|
+
|
|
47
|
+
page: Optional[int] = None
|
|
48
|
+
bbox: Optional[BoundingBox] = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class BaseChunker(BaseModel, ABC):
|
|
52
|
+
"""Base class for Chunker."""
|
|
53
|
+
|
|
54
|
+
@abstractmethod
|
|
55
|
+
def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
|
|
56
|
+
"""Chunk the provided document.
|
|
57
|
+
|
|
58
|
+
Args:
|
|
59
|
+
dl_doc (Document): document to chunk
|
|
60
|
+
|
|
61
|
+
Raises:
|
|
62
|
+
NotImplementedError: in this abstract implementation
|
|
63
|
+
|
|
64
|
+
Yields:
|
|
65
|
+
Iterator[Chunk]: iterator over extracted chunks
|
|
66
|
+
"""
|
|
67
|
+
raise NotImplementedError()
|
|
68
|
+
|
|
69
|
+
@classmethod
|
|
70
|
+
def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
|
|
71
|
+
return _create_path(
|
|
72
|
+
pos=pos,
|
|
73
|
+
path_prefix=path_prefix,
|
|
74
|
+
)
|
{docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
@@ -12,7 +12,7 @@ from enum import Enum
|
|
|
12
12
|
from typing import Any, Iterator, Optional, Union
|
|
13
13
|
|
|
14
14
|
import pandas as pd
|
|
15
|
-
from pydantic import BaseModel, PositiveInt
|
|
15
|
+
from pydantic import BaseModel, Field, PositiveInt
|
|
16
16
|
|
|
17
17
|
from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
|
|
18
18
|
from docling_core.types import BaseText
|
|
@@ -25,9 +25,17 @@ _logger = logging.getLogger(__name__)
|
|
|
25
25
|
class HierarchicalChunker(BaseChunker):
|
|
26
26
|
"""Chunker implementation leveraging the document layout."""
|
|
27
27
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
28
|
+
heading_as_metadata: bool = Field(
|
|
29
|
+
default=False,
|
|
30
|
+
description="Whether heading should be in metadata (instead of text)",
|
|
31
|
+
)
|
|
32
|
+
include_metadata: bool = Field(
|
|
33
|
+
default=True,
|
|
34
|
+
description="Whether to include extras in the metadata",
|
|
35
|
+
)
|
|
36
|
+
min_chunk_len: PositiveInt = Field(
|
|
37
|
+
default=64, description="Minimum chunk text length to consider (in chars)"
|
|
38
|
+
)
|
|
31
39
|
|
|
32
40
|
class _NodeType(str, Enum):
|
|
33
41
|
PARAGRAPH = "paragraph"
|
|
@@ -83,10 +91,6 @@ class HierarchicalChunker(BaseChunker):
|
|
|
83
91
|
|
|
84
92
|
return output_text
|
|
85
93
|
|
|
86
|
-
@classmethod
|
|
87
|
-
def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
|
|
88
|
-
return f"$.{path_prefix}[{pos}]"
|
|
89
|
-
|
|
90
94
|
class _MainTextItemNode(BaseModel):
|
|
91
95
|
parent: Optional[int] = None
|
|
92
96
|
children: list[int] = []
|
|
@@ -304,14 +308,15 @@ class HierarchicalChunker(BaseChunker):
|
|
|
304
308
|
return ChunkWithMetadata(
|
|
305
309
|
text=concat,
|
|
306
310
|
path=path,
|
|
311
|
+
heading=heading,
|
|
307
312
|
page=item.prov[0].page if item.prov else None,
|
|
308
313
|
bbox=item.prov[0].bbox if item.prov else None,
|
|
309
|
-
heading=heading,
|
|
310
314
|
)
|
|
311
315
|
else:
|
|
312
316
|
return Chunk(
|
|
313
317
|
text=concat,
|
|
314
318
|
path=path,
|
|
319
|
+
heading=heading,
|
|
315
320
|
)
|
|
316
321
|
else:
|
|
317
322
|
return None
|
|
@@ -327,11 +332,6 @@ class HierarchicalChunker(BaseChunker):
|
|
|
327
332
|
Yields:
|
|
328
333
|
Iterator[Chunk]: iterator over extracted chunks
|
|
329
334
|
"""
|
|
330
|
-
if (not self.include_metadata) and self.heading_as_metadata:
|
|
331
|
-
raise RuntimeError(
|
|
332
|
-
"To enable `heading_as_metadata`, also `include_metadata` must be True."
|
|
333
|
-
)
|
|
334
|
-
|
|
335
335
|
if dl_doc.main_text:
|
|
336
336
|
# extract doc structure incl. metadata for
|
|
337
337
|
# each item (e.g. parent, children)
|
|
@@ -6,20 +6,18 @@
|
|
|
6
6
|
"""Simple metadata extractor module."""
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
from enum import Enum
|
|
10
|
-
from typing import Any
|
|
9
|
+
from typing import Any, Final
|
|
11
10
|
|
|
12
11
|
from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
|
|
13
12
|
from docling_core.types import Document as DLDocument
|
|
14
13
|
|
|
14
|
+
_DL_DOC_HASH: Final[str] = "dl_doc_hash"
|
|
15
|
+
_ORIGIN: Final[str] = "origin"
|
|
16
|
+
|
|
15
17
|
|
|
16
18
|
class SimpleMetadataExtractor(BaseMetadataExtractor):
|
|
17
19
|
"""Simple metadata extractor class."""
|
|
18
20
|
|
|
19
|
-
class _Keys(str, Enum):
|
|
20
|
-
DL_DOC_HASH = "dl_doc_hash"
|
|
21
|
-
ORIGIN = "origin"
|
|
22
|
-
|
|
23
21
|
include_origin: bool = False
|
|
24
22
|
|
|
25
23
|
def get_metadata(
|
|
@@ -35,10 +33,10 @@ class SimpleMetadataExtractor(BaseMetadataExtractor):
|
|
|
35
33
|
dict[str, Any]: the extracted metadata
|
|
36
34
|
"""
|
|
37
35
|
meta: dict[str, Any] = {
|
|
38
|
-
|
|
36
|
+
_DL_DOC_HASH: doc.file_info.document_hash,
|
|
39
37
|
}
|
|
40
38
|
if self.include_origin:
|
|
41
|
-
meta[self._Keys.ORIGIN] = origin
|
|
39
|
+
meta[_ORIGIN] = origin
|
|
42
40
|
return meta
|
|
43
41
|
|
|
44
42
|
def get_excluded_embed_metadata_keys(self) -> list[str]:
|
|
@@ -47,9 +45,9 @@ class SimpleMetadataExtractor(BaseMetadataExtractor):
|
|
|
47
45
|
Returns:
|
|
48
46
|
list[str]: the metadata to exclude
|
|
49
47
|
"""
|
|
50
|
-
excl_keys: list[str] = [self._Keys.DL_DOC_HASH]
|
|
48
|
+
excl_keys: list[str] = [_DL_DOC_HASH]
|
|
51
49
|
if self.include_origin:
|
|
52
|
-
excl_keys.append(self._Keys.ORIGIN)
|
|
50
|
+
excl_keys.append(_ORIGIN)
|
|
53
51
|
return excl_keys
|
|
54
52
|
|
|
55
53
|
def get_excluded_llm_metadata_keys(self) -> list[str]:
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"""Define common models across types."""
|
|
7
7
|
from datetime import datetime, timezone
|
|
8
8
|
from enum import Enum
|
|
9
|
-
from typing import Generic, Hashable, List, Literal, Optional, TypeVar
|
|
9
|
+
from typing import Final, Generic, Hashable, List, Literal, Optional, TypeVar
|
|
10
10
|
|
|
11
11
|
from pydantic import (
|
|
12
12
|
AfterValidator,
|
|
@@ -28,6 +28,9 @@ from docling_core.search.package import VERSION_PATTERN
|
|
|
28
28
|
from docling_core.utils.alias import AliasModel
|
|
29
29
|
from docling_core.utils.validators import validate_datetime, validate_unique_list
|
|
30
30
|
|
|
31
|
+
# (subset of) JSON Pointer URI fragment id format, e.g. "#/main-text/84":
|
|
32
|
+
_JSON_POINTER_REGEX: Final[str] = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"
|
|
33
|
+
|
|
31
34
|
LanguageT = TypeVar("LanguageT", bound=str)
|
|
32
35
|
IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
|
|
33
36
|
DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)
|
|
@@ -20,6 +20,7 @@ from tabulate import tabulate
|
|
|
20
20
|
from typing_extensions import Annotated
|
|
21
21
|
|
|
22
22
|
from docling_core.search.package import VERSION_PATTERN
|
|
23
|
+
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
23
24
|
from docling_core.types.doc.tokens import DocumentToken
|
|
24
25
|
from docling_core.types.experimental import BoundingBox, Size
|
|
25
26
|
from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
|
|
@@ -28,9 +29,6 @@ Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
|
28
29
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
29
30
|
CURRENT_VERSION: Final = "1.0.0"
|
|
30
31
|
|
|
31
|
-
# (subset of) JSON Pointer URI fragment identifier format:
|
|
32
|
-
_JSON_POINTER_REGEX = r"^#(/[\w\-]+(/\d+)?)?$"
|
|
33
|
-
|
|
34
32
|
|
|
35
33
|
class BasePictureData(BaseModel): # TBD
|
|
36
34
|
"""BasePictureData."""
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""Define base classes for chunking."""
|
|
7
|
-
from abc import ABC, abstractmethod
|
|
8
|
-
from typing import Iterator, Optional
|
|
9
|
-
|
|
10
|
-
from pydantic import BaseModel
|
|
11
|
-
|
|
12
|
-
from docling_core.types import BoundingBox, Document
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class Chunk(BaseModel):
|
|
16
|
-
"""Data model for Chunk."""
|
|
17
|
-
|
|
18
|
-
path: str
|
|
19
|
-
text: str
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class ChunkWithMetadata(Chunk):
|
|
23
|
-
"""Data model for Chunk including metadata."""
|
|
24
|
-
|
|
25
|
-
page: Optional[int] = None
|
|
26
|
-
bbox: Optional[BoundingBox] = None
|
|
27
|
-
heading: Optional[str] = None
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class BaseChunker(BaseModel, ABC):
|
|
31
|
-
"""Base class for Chunker."""
|
|
32
|
-
|
|
33
|
-
@abstractmethod
|
|
34
|
-
def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
|
|
35
|
-
"""Chunk the provided document.
|
|
36
|
-
|
|
37
|
-
Args:
|
|
38
|
-
dl_doc (Document): document to chunk
|
|
39
|
-
|
|
40
|
-
Raises:
|
|
41
|
-
NotImplementedError: in this abstract implementation
|
|
42
|
-
|
|
43
|
-
Yields:
|
|
44
|
-
Iterator[Chunk]: iterator over extracted chunks
|
|
45
|
-
"""
|
|
46
|
-
raise NotImplementedError()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/id_generator/uuid_generator.py
RENAMED
|
File without changes
|
{docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/metadata_extractor/__init__.py
RENAMED
|
File without changes
|
{docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/metadata_extractor/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|