docling-core 2.31.1__py3-none-any.whl → 2.31.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/hybrid_chunker.py +22 -7
- {docling_core-2.31.1.dist-info → docling_core-2.31.2.dist-info}/METADATA +1 -1
- {docling_core-2.31.1.dist-info → docling_core-2.31.2.dist-info}/RECORD +6 -6
- {docling_core-2.31.1.dist-info → docling_core-2.31.2.dist-info}/LICENSE +0 -0
- {docling_core-2.31.1.dist-info → docling_core-2.31.2.dist-info}/WHEEL +0 -0
- {docling_core-2.31.1.dist-info → docling_core-2.31.2.dist-info}/entry_points.txt +0 -0
|
@@ -9,6 +9,7 @@ from functools import cached_property
|
|
|
9
9
|
from typing import Any, Iterable, Iterator, Optional, Union
|
|
10
10
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
|
|
12
|
+
from transformers import PreTrainedTokenizerBase
|
|
12
13
|
|
|
13
14
|
from docling_core.transforms.chunker.hierarchical_chunker import (
|
|
14
15
|
ChunkingSerializerProvider,
|
|
@@ -70,23 +71,37 @@ class HybridChunker(BaseChunker):
|
|
|
70
71
|
@model_validator(mode="before")
|
|
71
72
|
@classmethod
|
|
72
73
|
def _patch(cls, data: Any) -> Any:
|
|
73
|
-
if isinstance(data, dict)
|
|
74
|
+
if isinstance(data, dict):
|
|
75
|
+
tokenizer = data.get("tokenizer")
|
|
74
76
|
max_tokens = data.get("max_tokens")
|
|
75
|
-
if isinstance(tokenizer, BaseTokenizer)
|
|
76
|
-
|
|
77
|
-
|
|
77
|
+
if not isinstance(tokenizer, BaseTokenizer) and (
|
|
78
|
+
# some legacy param passed:
|
|
79
|
+
tokenizer is not None
|
|
80
|
+
or max_tokens is not None
|
|
81
|
+
):
|
|
78
82
|
from docling_core.transforms.chunker.tokenizer.huggingface import (
|
|
79
83
|
HuggingFaceTokenizer,
|
|
80
84
|
)
|
|
81
85
|
|
|
86
|
+
warnings.warn(
|
|
87
|
+
"Deprecated initialization parameter types for HybridChunker. "
|
|
88
|
+
"For updated usage check out "
|
|
89
|
+
"https://docling-project.github.io/docling/examples/hybrid_chunking/",
|
|
90
|
+
DeprecationWarning,
|
|
91
|
+
stacklevel=3,
|
|
92
|
+
)
|
|
93
|
+
|
|
82
94
|
if isinstance(tokenizer, str):
|
|
83
95
|
data["tokenizer"] = HuggingFaceTokenizer.from_pretrained(
|
|
84
96
|
model_name=tokenizer,
|
|
85
97
|
max_tokens=max_tokens,
|
|
86
98
|
)
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
99
|
+
elif tokenizer is None or isinstance(
|
|
100
|
+
tokenizer, PreTrainedTokenizerBase
|
|
101
|
+
):
|
|
102
|
+
kwargs = {
|
|
103
|
+
"tokenizer": tokenizer or _get_default_tokenizer().tokenizer
|
|
104
|
+
}
|
|
90
105
|
if max_tokens is not None:
|
|
91
106
|
kwargs["max_tokens"] = max_tokens
|
|
92
107
|
data["tokenizer"] = HuggingFaceTokenizer(**kwargs)
|
|
@@ -20,7 +20,7 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
|
|
|
20
20
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
21
21
|
docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
|
|
22
22
|
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=7Fpwwsn2BoiR12KGPrn8fU1uuhqBLp85MRLMF0aIsL8,8281
|
|
23
|
-
docling_core/transforms/chunker/hybrid_chunker.py,sha256=
|
|
23
|
+
docling_core/transforms/chunker/hybrid_chunker.py,sha256=TsAPI7WmvsMbr7Xc6WGmvwrMf4WHZnEeWxtgaf-S0iM,12530
|
|
24
24
|
docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
|
|
25
25
|
docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
|
|
26
26
|
docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=JQ-D3b5vTPQbvu4HaMfYqFzSBLbV_HnmoBGv7d6Kqn4,2220
|
|
@@ -73,8 +73,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
73
73
|
docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
|
|
74
74
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
75
75
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
76
|
-
docling_core-2.31.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
-
docling_core-2.31.
|
|
78
|
-
docling_core-2.31.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
79
|
-
docling_core-2.31.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
80
|
-
docling_core-2.31.1.dist-info/RECORD,,
|
|
76
|
+
docling_core-2.31.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
+
docling_core-2.31.2.dist-info/METADATA,sha256=Dd_spe0UyIXloLfK2CHHHtHY08eP7SmM2jAkfdPsj8w,5976
|
|
78
|
+
docling_core-2.31.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
79
|
+
docling_core-2.31.2.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
80
|
+
docling_core-2.31.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|