docling-core: docling_core-2.31.0-py3-none-any.whl → docling_core-2.31.2-py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/hybrid_chunker.py +22 -7
- docling_core/transforms/serializer/markdown.py +2 -2
- {docling_core-2.31.0.dist-info → docling_core-2.31.2.dist-info}/METADATA +1 -1
- {docling_core-2.31.0.dist-info → docling_core-2.31.2.dist-info}/RECORD +7 -7
- {docling_core-2.31.0.dist-info → docling_core-2.31.2.dist-info}/LICENSE +0 -0
- {docling_core-2.31.0.dist-info → docling_core-2.31.2.dist-info}/WHEEL +0 -0
- {docling_core-2.31.0.dist-info → docling_core-2.31.2.dist-info}/entry_points.txt +0 -0
|
@@ -9,6 +9,7 @@ from functools import cached_property
|
|
|
9
9
|
from typing import Any, Iterable, Iterator, Optional, Union
|
|
10
10
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
|
|
12
|
+
from transformers import PreTrainedTokenizerBase
|
|
12
13
|
|
|
13
14
|
from docling_core.transforms.chunker.hierarchical_chunker import (
|
|
14
15
|
ChunkingSerializerProvider,
|
|
@@ -70,23 +71,37 @@ class HybridChunker(BaseChunker):
|
|
|
70
71
|
@model_validator(mode="before")
|
|
71
72
|
@classmethod
|
|
72
73
|
def _patch(cls, data: Any) -> Any:
|
|
73
|
-
if isinstance(data, dict)
|
|
74
|
+
if isinstance(data, dict):
|
|
75
|
+
tokenizer = data.get("tokenizer")
|
|
74
76
|
max_tokens = data.get("max_tokens")
|
|
75
|
-
if isinstance(tokenizer, BaseTokenizer)
|
|
76
|
-
|
|
77
|
-
|
|
77
|
+
if not isinstance(tokenizer, BaseTokenizer) and (
|
|
78
|
+
# some legacy param passed:
|
|
79
|
+
tokenizer is not None
|
|
80
|
+
or max_tokens is not None
|
|
81
|
+
):
|
|
78
82
|
from docling_core.transforms.chunker.tokenizer.huggingface import (
|
|
79
83
|
HuggingFaceTokenizer,
|
|
80
84
|
)
|
|
81
85
|
|
|
86
|
+
warnings.warn(
|
|
87
|
+
"Deprecated initialization parameter types for HybridChunker. "
|
|
88
|
+
"For updated usage check out "
|
|
89
|
+
"https://docling-project.github.io/docling/examples/hybrid_chunking/",
|
|
90
|
+
DeprecationWarning,
|
|
91
|
+
stacklevel=3,
|
|
92
|
+
)
|
|
93
|
+
|
|
82
94
|
if isinstance(tokenizer, str):
|
|
83
95
|
data["tokenizer"] = HuggingFaceTokenizer.from_pretrained(
|
|
84
96
|
model_name=tokenizer,
|
|
85
97
|
max_tokens=max_tokens,
|
|
86
98
|
)
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
99
|
+
elif tokenizer is None or isinstance(
|
|
100
|
+
tokenizer, PreTrainedTokenizerBase
|
|
101
|
+
):
|
|
102
|
+
kwargs = {
|
|
103
|
+
"tokenizer": tokenizer or _get_default_tokenizer().tokenizer
|
|
104
|
+
}
|
|
90
105
|
if max_tokens is not None:
|
|
91
106
|
kwargs["max_tokens"] = max_tokens
|
|
92
107
|
data["tokenizer"] = HuggingFaceTokenizer(**kwargs)
|
|
@@ -535,7 +535,7 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
535
535
|
) -> SerializationResult:
|
|
536
536
|
"""Serialize a document out of its parts."""
|
|
537
537
|
text_res = "\n\n".join([p.text for p in parts if p.text])
|
|
538
|
-
if self.
|
|
538
|
+
if self.requires_page_break():
|
|
539
539
|
page_sep = self.params.page_break_placeholder or ""
|
|
540
540
|
for full_match, _, _ in self._get_page_breaks(text=text_res):
|
|
541
541
|
text_res = text_res.replace(full_match, page_sep)
|
|
@@ -543,6 +543,6 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
543
543
|
return create_ser_result(text=text_res, span_source=parts)
|
|
544
544
|
|
|
545
545
|
@override
|
|
546
|
-
def requires_page_break(self):
|
|
546
|
+
def requires_page_break(self) -> bool:
|
|
547
547
|
"""Whether to add page breaks."""
|
|
548
548
|
return self.params.page_break_placeholder is not None
|
|
@@ -20,7 +20,7 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
|
|
|
20
20
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
21
21
|
docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
|
|
22
22
|
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=7Fpwwsn2BoiR12KGPrn8fU1uuhqBLp85MRLMF0aIsL8,8281
|
|
23
|
-
docling_core/transforms/chunker/hybrid_chunker.py,sha256=
|
|
23
|
+
docling_core/transforms/chunker/hybrid_chunker.py,sha256=TsAPI7WmvsMbr7Xc6WGmvwrMf4WHZnEeWxtgaf-S0iM,12530
|
|
24
24
|
docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
|
|
25
25
|
docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
|
|
26
26
|
docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=JQ-D3b5vTPQbvu4HaMfYqFzSBLbV_HnmoBGv7d6Kqn4,2220
|
|
@@ -31,7 +31,7 @@ docling_core/transforms/serializer/common.py,sha256=TC1EwHIp9PYcI8jeTKeavUAPtoun
|
|
|
31
31
|
docling_core/transforms/serializer/doctags.py,sha256=mEmRWVuebcG5pZcR1_HX146cyUk0_FjaLQtMXSgh9hs,17870
|
|
32
32
|
docling_core/transforms/serializer/html.py,sha256=Xq9CU5qZTDdwstizYqWNL_TFNDs9NHK_6JvvZk0TP98,34571
|
|
33
33
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
34
|
-
docling_core/transforms/serializer/markdown.py,sha256=
|
|
34
|
+
docling_core/transforms/serializer/markdown.py,sha256=4thokWJIaF3dvpchjp-Y7NTSzUuXwTmfNey4MQj-c5I,17873
|
|
35
35
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
36
36
|
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
37
37
|
docling_core/transforms/visualizer/layout_visualizer.py,sha256=ulXxWGIl69-HMKDPFk_XKgNCgQeDNc969PVt_X0-drA,7823
|
|
@@ -73,8 +73,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
73
73
|
docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
|
|
74
74
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
75
75
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
76
|
-
docling_core-2.31.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
-
docling_core-2.31.
|
|
78
|
-
docling_core-2.31.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
79
|
-
docling_core-2.31.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
80
|
-
docling_core-2.31.0.dist-info/RECORD,,
|
|
76
|
+
docling_core-2.31.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
+
docling_core-2.31.2.dist-info/METADATA,sha256=Dd_spe0UyIXloLfK2CHHHtHY08eP7SmM2jAkfdPsj8w,5976
|
|
78
|
+
docling_core-2.31.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
79
|
+
docling_core-2.31.2.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
80
|
+
docling_core-2.31.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|