docling-core 2.31.1__py3-none-any.whl → 2.31.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -9,6 +9,7 @@ from functools import cached_property
9
9
  from typing import Any, Iterable, Iterator, Optional, Union
10
10
 
11
11
  from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
12
+ from transformers import PreTrainedTokenizerBase
12
13
 
13
14
  from docling_core.transforms.chunker.hierarchical_chunker import (
14
15
  ChunkingSerializerProvider,
@@ -70,23 +71,37 @@ class HybridChunker(BaseChunker):
70
71
  @model_validator(mode="before")
71
72
  @classmethod
72
73
  def _patch(cls, data: Any) -> Any:
73
- if isinstance(data, dict) and (tokenizer := data.get("tokenizer")):
74
+ if isinstance(data, dict):
75
+ tokenizer = data.get("tokenizer")
74
76
  max_tokens = data.get("max_tokens")
75
- if isinstance(tokenizer, BaseTokenizer):
76
- pass
77
- else:
77
+ if not isinstance(tokenizer, BaseTokenizer) and (
78
+ # some legacy param passed:
79
+ tokenizer is not None
80
+ or max_tokens is not None
81
+ ):
78
82
  from docling_core.transforms.chunker.tokenizer.huggingface import (
79
83
  HuggingFaceTokenizer,
80
84
  )
81
85
 
86
+ warnings.warn(
87
+ "Deprecated initialization parameter types for HybridChunker. "
88
+ "For updated usage check out "
89
+ "https://docling-project.github.io/docling/examples/hybrid_chunking/",
90
+ DeprecationWarning,
91
+ stacklevel=3,
92
+ )
93
+
82
94
  if isinstance(tokenizer, str):
83
95
  data["tokenizer"] = HuggingFaceTokenizer.from_pretrained(
84
96
  model_name=tokenizer,
85
97
  max_tokens=max_tokens,
86
98
  )
87
- else:
88
- # migrate previous HF-based tokenizers
89
- kwargs = {"tokenizer": tokenizer}
99
+ elif tokenizer is None or isinstance(
100
+ tokenizer, PreTrainedTokenizerBase
101
+ ):
102
+ kwargs = {
103
+ "tokenizer": tokenizer or _get_default_tokenizer().tokenizer
104
+ }
90
105
  if max_tokens is not None:
91
106
  kwargs["max_tokens"] = max_tokens
92
107
  data["tokenizer"] = HuggingFaceTokenizer(**kwargs)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.31.1
3
+ Version: 2.31.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -20,7 +20,7 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
20
20
  docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
21
21
  docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
22
22
  docling_core/transforms/chunker/hierarchical_chunker.py,sha256=7Fpwwsn2BoiR12KGPrn8fU1uuhqBLp85MRLMF0aIsL8,8281
23
- docling_core/transforms/chunker/hybrid_chunker.py,sha256=67Whij6zSPZbVQA-fToyBtTfLtDK6BdnZ-Mhlz0p8ZQ,11886
23
+ docling_core/transforms/chunker/hybrid_chunker.py,sha256=TsAPI7WmvsMbr7Xc6WGmvwrMf4WHZnEeWxtgaf-S0iM,12530
24
24
  docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
25
25
  docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
26
26
  docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=JQ-D3b5vTPQbvu4HaMfYqFzSBLbV_HnmoBGv7d6Kqn4,2220
@@ -73,8 +73,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
73
73
  docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
74
74
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
75
75
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
76
- docling_core-2.31.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
77
- docling_core-2.31.1.dist-info/METADATA,sha256=O13NvxzbHR0wUzP_3yQbOFqSI63LUvCzWvXEkiqQePY,5976
78
- docling_core-2.31.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
79
- docling_core-2.31.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
80
- docling_core-2.31.1.dist-info/RECORD,,
76
+ docling_core-2.31.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
77
+ docling_core-2.31.2.dist-info/METADATA,sha256=Dd_spe0UyIXloLfK2CHHHtHY08eP7SmM2jAkfdPsj8w,5976
78
+ docling_core-2.31.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
79
+ docling_core-2.31.2.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
80
+ docling_core-2.31.2.dist-info/RECORD,,