docling-core 2.31.0__py3-none-any.whl → 2.31.2__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -9,6 +9,7 @@ from functools import cached_property
9
9
  from typing import Any, Iterable, Iterator, Optional, Union
10
10
 
11
11
  from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
12
+ from transformers import PreTrainedTokenizerBase
12
13
 
13
14
  from docling_core.transforms.chunker.hierarchical_chunker import (
14
15
  ChunkingSerializerProvider,
@@ -70,23 +71,37 @@ class HybridChunker(BaseChunker):
70
71
  @model_validator(mode="before")
71
72
  @classmethod
72
73
  def _patch(cls, data: Any) -> Any:
73
- if isinstance(data, dict) and (tokenizer := data.get("tokenizer")):
74
+ if isinstance(data, dict):
75
+ tokenizer = data.get("tokenizer")
74
76
  max_tokens = data.get("max_tokens")
75
- if isinstance(tokenizer, BaseTokenizer):
76
- pass
77
- else:
77
+ if not isinstance(tokenizer, BaseTokenizer) and (
78
+ # some legacy param passed:
79
+ tokenizer is not None
80
+ or max_tokens is not None
81
+ ):
78
82
  from docling_core.transforms.chunker.tokenizer.huggingface import (
79
83
  HuggingFaceTokenizer,
80
84
  )
81
85
 
86
+ warnings.warn(
87
+ "Deprecated initialization parameter types for HybridChunker. "
88
+ "For updated usage check out "
89
+ "https://docling-project.github.io/docling/examples/hybrid_chunking/",
90
+ DeprecationWarning,
91
+ stacklevel=3,
92
+ )
93
+
82
94
  if isinstance(tokenizer, str):
83
95
  data["tokenizer"] = HuggingFaceTokenizer.from_pretrained(
84
96
  model_name=tokenizer,
85
97
  max_tokens=max_tokens,
86
98
  )
87
- else:
88
- # migrate previous HF-based tokenizers
89
- kwargs = {"tokenizer": tokenizer}
99
+ elif tokenizer is None or isinstance(
100
+ tokenizer, PreTrainedTokenizerBase
101
+ ):
102
+ kwargs = {
103
+ "tokenizer": tokenizer or _get_default_tokenizer().tokenizer
104
+ }
90
105
  if max_tokens is not None:
91
106
  kwargs["max_tokens"] = max_tokens
92
107
  data["tokenizer"] = HuggingFaceTokenizer(**kwargs)
@@ -535,7 +535,7 @@ class MarkdownDocSerializer(DocSerializer):
535
535
  ) -> SerializationResult:
536
536
  """Serialize a document out of its parts."""
537
537
  text_res = "\n\n".join([p.text for p in parts if p.text])
538
- if self.params.page_break_placeholder:
538
+ if self.requires_page_break():
539
539
  page_sep = self.params.page_break_placeholder or ""
540
540
  for full_match, _, _ in self._get_page_breaks(text=text_res):
541
541
  text_res = text_res.replace(full_match, page_sep)
@@ -543,6 +543,6 @@ class MarkdownDocSerializer(DocSerializer):
543
543
  return create_ser_result(text=text_res, span_source=parts)
544
544
 
545
545
  @override
546
- def requires_page_break(self):
546
+ def requires_page_break(self) -> bool:
547
547
  """Whether to add page breaks."""
548
548
  return self.params.page_break_placeholder is not None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.31.0
3
+ Version: 2.31.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -20,7 +20,7 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
20
20
  docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
21
21
  docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
22
22
  docling_core/transforms/chunker/hierarchical_chunker.py,sha256=7Fpwwsn2BoiR12KGPrn8fU1uuhqBLp85MRLMF0aIsL8,8281
23
- docling_core/transforms/chunker/hybrid_chunker.py,sha256=67Whij6zSPZbVQA-fToyBtTfLtDK6BdnZ-Mhlz0p8ZQ,11886
23
+ docling_core/transforms/chunker/hybrid_chunker.py,sha256=TsAPI7WmvsMbr7Xc6WGmvwrMf4WHZnEeWxtgaf-S0iM,12530
24
24
  docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
25
25
  docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
26
26
  docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=JQ-D3b5vTPQbvu4HaMfYqFzSBLbV_HnmoBGv7d6Kqn4,2220
@@ -31,7 +31,7 @@ docling_core/transforms/serializer/common.py,sha256=TC1EwHIp9PYcI8jeTKeavUAPtoun
31
31
  docling_core/transforms/serializer/doctags.py,sha256=mEmRWVuebcG5pZcR1_HX146cyUk0_FjaLQtMXSgh9hs,17870
32
32
  docling_core/transforms/serializer/html.py,sha256=Xq9CU5qZTDdwstizYqWNL_TFNDs9NHK_6JvvZk0TP98,34571
33
33
  docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
34
- docling_core/transforms/serializer/markdown.py,sha256=YqThAYMsOWSg6nZnnmrUHZohn0QvfZzRqpLrB-Keev8,17873
34
+ docling_core/transforms/serializer/markdown.py,sha256=4thokWJIaF3dvpchjp-Y7NTSzUuXwTmfNey4MQj-c5I,17873
35
35
  docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
36
36
  docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
37
37
  docling_core/transforms/visualizer/layout_visualizer.py,sha256=ulXxWGIl69-HMKDPFk_XKgNCgQeDNc969PVt_X0-drA,7823
@@ -73,8 +73,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
73
73
  docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
74
74
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
75
75
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
76
- docling_core-2.31.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
77
- docling_core-2.31.0.dist-info/METADATA,sha256=OohNxPwKcbRVVKm_kpa3HRFhYh9ZMVyBIlf3apF9hm4,5976
78
- docling_core-2.31.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
79
- docling_core-2.31.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
80
- docling_core-2.31.0.dist-info/RECORD,,
76
+ docling_core-2.31.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
77
+ docling_core-2.31.2.dist-info/METADATA,sha256=Dd_spe0UyIXloLfK2CHHHtHY08eP7SmM2jAkfdPsj8w,5976
78
+ docling_core-2.31.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
79
+ docling_core-2.31.2.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
80
+ docling_core-2.31.2.dist-info/RECORD,,