docling-core 2.31.0__tar.gz → 2.31.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (79)
  1. {docling_core-2.31.0 → docling_core-2.31.2}/PKG-INFO +1 -1
  2. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/chunker/hybrid_chunker.py +22 -7
  3. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/serializer/markdown.py +2 -2
  4. {docling_core-2.31.0 → docling_core-2.31.2}/pyproject.toml +1 -1
  5. {docling_core-2.31.0 → docling_core-2.31.2}/LICENSE +0 -0
  6. {docling_core-2.31.0 → docling_core-2.31.2}/README.md +0 -0
  7. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/__init__.py +0 -0
  8. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/cli/__init__.py +0 -0
  9. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/cli/view.py +0 -0
  10. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/experimental/__init__.py +0 -0
  11. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/py.typed +0 -0
  12. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
  13. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
  14. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  15. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
  16. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  17. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  18. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  19. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  20. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/search/__init__.py +0 -0
  21. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  22. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/search/mapping.py +0 -0
  23. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/search/meta.py +0 -0
  24. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/search/package.py +0 -0
  25. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/__init__.py +0 -0
  26. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/chunker/__init__.py +0 -0
  27. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/chunker/base.py +0 -0
  28. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  29. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  30. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  31. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  32. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  33. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/serializer/__init__.py +0 -0
  34. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/serializer/base.py +0 -0
  35. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/serializer/common.py +0 -0
  36. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/serializer/doctags.py +0 -0
  37. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/serializer/html.py +0 -0
  38. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/serializer/html_styles.py +0 -0
  39. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/visualizer/__init__.py +0 -0
  40. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/visualizer/base.py +0 -0
  41. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  42. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  43. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/__init__.py +0 -0
  44. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/base.py +0 -0
  45. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/doc/__init__.py +0 -0
  46. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/doc/base.py +0 -0
  47. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/doc/document.py +0 -0
  48. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/doc/labels.py +0 -0
  49. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/doc/page.py +0 -0
  50. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/doc/tokens.py +0 -0
  51. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/doc/utils.py +0 -0
  52. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/gen/__init__.py +0 -0
  53. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/gen/generic.py +0 -0
  54. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/io/__init__.py +0 -0
  55. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/legacy_doc/__init__.py +0 -0
  56. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/legacy_doc/base.py +0 -0
  57. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  58. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  59. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  60. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/legacy_doc/document.py +0 -0
  61. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/legacy_doc/tokens.py +0 -0
  62. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/nlp/__init__.py +0 -0
  63. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/nlp/qa.py +0 -0
  64. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/nlp/qa_labels.py +0 -0
  65. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/rec/__init__.py +0 -0
  66. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/rec/attribute.py +0 -0
  67. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/rec/base.py +0 -0
  68. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/rec/predicate.py +0 -0
  69. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/rec/record.py +0 -0
  70. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/rec/statement.py +0 -0
  71. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/types/rec/subject.py +0 -0
  72. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/utils/__init__.py +0 -0
  73. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/utils/alias.py +0 -0
  74. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/utils/file.py +0 -0
  75. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/utils/generate_docs.py +0 -0
  76. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/utils/generate_jsonschema.py +0 -0
  77. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/utils/legacy.py +0 -0
  78. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/utils/validate.py +0 -0
  79. {docling_core-2.31.0 → docling_core-2.31.2}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.31.0
3
+ Version: 2.31.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -9,6 +9,7 @@ from functools import cached_property
9
9
  from typing import Any, Iterable, Iterator, Optional, Union
10
10
 
11
11
  from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
12
+ from transformers import PreTrainedTokenizerBase
12
13
 
13
14
  from docling_core.transforms.chunker.hierarchical_chunker import (
14
15
  ChunkingSerializerProvider,
@@ -70,23 +71,37 @@ class HybridChunker(BaseChunker):
70
71
  @model_validator(mode="before")
71
72
  @classmethod
72
73
  def _patch(cls, data: Any) -> Any:
73
- if isinstance(data, dict) and (tokenizer := data.get("tokenizer")):
74
+ if isinstance(data, dict):
75
+ tokenizer = data.get("tokenizer")
74
76
  max_tokens = data.get("max_tokens")
75
- if isinstance(tokenizer, BaseTokenizer):
76
- pass
77
- else:
77
+ if not isinstance(tokenizer, BaseTokenizer) and (
78
+ # some legacy param passed:
79
+ tokenizer is not None
80
+ or max_tokens is not None
81
+ ):
78
82
  from docling_core.transforms.chunker.tokenizer.huggingface import (
79
83
  HuggingFaceTokenizer,
80
84
  )
81
85
 
86
+ warnings.warn(
87
+ "Deprecated initialization parameter types for HybridChunker. "
88
+ "For updated usage check out "
89
+ "https://docling-project.github.io/docling/examples/hybrid_chunking/",
90
+ DeprecationWarning,
91
+ stacklevel=3,
92
+ )
93
+
82
94
  if isinstance(tokenizer, str):
83
95
  data["tokenizer"] = HuggingFaceTokenizer.from_pretrained(
84
96
  model_name=tokenizer,
85
97
  max_tokens=max_tokens,
86
98
  )
87
- else:
88
- # migrate previous HF-based tokenizers
89
- kwargs = {"tokenizer": tokenizer}
99
+ elif tokenizer is None or isinstance(
100
+ tokenizer, PreTrainedTokenizerBase
101
+ ):
102
+ kwargs = {
103
+ "tokenizer": tokenizer or _get_default_tokenizer().tokenizer
104
+ }
90
105
  if max_tokens is not None:
91
106
  kwargs["max_tokens"] = max_tokens
92
107
  data["tokenizer"] = HuggingFaceTokenizer(**kwargs)
@@ -535,7 +535,7 @@ class MarkdownDocSerializer(DocSerializer):
535
535
  ) -> SerializationResult:
536
536
  """Serialize a document out of its parts."""
537
537
  text_res = "\n\n".join([p.text for p in parts if p.text])
538
- if self.params.page_break_placeholder:
538
+ if self.requires_page_break():
539
539
  page_sep = self.params.page_break_placeholder or ""
540
540
  for full_match, _, _ in self._get_page_breaks(text=text_res):
541
541
  text_res = text_res.replace(full_match, page_sep)
@@ -543,6 +543,6 @@ class MarkdownDocSerializer(DocSerializer):
543
543
  return create_ser_result(text=text_res, span_source=parts)
544
544
 
545
545
  @override
546
- def requires_page_break(self):
546
+ def requires_page_break(self) -> bool:
547
547
  """Whether to add page breaks."""
548
548
  return self.params.page_break_placeholder is not None
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.31.0"
3
+ version = "2.31.2"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes