docling-core 2.41.0__py3-none-any.whl → 2.43.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -11,3 +11,4 @@ from docling_core.transforms.chunker.hierarchical_chunker import (
11
11
  DocMeta,
12
12
  HierarchicalChunker,
13
13
  )
14
+ from docling_core.transforms.chunker.page_chunker import PageChunker
@@ -0,0 +1,59 @@
1
+ """Page-based chunker implementation: each chunk corresponds to a single page."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Iterator
6
+
7
+ from pydantic import ConfigDict
8
+ from typing_extensions import override
9
+
10
+ from docling_core.transforms.chunker import BaseChunker, DocChunk, DocMeta
11
+ from docling_core.transforms.chunker.hierarchical_chunker import (
12
+ ChunkingSerializerProvider,
13
+ )
14
+ from docling_core.types import DoclingDocument as DLDocument
15
+
16
+
17
class PageChunker(BaseChunker):
    r"""Chunker that produces one chunk for every page of a document.

    Pages are visited in ascending page-number order, and a page whose
    serialization is empty produces no chunk. A document without any pages is
    serialized as a whole and produces at most one chunk.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Provider of the document serializer that renders the chunk text.
    serializer_provider: ChunkingSerializerProvider = ChunkingSerializerProvider()

    @override
    def chunk(
        self,
        dl_doc: DLDocument,
        **kwargs: Any,
    ) -> Iterator[DocChunk]:
        """Split the given document into page-level chunks.

        Args:
            dl_doc: Document to chunk.
            **kwargs: Not used by this implementation.

        Yields:
            One chunk per page with non-empty serialized text, or a single
            chunk for the whole document when it has no pages.
        """
        doc_serializer = self.serializer_provider.get_serializer(doc=dl_doc)

        if not dl_doc.pages:
            # no page information: the whole document becomes a single chunk
            whole_res = doc_serializer.serialize()
            if whole_res.text:
                yield DocChunk(
                    text=whole_res.text,
                    meta=DocMeta(
                        doc_items=whole_res.get_unique_doc_items(),
                        headings=None,
                        captions=None,
                        origin=dl_doc.origin,
                    ),
                )
            return

        # one serialization pass per page, lowest page number first
        for page_no in sorted(dl_doc.pages):
            page_res = doc_serializer.serialize(pages={page_no})
            if page_res.text:
                yield DocChunk(
                    text=page_res.text,
                    meta=DocMeta(
                        doc_items=page_res.get_unique_doc_items(),
                        headings=None,
                        captions=None,
                        origin=dl_doc.origin,
                    ),
                )
@@ -39,6 +39,16 @@ class SerializationResult(BaseModel):
39
39
  spans: list[Span] = []
40
40
  # group: Optional[GroupItem] = None # set when result reflects specific group item
41
41
 
42
def get_unique_doc_items(self) -> list[DocItem]:
    """Return the doc items behind this result's spans, without duplicates.

    Items are identified by their ``self_ref``; for each reference the first
    item encountered is kept, and the returned list follows the order in
    which references first appear in ``self.spans``.
    """
    first_item_by_ref: dict[str, DocItem] = {}
    for span in self.spans:
        # setdefault keeps the earliest item for a reference seen before
        first_item_by_ref.setdefault(span.item.self_ref, span.item)
    return list(first_item_by_ref.values())
51
+
42
52
 
43
53
  class BaseTextSerializer(ABC):
44
54
  """Base class for text item serializers."""
@@ -285,7 +285,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
285
285
 
286
286
  def _serialize_body(self, **kwargs) -> SerializationResult:
287
287
  """Serialize the document body."""
288
- subparts = self.get_parts()
288
+ subparts = self.get_parts(**kwargs)
289
289
  res = self.serialize_doc(parts=subparts, **kwargs)
290
290
  return res
291
291
 
@@ -1,7 +1,7 @@
1
1
  """Models for the base data types."""
2
2
 
3
3
  from enum import Enum
4
- from typing import List, Tuple
4
+ from typing import Any, List, Tuple
5
5
 
6
6
  from pydantic import BaseModel, FieldSerializationInfo, field_serializer
7
7
 
@@ -21,16 +21,23 @@ class CoordOrigin(str, Enum):
21
21
  BOTTOMLEFT = "BOTTOMLEFT"
22
22
 
23
23
 
24
- _CTX_COORD_PREC = "coord_prec"
24
class PydanticSerCtxKey(str, Enum):
    """Pydantic serialization context keys.

    Each member's value is the name under which a rounding precision is
    looked up in the serialization context.
    """

    COORD_PREC = "coord_prec"  # key for coordinates precision
    CONFID_PREC = "confid_prec"  # key for confidence values precision


def round_pydantic_float(
    val: float, ctx: Any, precision_ctx_key: PydanticSerCtxKey
) -> float:
    """Round float, provided the precision is available in the context.

    Args:
        val: Value to round.
        ctx: Serialization context. A dict is queried by key; any other
            object (including ``None``) is queried for an attribute of the
            same name.
        precision_ctx_key: Identifies which precision entry to look up.

    Returns:
        ``val`` rounded to the configured number of digits when the context
        provides an integer precision, otherwise ``val`` unchanged.
    """
    key = precision_ctx_key.value
    if isinstance(ctx, dict):
        precision = ctx.get(key)
    else:
        precision = getattr(ctx, key, None)
    # bool is a subclass of int: a stray True/False in the context must not
    # be interpreted as a digit count (round(val, True) rounds to 1 digit)
    if isinstance(precision, int) and not isinstance(precision, bool):
        return round(val, precision)
    return val
34
41
 
35
42
 
36
43
  class Size(BaseModel):
@@ -41,7 +48,7 @@ class Size(BaseModel):
41
48
 
42
49
@field_serializer("width", "height")
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
    # Round to the coordinate precision found in the serialization context;
    # the value passes through unchanged when no integer precision is set.
    ser_ctx = info.context
    return round_pydantic_float(value, ser_ctx, PydanticSerCtxKey.COORD_PREC)
45
52
 
46
53
  def as_tuple(self):
47
54
  """as_tuple."""
@@ -70,7 +77,7 @@ class BoundingBox(BaseModel):
70
77
 
71
78
@field_serializer("l", "t", "r", "b")
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
    # Round to the coordinate precision found in the serialization context;
    # the value passes through unchanged when no integer precision is set.
    ser_ctx = info.context
    return round_pydantic_float(value, ser_ctx, PydanticSerCtxKey.COORD_PREC)
74
81
 
75
82
  def resize_by_scale(self, x_scale: float, y_scale: float):
76
83
  """resize_by_scale."""