docling-core 2.42.0__py3-none-any.whl → 2.43.0__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -11,3 +11,4 @@ from docling_core.transforms.chunker.hierarchical_chunker import (
11
11
  DocMeta,
12
12
  HierarchicalChunker,
13
13
  )
14
+ from docling_core.transforms.chunker.page_chunker import PageChunker
@@ -0,0 +1,59 @@
1
+ """Page-based chunker implementation: each chunk corresponds to a single page."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Iterator
6
+
7
+ from pydantic import ConfigDict
8
+ from typing_extensions import override
9
+
10
+ from docling_core.transforms.chunker import BaseChunker, DocChunk, DocMeta
11
+ from docling_core.transforms.chunker.hierarchical_chunker import (
12
+ ChunkingSerializerProvider,
13
+ )
14
+ from docling_core.types import DoclingDocument as DLDocument
15
+
16
+
17
+ class PageChunker(BaseChunker):
18
+ r"""Chunker implementation that yields one chunk per page."""
19
+
20
+ model_config = ConfigDict(arbitrary_types_allowed=True)
21
+
22
+ serializer_provider: ChunkingSerializerProvider = ChunkingSerializerProvider()
23
+
24
+ @override
25
+ def chunk(
26
+ self,
27
+ dl_doc: DLDocument,
28
+ **kwargs: Any,
29
+ ) -> Iterator[DocChunk]:
30
+ """Chunk the provided document by page."""
31
+ my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
32
+ if dl_doc.pages:
33
+ # chunk by page
34
+ for page_no in sorted(dl_doc.pages.keys()):
35
+ ser_res = my_doc_ser.serialize(pages={page_no})
36
+ if not ser_res.text:
37
+ continue
38
+ yield DocChunk(
39
+ text=ser_res.text,
40
+ meta=DocMeta(
41
+ doc_items=ser_res.get_unique_doc_items(),
42
+ headings=None,
43
+ captions=None,
44
+ origin=dl_doc.origin,
45
+ ),
46
+ )
47
+ else:
48
+ # if no pages, treat whole document as single chunk
49
+ ser_res = my_doc_ser.serialize()
50
+ if ser_res.text:
51
+ yield DocChunk(
52
+ text=ser_res.text,
53
+ meta=DocMeta(
54
+ doc_items=ser_res.get_unique_doc_items(),
55
+ headings=None,
56
+ captions=None,
57
+ origin=dl_doc.origin,
58
+ ),
59
+ )
@@ -39,6 +39,16 @@ class SerializationResult(BaseModel):
39
39
  spans: list[Span] = []
40
40
  # group: Optional[GroupItem] = None # set when result reflects specific group item
41
41
 
42
+ def get_unique_doc_items(self) -> list[DocItem]:
43
+ """Get the doc items corresponding to this result."""
44
+ seen_doc_item_refs: set[str] = set()
45
+ doc_items: list[DocItem] = []
46
+ for span in self.spans:
47
+ if span.item.self_ref not in seen_doc_item_refs:
48
+ seen_doc_item_refs.add(span.item.self_ref)
49
+ doc_items.append(span.item)
50
+ return doc_items
51
+
42
52
 
43
53
  class BaseTextSerializer(ABC):
44
54
  """Base class for text item serializers."""
@@ -285,7 +285,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
285
285
 
286
286
  def _serialize_body(self, **kwargs) -> SerializationResult:
287
287
  """Serialize the document body."""
288
- subparts = self.get_parts()
288
+ subparts = self.get_parts(**kwargs)
289
289
  res = self.serialize_doc(parts=subparts, **kwargs)
290
290
  return res
291
291
 
@@ -4098,7 +4098,10 @@ class DoclingDocument(BaseModel):
4098
4098
  return result
4099
4099
 
4100
4100
  def _with_pictures_refs(
4101
- self, image_dir: Path, reference_path: Optional[Path] = None
4101
+ self,
4102
+ image_dir: Path,
4103
+ page_no: Optional[int],
4104
+ reference_path: Optional[Path] = None,
4102
4105
  ) -> "DoclingDocument":
4103
4106
  """Document with images as refs.
4104
4107
 
@@ -4111,7 +4114,7 @@ class DoclingDocument(BaseModel):
4111
4114
  image_dir.mkdir(parents=True, exist_ok=True)
4112
4115
 
4113
4116
  if image_dir.is_dir():
4114
- for item, level in result.iterate_items(with_groups=False):
4117
+ for item, level in result.iterate_items(page_no=page_no, with_groups=False):
4115
4118
  if isinstance(item, PictureItem):
4116
4119
 
4117
4120
  if (
@@ -4211,7 +4214,7 @@ class DoclingDocument(BaseModel):
4211
4214
  os.makedirs(artifacts_dir, exist_ok=True)
4212
4215
 
4213
4216
  new_doc = self._make_copy_with_refmode(
4214
- artifacts_dir, image_mode, reference_path=reference_path
4217
+ artifacts_dir, image_mode, page_no=None, reference_path=reference_path
4215
4218
  )
4216
4219
 
4217
4220
  out = new_doc.export_to_dict(
@@ -4254,7 +4257,7 @@ class DoclingDocument(BaseModel):
4254
4257
  os.makedirs(artifacts_dir, exist_ok=True)
4255
4258
 
4256
4259
  new_doc = self._make_copy_with_refmode(
4257
- artifacts_dir, image_mode, reference_path=reference_path
4260
+ artifacts_dir, image_mode, page_no=None, reference_path=reference_path
4258
4261
  )
4259
4262
 
4260
4263
  out = new_doc.export_to_dict(
@@ -4327,7 +4330,7 @@ class DoclingDocument(BaseModel):
4327
4330
  os.makedirs(artifacts_dir, exist_ok=True)
4328
4331
 
4329
4332
  new_doc = self._make_copy_with_refmode(
4330
- artifacts_dir, image_mode, reference_path=reference_path
4333
+ artifacts_dir, image_mode, page_no, reference_path=reference_path
4331
4334
  )
4332
4335
 
4333
4336
  md_out = new_doc.export_to_markdown(
@@ -4503,7 +4506,7 @@ class DoclingDocument(BaseModel):
4503
4506
  os.makedirs(artifacts_dir, exist_ok=True)
4504
4507
 
4505
4508
  new_doc = self._make_copy_with_refmode(
4506
- artifacts_dir, image_mode, reference_path=reference_path
4509
+ artifacts_dir, image_mode, page_no, reference_path=reference_path
4507
4510
  )
4508
4511
 
4509
4512
  html_out = new_doc.export_to_html(
@@ -4542,6 +4545,7 @@ class DoclingDocument(BaseModel):
4542
4545
  self,
4543
4546
  artifacts_dir: Path,
4544
4547
  image_mode: ImageRefMode,
4548
+ page_no: Optional[int],
4545
4549
  reference_path: Optional[Path] = None,
4546
4550
  ):
4547
4551
  new_doc = None
@@ -4549,7 +4553,7 @@ class DoclingDocument(BaseModel):
4549
4553
  new_doc = self
4550
4554
  elif image_mode == ImageRefMode.REFERENCED:
4551
4555
  new_doc = self._with_pictures_refs(
4552
- image_dir=artifacts_dir, reference_path=reference_path
4556
+ image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path
4553
4557
  )
4554
4558
  elif image_mode == ImageRefMode.EMBEDDED:
4555
4559
  new_doc = self._with_embedded_pictures()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.42.0
3
+ Version: 2.43.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -17,17 +17,18 @@ docling_core/search/mapping.py,sha256=6rqG7LgYSeWmooKNEcRa5gFDLp1ZdzPqDGlwTA5gpO
17
17
  docling_core/search/meta.py,sha256=wSurrsqdP1N3gQKx027fVdzVmc33a7Y6rPl-FClQvtA,3318
18
18
  docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75A,1841
19
19
  docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
20
- docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
20
+ docling_core/transforms/chunker/__init__.py,sha256=Qg5RhC-2QqdXKEfjzNGJaVi0NqBCL3xAhKWJGOlrE3M,375
21
21
  docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
22
22
  docling_core/transforms/chunker/hierarchical_chunker.py,sha256=uDf-qGiIT_4JUEg9NOdzvDqAPOTqycKJ-jEpDkV3jJU,8243
23
23
  docling_core/transforms/chunker/hybrid_chunker.py,sha256=xjkz8hy3tXXzkJzf7QMFOEq_v8V7Jcs9tCY0Mxjge74,12548
24
+ docling_core/transforms/chunker/page_chunker.py,sha256=gLUlqA_klK-rkuPVYuJKi3ZuTIGdd2HD7ces72AiZ2U,2018
24
25
  docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
25
26
  docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
26
27
  docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZw3SBCoqJHM2Ihb65eiM29O9BR6o,2506
27
28
  docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
28
29
  docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
29
- docling_core/transforms/serializer/base.py,sha256=s3Anl_3-QJM1t29Bz-iOgLhAcfG3BZuwZqdYTi5Xfr0,6846
30
- docling_core/transforms/serializer/common.py,sha256=Dkw9axJqU2qlZuEFRDa6Av11PIL2ejOOOCAahtoK9sA,19106
30
+ docling_core/transforms/serializer/base.py,sha256=TI8Epj7gyxdTet9j-Rs4o5U09gfACfAIVoirlschviM,7266
31
+ docling_core/transforms/serializer/common.py,sha256=0TNEGoA_rJ-qkVYp-X8SMUr3jTrbf6TRzPzwufYh5JM,19114
31
32
  docling_core/transforms/serializer/doctags.py,sha256=TD0yAm1qSVy-GsE6svpUAI-Yqjcf2rrTZ3ac9YU3gbE,19858
32
33
  docling_core/transforms/serializer/html.py,sha256=oxnUhszRPBINiK1tq2dwf5QjTCrIV_q15vsrPVqBeME,38988
33
34
  docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
@@ -41,7 +42,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
41
42
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
42
43
  docling_core/types/doc/__init__.py,sha256=8hOhm5W9mArf3zwgfoMxDs1pHizhLFSAZlLu1tPBBRk,1641
43
44
  docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
44
- docling_core/types/doc/document.py,sha256=ShxqS9A9N1KhVS73C7eoTBo0WgCI-TRbjwW8frhFEns,199154
45
+ docling_core/types/doc/document.py,sha256=SUqIJ-huO3ELLRdCMUYjkkXHeGGkeY2oaOWFQ1nq5lg,199315
45
46
  docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
46
47
  docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
47
48
  docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
@@ -74,9 +75,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
74
75
  docling_core/utils/legacy.py,sha256=5lghO48OEcV9V51tRnH3YSKgLtdqhr-Q5C_OcJZ8TOs,24392
75
76
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
76
77
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
77
- docling_core-2.42.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
78
- docling_core-2.42.0.dist-info/METADATA,sha256=R7dpA43x-3YVrXztgu3_Pt87SWRrjaeBW0jXPndawdw,6453
79
- docling_core-2.42.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
80
- docling_core-2.42.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
81
- docling_core-2.42.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
82
- docling_core-2.42.0.dist-info/RECORD,,
78
+ docling_core-2.43.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
79
+ docling_core-2.43.0.dist-info/METADATA,sha256=4aQkJub9YsHu2OrYq4Y2vIlBYHcOCC-ruxCN6EnWbW4,6453
80
+ docling_core-2.43.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
81
+ docling_core-2.43.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
82
+ docling_core-2.43.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
83
+ docling_core-2.43.0.dist-info/RECORD,,