docling-core 2.42.0__py3-none-any.whl → 2.43.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/__init__.py +1 -0
- docling_core/transforms/chunker/page_chunker.py +59 -0
- docling_core/transforms/serializer/base.py +10 -0
- docling_core/transforms/serializer/common.py +1 -1
- docling_core/types/doc/document.py +11 -7
- {docling_core-2.42.0.dist-info → docling_core-2.43.0.dist-info}/METADATA +1 -1
- {docling_core-2.42.0.dist-info → docling_core-2.43.0.dist-info}/RECORD +11 -10
- {docling_core-2.42.0.dist-info → docling_core-2.43.0.dist-info}/WHEEL +0 -0
- {docling_core-2.42.0.dist-info → docling_core-2.43.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.42.0.dist-info → docling_core-2.43.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.42.0.dist-info → docling_core-2.43.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Page-based chunker implementation: each chunk corresponds to a single page."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Iterator
|
|
6
|
+
|
|
7
|
+
from pydantic import ConfigDict
|
|
8
|
+
from typing_extensions import override
|
|
9
|
+
|
|
10
|
+
from docling_core.transforms.chunker import BaseChunker, DocChunk, DocMeta
|
|
11
|
+
from docling_core.transforms.chunker.hierarchical_chunker import (
|
|
12
|
+
ChunkingSerializerProvider,
|
|
13
|
+
)
|
|
14
|
+
from docling_core.types import DoclingDocument as DLDocument
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PageChunker(BaseChunker):
    r"""Chunker that emits at most one chunk per document page.

    The serializer obtained from ``serializer_provider`` is asked to render
    each page separately; a document without page information is rendered
    once as a whole instead.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Supplies the document serializer used to render each page's text.
    serializer_provider: ChunkingSerializerProvider = ChunkingSerializerProvider()

    @override
    def chunk(
        self,
        dl_doc: DLDocument,
        **kwargs: Any,
    ) -> Iterator[DocChunk]:
        """Yield the chunks of ``dl_doc``, one per page.

        Args:
            dl_doc: Document to split.
            **kwargs: Accepted for interface compatibility; not used here.

        Yields:
            One ``DocChunk`` per page, in ascending page-number order, whose
            serialization produced non-empty text. When ``dl_doc.pages`` is
            empty, at most one chunk covering the whole document. Headings
            and captions are always ``None``; the origin is taken from
            ``dl_doc``.
        """
        doc_serializer = self.serializer_provider.get_serializer(doc=dl_doc)

        # One keyword-argument set per serialize() call: a single-page filter
        # for every page, or no filter at all when the document has no pages.
        if dl_doc.pages:
            serialize_calls = [
                {"pages": {page_no}} for page_no in sorted(dl_doc.pages.keys())
            ]
        else:
            serialize_calls = [{}]

        for call_kwargs in serialize_calls:
            result = doc_serializer.serialize(**call_kwargs)
            if result.text:  # nothing is emitted for empty serializations
                yield DocChunk(
                    text=result.text,
                    meta=DocMeta(
                        doc_items=result.get_unique_doc_items(),
                        headings=None,
                        captions=None,
                        origin=dl_doc.origin,
                    ),
                )
|
|
@@ -39,6 +39,16 @@ class SerializationResult(BaseModel):
|
|
|
39
39
|
spans: list[Span] = []
|
|
40
40
|
# group: Optional[GroupItem] = None # set when result reflects specific group item
|
|
41
41
|
|
|
42
|
+
def get_unique_doc_items(self) -> list[DocItem]:
    """Return the items referenced by this result's spans, without repeats.

    Items are identified by their ``self_ref``. When several spans share a
    reference, only the item of the first such span is kept, and the
    returned list follows the order in which references first appear in
    ``self.spans``.
    """
    # A dict keeps insertion order, and setdefault never overwrites an
    # existing key, so the first item seen for each reference is retained.
    first_item_by_ref: dict[str, DocItem] = {}
    for span in self.spans:
        first_item_by_ref.setdefault(span.item.self_ref, span.item)
    return list(first_item_by_ref.values())
|
|
51
|
+
|
|
42
52
|
|
|
43
53
|
class BaseTextSerializer(ABC):
|
|
44
54
|
"""Base class for text item serializers."""
|
|
@@ -285,7 +285,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
285
285
|
|
|
286
286
|
def _serialize_body(self, **kwargs) -> SerializationResult:
|
|
287
287
|
"""Serialize the document body."""
|
|
288
|
-
subparts = self.get_parts()
|
|
288
|
+
subparts = self.get_parts(**kwargs)
|
|
289
289
|
res = self.serialize_doc(parts=subparts, **kwargs)
|
|
290
290
|
return res
|
|
291
291
|
|
|
@@ -4098,7 +4098,10 @@ class DoclingDocument(BaseModel):
|
|
|
4098
4098
|
return result
|
|
4099
4099
|
|
|
4100
4100
|
def _with_pictures_refs(
|
|
4101
|
-
self,
|
|
4101
|
+
self,
|
|
4102
|
+
image_dir: Path,
|
|
4103
|
+
page_no: Optional[int],
|
|
4104
|
+
reference_path: Optional[Path] = None,
|
|
4102
4105
|
) -> "DoclingDocument":
|
|
4103
4106
|
"""Document with images as refs.
|
|
4104
4107
|
|
|
@@ -4111,7 +4114,7 @@ class DoclingDocument(BaseModel):
|
|
|
4111
4114
|
image_dir.mkdir(parents=True, exist_ok=True)
|
|
4112
4115
|
|
|
4113
4116
|
if image_dir.is_dir():
|
|
4114
|
-
for item, level in result.iterate_items(with_groups=False):
|
|
4117
|
+
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
|
|
4115
4118
|
if isinstance(item, PictureItem):
|
|
4116
4119
|
|
|
4117
4120
|
if (
|
|
@@ -4211,7 +4214,7 @@ class DoclingDocument(BaseModel):
|
|
|
4211
4214
|
os.makedirs(artifacts_dir, exist_ok=True)
|
|
4212
4215
|
|
|
4213
4216
|
new_doc = self._make_copy_with_refmode(
|
|
4214
|
-
artifacts_dir, image_mode, reference_path=reference_path
|
|
4217
|
+
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
|
|
4215
4218
|
)
|
|
4216
4219
|
|
|
4217
4220
|
out = new_doc.export_to_dict(
|
|
@@ -4254,7 +4257,7 @@ class DoclingDocument(BaseModel):
|
|
|
4254
4257
|
os.makedirs(artifacts_dir, exist_ok=True)
|
|
4255
4258
|
|
|
4256
4259
|
new_doc = self._make_copy_with_refmode(
|
|
4257
|
-
artifacts_dir, image_mode, reference_path=reference_path
|
|
4260
|
+
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
|
|
4258
4261
|
)
|
|
4259
4262
|
|
|
4260
4263
|
out = new_doc.export_to_dict(
|
|
@@ -4327,7 +4330,7 @@ class DoclingDocument(BaseModel):
|
|
|
4327
4330
|
os.makedirs(artifacts_dir, exist_ok=True)
|
|
4328
4331
|
|
|
4329
4332
|
new_doc = self._make_copy_with_refmode(
|
|
4330
|
-
artifacts_dir, image_mode, reference_path=reference_path
|
|
4333
|
+
artifacts_dir, image_mode, page_no, reference_path=reference_path
|
|
4331
4334
|
)
|
|
4332
4335
|
|
|
4333
4336
|
md_out = new_doc.export_to_markdown(
|
|
@@ -4503,7 +4506,7 @@ class DoclingDocument(BaseModel):
|
|
|
4503
4506
|
os.makedirs(artifacts_dir, exist_ok=True)
|
|
4504
4507
|
|
|
4505
4508
|
new_doc = self._make_copy_with_refmode(
|
|
4506
|
-
artifacts_dir, image_mode, reference_path=reference_path
|
|
4509
|
+
artifacts_dir, image_mode, page_no, reference_path=reference_path
|
|
4507
4510
|
)
|
|
4508
4511
|
|
|
4509
4512
|
html_out = new_doc.export_to_html(
|
|
@@ -4542,6 +4545,7 @@ class DoclingDocument(BaseModel):
|
|
|
4542
4545
|
self,
|
|
4543
4546
|
artifacts_dir: Path,
|
|
4544
4547
|
image_mode: ImageRefMode,
|
|
4548
|
+
page_no: Optional[int],
|
|
4545
4549
|
reference_path: Optional[Path] = None,
|
|
4546
4550
|
):
|
|
4547
4551
|
new_doc = None
|
|
@@ -4549,7 +4553,7 @@ class DoclingDocument(BaseModel):
|
|
|
4549
4553
|
new_doc = self
|
|
4550
4554
|
elif image_mode == ImageRefMode.REFERENCED:
|
|
4551
4555
|
new_doc = self._with_pictures_refs(
|
|
4552
|
-
image_dir=artifacts_dir, reference_path=reference_path
|
|
4556
|
+
image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path
|
|
4553
4557
|
)
|
|
4554
4558
|
elif image_mode == ImageRefMode.EMBEDDED:
|
|
4555
4559
|
new_doc = self._with_embedded_pictures()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.42.0
|
|
3
|
+
Version: 2.43.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -17,17 +17,18 @@ docling_core/search/mapping.py,sha256=6rqG7LgYSeWmooKNEcRa5gFDLp1ZdzPqDGlwTA5gpO
|
|
|
17
17
|
docling_core/search/meta.py,sha256=wSurrsqdP1N3gQKx027fVdzVmc33a7Y6rPl-FClQvtA,3318
|
|
18
18
|
docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75A,1841
|
|
19
19
|
docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
|
|
20
|
-
docling_core/transforms/chunker/__init__.py,sha256=
|
|
20
|
+
docling_core/transforms/chunker/__init__.py,sha256=Qg5RhC-2QqdXKEfjzNGJaVi0NqBCL3xAhKWJGOlrE3M,375
|
|
21
21
|
docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
|
|
22
22
|
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=uDf-qGiIT_4JUEg9NOdzvDqAPOTqycKJ-jEpDkV3jJU,8243
|
|
23
23
|
docling_core/transforms/chunker/hybrid_chunker.py,sha256=xjkz8hy3tXXzkJzf7QMFOEq_v8V7Jcs9tCY0Mxjge74,12548
|
|
24
|
+
docling_core/transforms/chunker/page_chunker.py,sha256=gLUlqA_klK-rkuPVYuJKi3ZuTIGdd2HD7ces72AiZ2U,2018
|
|
24
25
|
docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
|
|
25
26
|
docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
|
|
26
27
|
docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZw3SBCoqJHM2Ihb65eiM29O9BR6o,2506
|
|
27
28
|
docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
|
|
28
29
|
docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
|
|
29
|
-
docling_core/transforms/serializer/base.py,sha256=
|
|
30
|
-
docling_core/transforms/serializer/common.py,sha256=
|
|
30
|
+
docling_core/transforms/serializer/base.py,sha256=TI8Epj7gyxdTet9j-Rs4o5U09gfACfAIVoirlschviM,7266
|
|
31
|
+
docling_core/transforms/serializer/common.py,sha256=0TNEGoA_rJ-qkVYp-X8SMUr3jTrbf6TRzPzwufYh5JM,19114
|
|
31
32
|
docling_core/transforms/serializer/doctags.py,sha256=TD0yAm1qSVy-GsE6svpUAI-Yqjcf2rrTZ3ac9YU3gbE,19858
|
|
32
33
|
docling_core/transforms/serializer/html.py,sha256=oxnUhszRPBINiK1tq2dwf5QjTCrIV_q15vsrPVqBeME,38988
|
|
33
34
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
@@ -41,7 +42,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
|
|
|
41
42
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
42
43
|
docling_core/types/doc/__init__.py,sha256=8hOhm5W9mArf3zwgfoMxDs1pHizhLFSAZlLu1tPBBRk,1641
|
|
43
44
|
docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
|
|
44
|
-
docling_core/types/doc/document.py,sha256=
|
|
45
|
+
docling_core/types/doc/document.py,sha256=SUqIJ-huO3ELLRdCMUYjkkXHeGGkeY2oaOWFQ1nq5lg,199315
|
|
45
46
|
docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
|
|
46
47
|
docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
|
|
47
48
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
@@ -74,9 +75,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
74
75
|
docling_core/utils/legacy.py,sha256=5lghO48OEcV9V51tRnH3YSKgLtdqhr-Q5C_OcJZ8TOs,24392
|
|
75
76
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
76
77
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
77
|
-
docling_core-2.
|
|
78
|
-
docling_core-2.
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
82
|
-
docling_core-2.
|
|
78
|
+
docling_core-2.43.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
79
|
+
docling_core-2.43.0.dist-info/METADATA,sha256=4aQkJub9YsHu2OrYq4Y2vIlBYHcOCC-ruxCN6EnWbW4,6453
|
|
80
|
+
docling_core-2.43.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
81
|
+
docling_core-2.43.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
82
|
+
docling_core-2.43.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
83
|
+
docling_core-2.43.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|