docling-core 2.41.0__tar.gz → 2.43.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.41.0 → docling_core-2.43.0}/PKG-INFO +1 -1
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/chunker/__init__.py +1 -0
- docling_core-2.43.0/docling_core/transforms/chunker/page_chunker.py +59 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/serializer/base.py +10 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/serializer/common.py +1 -1
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/doc/base.py +17 -10
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/doc/document.py +1296 -158
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/doc/page.py +7 -3
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core.egg-info/SOURCES.txt +2 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/pyproject.toml +1 -1
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_docling_doc.py +289 -2
- docling_core-2.43.0/test/test_page_chunker.py +36 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/LICENSE +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/README.md +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/py.typed +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/search/package.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/serializer/html.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/serializer/markdown.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/base.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core/utils/validators.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/setup.cfg +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_base.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_collection.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_doc_base.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_doc_schema.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_doctags_load.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_nlp_qa.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_page.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_rec_schema.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_search_meta.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_serialization.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_utils.py +0 -0
- {docling_core-2.41.0 → docling_core-2.43.0}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.41.0
|
|
3
|
+
Version: 2.43.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Page-based chunker implementation: each chunk corresponds to a single page."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Iterator
|
|
6
|
+
|
|
7
|
+
from pydantic import ConfigDict
|
|
8
|
+
from typing_extensions import override
|
|
9
|
+
|
|
10
|
+
from docling_core.transforms.chunker import BaseChunker, DocChunk, DocMeta
|
|
11
|
+
from docling_core.transforms.chunker.hierarchical_chunker import (
|
|
12
|
+
ChunkingSerializerProvider,
|
|
13
|
+
)
|
|
14
|
+
from docling_core.types import DoclingDocument as DLDocument
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PageChunker(BaseChunker):
|
|
18
|
+
r"""Chunker implementation that yields one chunk per page."""
|
|
19
|
+
|
|
20
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
21
|
+
|
|
22
|
+
serializer_provider: ChunkingSerializerProvider = ChunkingSerializerProvider()
|
|
23
|
+
|
|
24
|
+
@override
|
|
25
|
+
def chunk(
|
|
26
|
+
self,
|
|
27
|
+
dl_doc: DLDocument,
|
|
28
|
+
**kwargs: Any,
|
|
29
|
+
) -> Iterator[DocChunk]:
|
|
30
|
+
"""Chunk the provided document by page."""
|
|
31
|
+
my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
|
|
32
|
+
if dl_doc.pages:
|
|
33
|
+
# chunk by page
|
|
34
|
+
for page_no in sorted(dl_doc.pages.keys()):
|
|
35
|
+
ser_res = my_doc_ser.serialize(pages={page_no})
|
|
36
|
+
if not ser_res.text:
|
|
37
|
+
continue
|
|
38
|
+
yield DocChunk(
|
|
39
|
+
text=ser_res.text,
|
|
40
|
+
meta=DocMeta(
|
|
41
|
+
doc_items=ser_res.get_unique_doc_items(),
|
|
42
|
+
headings=None,
|
|
43
|
+
captions=None,
|
|
44
|
+
origin=dl_doc.origin,
|
|
45
|
+
),
|
|
46
|
+
)
|
|
47
|
+
else:
|
|
48
|
+
# if no pages, treat whole document as single chunk
|
|
49
|
+
ser_res = my_doc_ser.serialize()
|
|
50
|
+
if ser_res.text:
|
|
51
|
+
yield DocChunk(
|
|
52
|
+
text=ser_res.text,
|
|
53
|
+
meta=DocMeta(
|
|
54
|
+
doc_items=ser_res.get_unique_doc_items(),
|
|
55
|
+
headings=None,
|
|
56
|
+
captions=None,
|
|
57
|
+
origin=dl_doc.origin,
|
|
58
|
+
),
|
|
59
|
+
)
|
|
@@ -39,6 +39,16 @@ class SerializationResult(BaseModel):
|
|
|
39
39
|
spans: list[Span] = []
|
|
40
40
|
# group: Optional[GroupItem] = None # set when result reflects specific group item
|
|
41
41
|
|
|
42
|
+
def get_unique_doc_items(self) -> list[DocItem]:
|
|
43
|
+
"""Get the doc items corresponding to this result."""
|
|
44
|
+
seen_doc_item_refs: set[str] = set()
|
|
45
|
+
doc_items: list[DocItem] = []
|
|
46
|
+
for span in self.spans:
|
|
47
|
+
if span.item.self_ref not in seen_doc_item_refs:
|
|
48
|
+
seen_doc_item_refs.add(span.item.self_ref)
|
|
49
|
+
doc_items.append(span.item)
|
|
50
|
+
return doc_items
|
|
51
|
+
|
|
42
52
|
|
|
43
53
|
class BaseTextSerializer(ABC):
|
|
44
54
|
"""Base class for text item serializers."""
|
|
@@ -285,7 +285,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
285
285
|
|
|
286
286
|
def _serialize_body(self, **kwargs) -> SerializationResult:
|
|
287
287
|
"""Serialize the document body."""
|
|
288
|
-
subparts = self.get_parts()
|
|
288
|
+
subparts = self.get_parts(**kwargs)
|
|
289
289
|
res = self.serialize_doc(parts=subparts, **kwargs)
|
|
290
290
|
return res
|
|
291
291
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Models for the base data types."""
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import List, Tuple
|
|
4
|
+
from typing import Any, List, Tuple
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel, FieldSerializationInfo, field_serializer
|
|
7
7
|
|
|
@@ -21,16 +21,23 @@ class CoordOrigin(str, Enum):
|
|
|
21
21
|
BOTTOMLEFT = "BOTTOMLEFT"
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
class PydanticSerCtxKey(str, Enum):
|
|
25
|
+
"""Pydantic serialization context keys."""
|
|
25
26
|
|
|
27
|
+
COORD_PREC = "coord_prec" # key for coordinates precision
|
|
28
|
+
CONFID_PREC = "confid_prec" # key for confidence values precision
|
|
26
29
|
|
|
27
|
-
|
|
28
|
-
|
|
30
|
+
|
|
31
|
+
def round_pydantic_float(
|
|
32
|
+
val: float, ctx: Any, precision_ctx_key: PydanticSerCtxKey
|
|
29
33
|
) -> float:
|
|
30
|
-
precision
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
+
"""Round float, provided the precision is available in the context."""
|
|
35
|
+
precision = (
|
|
36
|
+
ctx.get(precision_ctx_key.value)
|
|
37
|
+
if isinstance(ctx, dict)
|
|
38
|
+
else getattr(ctx, precision_ctx_key.value, None)
|
|
39
|
+
)
|
|
40
|
+
return round(val, precision) if isinstance(precision, int) else val
|
|
34
41
|
|
|
35
42
|
|
|
36
43
|
class Size(BaseModel):
|
|
@@ -41,7 +48,7 @@ class Size(BaseModel):
|
|
|
41
48
|
|
|
42
49
|
@field_serializer("width", "height")
|
|
43
50
|
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
|
44
|
-
return
|
|
51
|
+
return round_pydantic_float(value, info.context, PydanticSerCtxKey.COORD_PREC)
|
|
45
52
|
|
|
46
53
|
def as_tuple(self):
|
|
47
54
|
"""as_tuple."""
|
|
@@ -70,7 +77,7 @@ class BoundingBox(BaseModel):
|
|
|
70
77
|
|
|
71
78
|
@field_serializer("l", "t", "r", "b")
|
|
72
79
|
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
|
73
|
-
return
|
|
80
|
+
return round_pydantic_float(value, info.context, PydanticSerCtxKey.COORD_PREC)
|
|
74
81
|
|
|
75
82
|
def resize_by_scale(self, x_scale: float, y_scale: float):
|
|
76
83
|
"""resize_by_scale."""
|