docling-core 2.27.0__py3-none-any.whl → 2.28.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/serializer/base.py +25 -19
- docling_core/experimental/serializer/common.py +17 -11
- docling_core/experimental/serializer/doctags.py +14 -11
- docling_core/experimental/serializer/html.py +21 -16
- docling_core/experimental/serializer/markdown.py +24 -16
- docling_core/transforms/chunker/hybrid_chunker.py +49 -31
- docling_core/transforms/chunker/tokenizer/__init__.py +1 -0
- docling_core/transforms/chunker/tokenizer/base.py +25 -0
- docling_core/transforms/chunker/tokenizer/huggingface.py +70 -0
- docling_core/transforms/chunker/tokenizer/openai.py +34 -0
- docling_core/transforms/visualizer/__init__.py +1 -0
- docling_core/transforms/visualizer/base.py +23 -0
- docling_core/transforms/visualizer/layout_visualizer.py +212 -0
- docling_core/transforms/visualizer/reading_order_visualizer.py +149 -0
- docling_core/types/doc/document.py +25 -3
- docling_core/types/doc/page.py +4 -3
- docling_core/types/legacy_doc/document.py +2 -2
- {docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/METADATA +4 -2
- {docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/RECORD +22 -14
- {docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/LICENSE +0 -0
- {docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/WHEEL +0 -0
- {docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/entry_points.txt +0 -0
|
docling_core/types/doc/document.py
CHANGED
|
@@ -300,7 +300,7 @@ class TableCell(BaseModel):
|
|
|
300
300
|
@classmethod
|
|
301
301
|
def from_dict_format(cls, data: Any) -> Any:
|
|
302
302
|
"""from_dict_format."""
|
|
303
|
-
if isinstance(data,
|
|
303
|
+
if isinstance(data, dict):
|
|
304
304
|
# Check if this is a native BoundingBox or a bbox from docling-ibm-models
|
|
305
305
|
if (
|
|
306
306
|
# "bbox" not in data
|
|
@@ -1383,7 +1383,7 @@ class TableItem(FloatingItem):
|
|
|
1383
1383
|
if add_cross_cell:
|
|
1384
1384
|
body.append(str(TableToken.OTSL_XCEL.value))
|
|
1385
1385
|
body.append(str(TableToken.OTSL_NL.value))
|
|
1386
|
-
|
|
1386
|
+
body_str = "".join(body)
|
|
1387
1387
|
return body_str
|
|
1388
1388
|
|
|
1389
1389
|
@deprecated("Use export_to_doctags() instead.")
|
|
@@ -2888,7 +2888,7 @@ class DoclingDocument(BaseModel):
|
|
|
2888
2888
|
mode: str = "json",
|
|
2889
2889
|
by_alias: bool = True,
|
|
2890
2890
|
exclude_none: bool = True,
|
|
2891
|
-
) -> Dict:
|
|
2891
|
+
) -> Dict[str, Any]:
|
|
2892
2892
|
"""Export to dict."""
|
|
2893
2893
|
out = self.model_dump(mode=mode, by_alias=by_alias, exclude_none=exclude_none)
|
|
2894
2894
|
|
|
@@ -4044,6 +4044,28 @@ class DoclingDocument(BaseModel):
|
|
|
4044
4044
|
self.pages[page_no] = pitem
|
|
4045
4045
|
return pitem
|
|
4046
4046
|
|
|
4047
|
+
def get_visualization(
|
|
4048
|
+
self, show_label: bool = True
|
|
4049
|
+
) -> dict[Optional[int], PILImage.Image]:
|
|
4050
|
+
"""Get visualization of the document as images by page."""
|
|
4051
|
+
from docling_core.transforms.visualizer.layout_visualizer import (
|
|
4052
|
+
LayoutVisualizer,
|
|
4053
|
+
)
|
|
4054
|
+
from docling_core.transforms.visualizer.reading_order_visualizer import (
|
|
4055
|
+
ReadingOrderVisualizer,
|
|
4056
|
+
)
|
|
4057
|
+
|
|
4058
|
+
visualizer = ReadingOrderVisualizer(
|
|
4059
|
+
base_visualizer=LayoutVisualizer(
|
|
4060
|
+
params=LayoutVisualizer.Params(
|
|
4061
|
+
show_label=show_label,
|
|
4062
|
+
),
|
|
4063
|
+
),
|
|
4064
|
+
)
|
|
4065
|
+
images = visualizer.get_visualization(doc=self)
|
|
4066
|
+
|
|
4067
|
+
return images
|
|
4068
|
+
|
|
4047
4069
|
@field_validator("version")
|
|
4048
4070
|
@classmethod
|
|
4049
4071
|
def check_version_is_compatible(cls, v: str) -> str:
|
docling_core/types/doc/page.py
CHANGED
|
@@ -10,6 +10,7 @@ from enum import Enum
|
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from typing import (
|
|
12
12
|
Annotated,
|
|
13
|
+
Any,
|
|
13
14
|
Dict,
|
|
14
15
|
Iterator,
|
|
15
16
|
List,
|
|
@@ -538,7 +539,7 @@ class SegmentedPdfPage(SegmentedPage):
|
|
|
538
539
|
cells.append(pc)
|
|
539
540
|
return cells
|
|
540
541
|
|
|
541
|
-
def export_to_dict(self) -> Dict:
|
|
542
|
+
def export_to_dict(self) -> Dict[str, Any]:
|
|
542
543
|
"""Export the page data to a dictionary.
|
|
543
544
|
|
|
544
545
|
Returns:
|
|
@@ -1150,7 +1151,7 @@ class PdfTableOfContents(BaseModel):
|
|
|
1150
1151
|
|
|
1151
1152
|
children: List["PdfTableOfContents"] = []
|
|
1152
1153
|
|
|
1153
|
-
def export_to_dict(self, mode: str = "json") -> Dict:
|
|
1154
|
+
def export_to_dict(self, mode: str = "json") -> Dict[str, Any]:
|
|
1154
1155
|
"""Export the table of contents to a dictionary.
|
|
1155
1156
|
|
|
1156
1157
|
Args:
|
|
@@ -1212,7 +1213,7 @@ class ParsedPdfDocument(BaseModel):
|
|
|
1212
1213
|
def export_to_dict(
|
|
1213
1214
|
self,
|
|
1214
1215
|
mode: str = "json",
|
|
1215
|
-
) -> Dict:
|
|
1216
|
+
) -> Dict[str, Any]:
|
|
1216
1217
|
"""Export the document to a dictionary.
|
|
1217
1218
|
|
|
1218
1219
|
Args:
|
|
docling_core/types/legacy_doc/document.py
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"""Models for the Docling Document data type."""
|
|
7
7
|
|
|
8
8
|
from datetime import datetime
|
|
9
|
-
from typing import Dict, Generic, Optional, Union
|
|
9
|
+
from typing import Any, Dict, Generic, Optional, Union
|
|
10
10
|
|
|
11
11
|
from pydantic import (
|
|
12
12
|
AnyHttpUrl,
|
|
@@ -434,7 +434,7 @@ class ExportedCCSDocument(
|
|
|
434
434
|
|
|
435
435
|
return pagedims
|
|
436
436
|
|
|
437
|
-
def export_to_dict(self) -> Dict:
|
|
437
|
+
def export_to_dict(self) -> Dict[str, Any]:
|
|
438
438
|
"""export_to_dict."""
|
|
439
439
|
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
|
|
440
440
|
|
|
{docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.27.0
|
|
3
|
+
Version: 2.28.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://github.com/docling-project
|
|
6
6
|
License: MIT
|
|
@@ -26,6 +26,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
26
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
27
|
Classifier: Typing :: Typed
|
|
28
28
|
Provides-Extra: chunking
|
|
29
|
+
Provides-Extra: chunking-openai
|
|
29
30
|
Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
30
31
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
31
32
|
Requires-Dist: latex2mathml (>=3.77.0,<4.0.0)
|
|
@@ -33,8 +34,9 @@ Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
|
33
34
|
Requires-Dist: pillow (>=10.0.0,<12.0.0)
|
|
34
35
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
|
|
35
36
|
Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
36
|
-
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
|
|
37
|
+
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking" or extra == "chunking-openai"
|
|
37
38
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
39
|
+
Requires-Dist: tiktoken (>=0.9.0,<0.10.0) ; extra == "chunking-openai"
|
|
38
40
|
Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
|
|
39
41
|
Requires-Dist: typer (>=0.12.5,<0.16.0)
|
|
40
42
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
|
|
{docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/RECORD
CHANGED
|
@@ -3,12 +3,12 @@ docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,
|
|
|
3
3
|
docling_core/cli/view.py,sha256=gwxSBYhGqwznMR8pdXaEuAh2bjFD5X_g11xFYSgFgtM,1764
|
|
4
4
|
docling_core/experimental/__init__.py,sha256=XnAVSUHbA6OFhNSpoYqSD3u83-xVaUaki1DIKFw69Ew,99
|
|
5
5
|
docling_core/experimental/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
|
|
6
|
-
docling_core/experimental/serializer/base.py,sha256=
|
|
7
|
-
docling_core/experimental/serializer/common.py,sha256=
|
|
8
|
-
docling_core/experimental/serializer/doctags.py,sha256=
|
|
9
|
-
docling_core/experimental/serializer/html.py,sha256=
|
|
6
|
+
docling_core/experimental/serializer/base.py,sha256=9bgpWA0oMmZNRc3yIuZVnu5bJ1glClBsswtVF1vYwMI,6046
|
|
7
|
+
docling_core/experimental/serializer/common.py,sha256=uviwBuYowzqvCbY-vy8v2VaEadJISk9aDETrkrfDo38,17437
|
|
8
|
+
docling_core/experimental/serializer/doctags.py,sha256=RbHdqmFJ-t3oUvCsv0QjbIZqgUajPrt41jMaJGp4sdA,17874
|
|
9
|
+
docling_core/experimental/serializer/html.py,sha256=By7NoDXQ4GDW-iFf8zWCYuU4f_TOHA8i86eGk60d4WM,33070
|
|
10
10
|
docling_core/experimental/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
11
|
-
docling_core/experimental/serializer/markdown.py,sha256=
|
|
11
|
+
docling_core/experimental/serializer/markdown.py,sha256=WineuzwGDbFhbqEdz-sNWYewrUwBM0zfj88T8URaq6w,17877
|
|
12
12
|
docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
|
|
14
14
|
docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686
|
|
@@ -27,14 +27,22 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
|
|
|
27
27
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
28
28
|
docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
|
|
29
29
|
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=iYzA65INFo89klc94jixuzQP8ivywe-3aVYznt2Csv8,8287
|
|
30
|
-
docling_core/transforms/chunker/hybrid_chunker.py,sha256=
|
|
30
|
+
docling_core/transforms/chunker/hybrid_chunker.py,sha256=i2rxSE_6JZPClljcA_HVf0Pq5KgLyILhzG7CwRFcTIE,11888
|
|
31
|
+
docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
|
|
32
|
+
docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
|
|
33
|
+
docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=JQ-D3b5vTPQbvu4HaMfYqFzSBLbV_HnmoBGv7d6Kqn4,2220
|
|
34
|
+
docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
|
|
35
|
+
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
36
|
+
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
37
|
+
docling_core/transforms/visualizer/layout_visualizer.py,sha256=nUOiDHuDYLM-Bcagiwz6JicaAhZroOdFCOyl1I8GUjA,7655
|
|
38
|
+
docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=XXVuiI-Y0AH5uJCXINmfzcSSkTwR55-4fL6TOgzir6Y,5203
|
|
31
39
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
32
40
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
33
41
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
34
42
|
docling_core/types/doc/base.py,sha256=sM3IyFXzVh2WT8IGh5nejXYh8sf39yBh8TBSlHeJ9CI,12611
|
|
35
|
-
docling_core/types/doc/document.py,sha256=
|
|
43
|
+
docling_core/types/doc/document.py,sha256=4toRMU04V1rWaquyvcXPB9hzefD3cH_8MatgBCf1Mc4,140170
|
|
36
44
|
docling_core/types/doc/labels.py,sha256=3QgteZZ4jKi0fideTuTnuriviJBwew-5RKE4pse7Ppk,5812
|
|
37
|
-
docling_core/types/doc/page.py,sha256=
|
|
45
|
+
docling_core/types/doc/page.py,sha256=44tK6XM6Py0pK7zTyJ4kaZ5MLj8PvXIiw31hoQYa-Xs,40309
|
|
38
46
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
39
47
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
40
48
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
@@ -45,7 +53,7 @@ docling_core/types/legacy_doc/base.py,sha256=aBKBunw6M6nvEq4lqP1cfFWK3GpGa6PXwNQ
|
|
|
45
53
|
docling_core/types/legacy_doc/doc_ann.py,sha256=CIQHW8yzu70bsMR9gtu7dqe4oz603Tq2eDDt9sh-tYo,1203
|
|
46
54
|
docling_core/types/legacy_doc/doc_ocr.py,sha256=FfFqHAyMSbFt5cKeE7QLcxS0qUweBilBJoN9CH2TsQs,1394
|
|
47
55
|
docling_core/types/legacy_doc/doc_raw.py,sha256=LrvQ9DhNjBRy98p_F9PUyHZeTGAxMKWqJzY4WJ7v-xs,3895
|
|
48
|
-
docling_core/types/legacy_doc/document.py,sha256=
|
|
56
|
+
docling_core/types/legacy_doc/document.py,sha256=lEuxUS03YrY4dKvfzB1I208x6LtD0zukV9QU0hfjuwM,24549
|
|
49
57
|
docling_core/types/legacy_doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
|
|
50
58
|
docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
|
|
51
59
|
docling_core/types/nlp/qa.py,sha256=TyZjubqkEoREv0YzmuLKlq4WW_TnJNj7BoBY1_r2a1E,2731
|
|
@@ -65,8 +73,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
65
73
|
docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
|
|
66
74
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
67
75
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
68
|
-
docling_core-2.27.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
69
|
-
docling_core-2.
|
|
70
|
-
docling_core-2.27.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
71
|
-
docling_core-2.27.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
72
|
-
docling_core-2.27.0.dist-info/RECORD,,
|
|
76
|
+
docling_core-2.28.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
+
docling_core-2.28.1.dist-info/METADATA,sha256=02gkT1pLcBA0yagMKLBAEpjML_omcGsZS8dDG2RCFVY,5976
|
|
78
|
+
docling_core-2.28.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
79
|
+
docling_core-2.28.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
80
|
+
docling_core-2.28.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|