docling-core 2.27.0__py3-none-any.whl → 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/serializer/base.py +25 -19
- docling_core/experimental/serializer/common.py +17 -11
- docling_core/experimental/serializer/doctags.py +14 -11
- docling_core/experimental/serializer/html.py +21 -16
- docling_core/experimental/serializer/markdown.py +24 -16
- docling_core/transforms/chunker/hybrid_chunker.py +49 -31
- docling_core/transforms/chunker/tokenizer/__init__.py +1 -0
- docling_core/transforms/chunker/tokenizer/base.py +25 -0
- docling_core/transforms/chunker/tokenizer/huggingface.py +70 -0
- docling_core/transforms/chunker/tokenizer/openai.py +34 -0
- docling_core/transforms/visualizer/__init__.py +1 -0
- docling_core/transforms/visualizer/base.py +23 -0
- docling_core/transforms/visualizer/layout_visualizer.py +201 -0
- docling_core/transforms/visualizer/reading_order_visualizer.py +149 -0
- docling_core/types/doc/document.py +24 -2
- docling_core/types/doc/page.py +4 -3
- docling_core/types/legacy_doc/document.py +2 -2
- {docling_core-2.27.0.dist-info → docling_core-2.28.0.dist-info}/METADATA +4 -2
- {docling_core-2.27.0.dist-info → docling_core-2.28.0.dist-info}/RECORD +22 -14
- {docling_core-2.27.0.dist-info → docling_core-2.28.0.dist-info}/LICENSE +0 -0
- {docling_core-2.27.0.dist-info → docling_core-2.28.0.dist-info}/WHEEL +0 -0
- {docling_core-2.27.0.dist-info → docling_core-2.28.0.dist-info}/entry_points.txt +0 -0
docling_core/types/doc/page.py
CHANGED
|
@@ -10,6 +10,7 @@ from enum import Enum
|
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from typing import (
|
|
12
12
|
Annotated,
|
|
13
|
+
Any,
|
|
13
14
|
Dict,
|
|
14
15
|
Iterator,
|
|
15
16
|
List,
|
|
@@ -538,7 +539,7 @@ class SegmentedPdfPage(SegmentedPage):
|
|
|
538
539
|
cells.append(pc)
|
|
539
540
|
return cells
|
|
540
541
|
|
|
541
|
-
def export_to_dict(self) -> Dict:
|
|
542
|
+
def export_to_dict(self) -> Dict[str, Any]:
|
|
542
543
|
"""Export the page data to a dictionary.
|
|
543
544
|
|
|
544
545
|
Returns:
|
|
@@ -1150,7 +1151,7 @@ class PdfTableOfContents(BaseModel):
|
|
|
1150
1151
|
|
|
1151
1152
|
children: List["PdfTableOfContents"] = []
|
|
1152
1153
|
|
|
1153
|
-
def export_to_dict(self, mode: str = "json") -> Dict:
|
|
1154
|
+
def export_to_dict(self, mode: str = "json") -> Dict[str, Any]:
|
|
1154
1155
|
"""Export the table of contents to a dictionary.
|
|
1155
1156
|
|
|
1156
1157
|
Args:
|
|
@@ -1212,7 +1213,7 @@ class ParsedPdfDocument(BaseModel):
|
|
|
1212
1213
|
def export_to_dict(
|
|
1213
1214
|
self,
|
|
1214
1215
|
mode: str = "json",
|
|
1215
|
-
) -> Dict:
|
|
1216
|
+
) -> Dict[str, Any]:
|
|
1216
1217
|
"""Export the document to a dictionary.
|
|
1217
1218
|
|
|
1218
1219
|
Args:
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"""Models for the Docling Document data type."""
|
|
7
7
|
|
|
8
8
|
from datetime import datetime
|
|
9
|
-
from typing import Dict, Generic, Optional, Union
|
|
9
|
+
from typing import Any, Dict, Generic, Optional, Union
|
|
10
10
|
|
|
11
11
|
from pydantic import (
|
|
12
12
|
AnyHttpUrl,
|
|
@@ -434,7 +434,7 @@ class ExportedCCSDocument(
|
|
|
434
434
|
|
|
435
435
|
return pagedims
|
|
436
436
|
|
|
437
|
-
def export_to_dict(self) -> Dict:
|
|
437
|
+
def export_to_dict(self) -> Dict[str, Any]:
|
|
438
438
|
"""export_to_dict."""
|
|
439
439
|
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
|
|
440
440
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.27.0
|
|
3
|
+
Version: 2.28.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://github.com/docling-project
|
|
6
6
|
License: MIT
|
|
@@ -26,6 +26,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
26
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
27
|
Classifier: Typing :: Typed
|
|
28
28
|
Provides-Extra: chunking
|
|
29
|
+
Provides-Extra: chunking-openai
|
|
29
30
|
Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
30
31
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
31
32
|
Requires-Dist: latex2mathml (>=3.77.0,<4.0.0)
|
|
@@ -33,8 +34,9 @@ Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
|
33
34
|
Requires-Dist: pillow (>=10.0.0,<12.0.0)
|
|
34
35
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
|
|
35
36
|
Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
36
|
-
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
|
|
37
|
+
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking" or extra == "chunking-openai"
|
|
37
38
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
39
|
+
Requires-Dist: tiktoken (>=0.9.0,<0.10.0) ; extra == "chunking-openai"
|
|
38
40
|
Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
|
|
39
41
|
Requires-Dist: typer (>=0.12.5,<0.16.0)
|
|
40
42
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
|
|
@@ -3,12 +3,12 @@ docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,
|
|
|
3
3
|
docling_core/cli/view.py,sha256=gwxSBYhGqwznMR8pdXaEuAh2bjFD5X_g11xFYSgFgtM,1764
|
|
4
4
|
docling_core/experimental/__init__.py,sha256=XnAVSUHbA6OFhNSpoYqSD3u83-xVaUaki1DIKFw69Ew,99
|
|
5
5
|
docling_core/experimental/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
|
|
6
|
-
docling_core/experimental/serializer/base.py,sha256=
|
|
7
|
-
docling_core/experimental/serializer/common.py,sha256=
|
|
8
|
-
docling_core/experimental/serializer/doctags.py,sha256=
|
|
9
|
-
docling_core/experimental/serializer/html.py,sha256=
|
|
6
|
+
docling_core/experimental/serializer/base.py,sha256=9bgpWA0oMmZNRc3yIuZVnu5bJ1glClBsswtVF1vYwMI,6046
|
|
7
|
+
docling_core/experimental/serializer/common.py,sha256=uviwBuYowzqvCbY-vy8v2VaEadJISk9aDETrkrfDo38,17437
|
|
8
|
+
docling_core/experimental/serializer/doctags.py,sha256=RbHdqmFJ-t3oUvCsv0QjbIZqgUajPrt41jMaJGp4sdA,17874
|
|
9
|
+
docling_core/experimental/serializer/html.py,sha256=By7NoDXQ4GDW-iFf8zWCYuU4f_TOHA8i86eGk60d4WM,33070
|
|
10
10
|
docling_core/experimental/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
11
|
-
docling_core/experimental/serializer/markdown.py,sha256=
|
|
11
|
+
docling_core/experimental/serializer/markdown.py,sha256=WineuzwGDbFhbqEdz-sNWYewrUwBM0zfj88T8URaq6w,17877
|
|
12
12
|
docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
|
|
14
14
|
docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686
|
|
@@ -27,14 +27,22 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
|
|
|
27
27
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
28
28
|
docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
|
|
29
29
|
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=iYzA65INFo89klc94jixuzQP8ivywe-3aVYznt2Csv8,8287
|
|
30
|
-
docling_core/transforms/chunker/hybrid_chunker.py,sha256=
|
|
30
|
+
docling_core/transforms/chunker/hybrid_chunker.py,sha256=i2rxSE_6JZPClljcA_HVf0Pq5KgLyILhzG7CwRFcTIE,11888
|
|
31
|
+
docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
|
|
32
|
+
docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
|
|
33
|
+
docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=JQ-D3b5vTPQbvu4HaMfYqFzSBLbV_HnmoBGv7d6Kqn4,2220
|
|
34
|
+
docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
|
|
35
|
+
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
36
|
+
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
37
|
+
docling_core/transforms/visualizer/layout_visualizer.py,sha256=G_xPs5S_87RPPAIMKM6ryMU2aV_zGLYUTOlTQprIRD4,7336
|
|
38
|
+
docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=XXVuiI-Y0AH5uJCXINmfzcSSkTwR55-4fL6TOgzir6Y,5203
|
|
31
39
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
32
40
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
33
41
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
34
42
|
docling_core/types/doc/base.py,sha256=sM3IyFXzVh2WT8IGh5nejXYh8sf39yBh8TBSlHeJ9CI,12611
|
|
35
|
-
docling_core/types/doc/document.py,sha256=
|
|
43
|
+
docling_core/types/doc/document.py,sha256=uYQTUEeZ40T5698Xff7NhC3iTbk1F76omZNvHIUmrfc,140174
|
|
36
44
|
docling_core/types/doc/labels.py,sha256=3QgteZZ4jKi0fideTuTnuriviJBwew-5RKE4pse7Ppk,5812
|
|
37
|
-
docling_core/types/doc/page.py,sha256=
|
|
45
|
+
docling_core/types/doc/page.py,sha256=44tK6XM6Py0pK7zTyJ4kaZ5MLj8PvXIiw31hoQYa-Xs,40309
|
|
38
46
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
39
47
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
40
48
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
@@ -45,7 +53,7 @@ docling_core/types/legacy_doc/base.py,sha256=aBKBunw6M6nvEq4lqP1cfFWK3GpGa6PXwNQ
|
|
|
45
53
|
docling_core/types/legacy_doc/doc_ann.py,sha256=CIQHW8yzu70bsMR9gtu7dqe4oz603Tq2eDDt9sh-tYo,1203
|
|
46
54
|
docling_core/types/legacy_doc/doc_ocr.py,sha256=FfFqHAyMSbFt5cKeE7QLcxS0qUweBilBJoN9CH2TsQs,1394
|
|
47
55
|
docling_core/types/legacy_doc/doc_raw.py,sha256=LrvQ9DhNjBRy98p_F9PUyHZeTGAxMKWqJzY4WJ7v-xs,3895
|
|
48
|
-
docling_core/types/legacy_doc/document.py,sha256=
|
|
56
|
+
docling_core/types/legacy_doc/document.py,sha256=lEuxUS03YrY4dKvfzB1I208x6LtD0zukV9QU0hfjuwM,24549
|
|
49
57
|
docling_core/types/legacy_doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
|
|
50
58
|
docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
|
|
51
59
|
docling_core/types/nlp/qa.py,sha256=TyZjubqkEoREv0YzmuLKlq4WW_TnJNj7BoBY1_r2a1E,2731
|
|
@@ -65,8 +73,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
65
73
|
docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
|
|
66
74
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
67
75
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
68
|
-
docling_core-2.
|
|
69
|
-
docling_core-2.
|
|
70
|
-
docling_core-2.
|
|
71
|
-
docling_core-2.
|
|
72
|
-
docling_core-2.27.0.dist-info/RECORD,,
|
|
76
|
+
docling_core-2.28.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
77
|
+
docling_core-2.28.0.dist-info/METADATA,sha256=uWyLwSsIWmUuQvfTYctf24fkDeYck3PAE9UsjSf85z8,5976
|
|
78
|
+
docling_core-2.28.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
79
|
+
docling_core-2.28.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
80
|
+
docling_core-2.28.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|