docling-core 2.6.0__tar.gz → 2.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.6.0 → docling_core-2.7.0}/PKG-INFO +2 -2
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/doc/document.py +95 -3
- docling_core-2.7.0/docling_core/types/doc/utils.py +48 -0
- docling_core-2.7.0/docling_core/types/legacy_doc/tokens.py +202 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/utils/file.py +1 -40
- {docling_core-2.6.0 → docling_core-2.7.0}/pyproject.toml +2 -2
- {docling_core-2.6.0 → docling_core-2.7.0}/LICENSE +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/README.md +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/py.typed +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/search/package.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/base.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.6.0/docling_core/types/legacy_doc → docling_core-2.7.0/docling_core/types/doc}/tokens.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.6.0 → docling_core-2.7.0}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.6.0
|
|
3
|
+
Version: 2.7.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -29,7 +29,7 @@ Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
|
29
29
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
30
30
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
31
31
|
Requires-Dist: pillow (>=10.3.0,<11.0.0)
|
|
32
|
-
Requires-Dist: pydantic (>=2.6.0,<2.10)
|
|
32
|
+
Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
|
|
33
33
|
Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
34
34
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
35
35
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
|
|
@@ -37,8 +37,8 @@ from docling_core.types.base import _JSON_POINTER_REGEX
|
|
|
37
37
|
from docling_core.types.doc import BoundingBox, Size
|
|
38
38
|
from docling_core.types.doc.base import ImageRefMode
|
|
39
39
|
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
|
|
40
|
-
from docling_core.types.legacy_doc.tokens import DocumentToken
|
|
41
|
-
from docling_core.utils.file import relative_path
|
|
40
|
+
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
41
|
+
from docling_core.types.doc.utils import relative_path
|
|
42
42
|
|
|
43
43
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
44
44
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
@@ -1008,7 +1008,6 @@ class TableItem(FloatingItem):
|
|
|
1008
1008
|
DeprecationWarning,
|
|
1009
1009
|
)
|
|
1010
1010
|
|
|
1011
|
-
body = ""
|
|
1012
1011
|
nrows = self.data.num_rows
|
|
1013
1012
|
ncols = self.data.num_cols
|
|
1014
1013
|
|
|
@@ -1065,6 +1064,99 @@ class TableItem(FloatingItem):
|
|
|
1065
1064
|
|
|
1066
1065
|
return body
|
|
1067
1066
|
|
|
1067
|
+
def export_to_otsl(
    self,
    doc: "DoclingDocument",
    add_cell_location: bool = True,
    add_cell_text: bool = True,
    xsize: int = 100,
    ysize: int = 100,
) -> str:
    """Export the table as an OTSL token string.

    The table grid is walked row by row. One structural token is emitted per
    grid position and an ``OTSL_NL`` token closes every row. For the origin
    position of a non-empty cell the structural token may be followed by the
    cell location string and the stripped cell text.

    Args:
        doc: Document the table belongs to; its ``pages`` mapping supplies the
            page size used to normalise cell bounding boxes.
        add_cell_location: Append the location string after the token of each
            non-empty origin cell.
        add_cell_text: Append the stripped cell text after the token of each
            non-empty origin cell.
        xsize: Normalisation range passed to ``DocumentToken.get_location``
            for x coordinates.
        ysize: Normalisation range passed to ``DocumentToken.get_location``
            for y coordinates.

    Returns:
        The concatenated OTSL string, or ``""`` when the table has no cells.
    """
    # Possible OTSL tokens...
    #
    # Empty and full cells:
    # "ecel", "fcel"
    #
    # Cell spans (horizontal, vertical, 2d):
    # "lcel", "ucel", "xcel"
    #
    # New line:
    # "nl"
    #
    # Headers (column, row, section row):
    # "ched", "rhed", "srow"

    if len(self.data.table_cells) == 0:
        return ""

    nrows = self.data.num_rows
    ncols = self.data.num_cols

    # Page of the first provenance entry; 0 when the table has no provenance.
    page_no = 0
    if len(self.prov) > 0:
        page_no = self.prov[0].page_no

    # The page size is identical for every cell, so look it up once.
    # It stays None for a document without pages; cell locations are then
    # left empty instead of reading unbound page_w / page_h variables.
    page_size = None
    if len(doc.pages.keys()):
        page_size = doc.pages[page_no].size.as_tuple()

    body = []
    for i in range(nrows):
        for j in range(ncols):
            cell: TableCell = self.data.grid[i][j]
            content = cell.text.strip()
            rowspan, rowstart = (
                cell.row_span,
                cell.start_row_offset_idx,
            )
            colspan, colstart = (
                cell.col_span,
                cell.start_col_offset_idx,
            )

            cell_loc = ""
            if page_size is not None and cell.bbox is not None:
                page_w, page_h = page_size
                cell_loc = DocumentToken.get_location(
                    bbox=cell.bbox.to_bottom_left_origin(page_h).as_tuple(),
                    page_w=page_w,
                    page_h=page_h,
                    xsize=xsize,
                    ysize=ysize,
                    page_i=page_no,
                )

            if rowstart == i and colstart == j:
                # Origin position of the cell: emits the cell-type token.
                if len(content) > 0:
                    if cell.column_header:
                        body.append(str(TableToken.OTSL_CHED.value))
                    elif cell.row_header:
                        body.append(str(TableToken.OTSL_RHED.value))
                    elif cell.row_section:
                        body.append(str(TableToken.OTSL_SROW.value))
                    else:
                        body.append(str(TableToken.OTSL_FCEL.value))
                    if add_cell_location:
                        body.append(str(cell_loc))
                    if add_cell_text:
                        body.append(str(content))
                else:
                    body.append(str(TableToken.OTSL_ECEL.value))
            else:
                # Position covered by a span that started elsewhere.
                add_cross_cell = False
                if rowstart != i:
                    if colspan == 1:
                        body.append(str(TableToken.OTSL_UCEL.value))
                    else:
                        add_cross_cell = True
                if colstart != j:
                    if rowspan == 1:
                        body.append(str(TableToken.OTSL_LCEL.value))
                    else:
                        add_cross_cell = True
                if add_cross_cell:
                    body.append(str(TableToken.OTSL_XCEL.value))
        body.append(str(TableToken.OTSL_NL.value))
    body_str = "".join(body)
    return body_str
|
|
1159
|
+
|
|
1068
1160
|
def export_to_document_tokens(
|
|
1069
1161
|
self,
|
|
1070
1162
|
doc: "DoclingDocument",
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Utils for document types."""
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def relative_path(src: Path, target: Path) -> Path:
    """Compute the relative path that leads from `src` to `target`.

    Both inputs are passed through `Path.resolve()` first, so the comparison
    is done on resolved paths.

    Args:
        src (str | Path): The source directory or file path.
        target (str | Path): The target directory or file path.

    Returns:
        Path: `..` once for every component of `src` below the shared prefix,
            followed by the components of `target` below that prefix.

    Raises:
        ValueError: If `src` or `target` is not absolute after resolving.
    """
    resolved_src = Path(src).resolve()
    resolved_target = Path(target).resolve()

    # Both resolved paths have to be absolute.
    if not resolved_src.is_absolute():
        raise ValueError(f"The source path must be absolute: {resolved_src}")
    if not resolved_target.is_absolute():
        raise ValueError(f"The target path must be absolute: {resolved_target}")

    src_parts = resolved_src.parts
    target_parts = resolved_target.parts

    # Count the leading components both paths have in common.
    shared = 0
    for left, right in zip(src_parts, target_parts):
        if left != right:
            break
        shared += 1

    # Climb from src up to the common ancestor, then descend to the target.
    climb = [".."] * (len(src_parts) - shared)
    descend = target_parts[shared:]
    return Path(*climb, *descend)
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Tokens used in the docling document model."""
|
|
7
|
+
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Annotated, Tuple
|
|
10
|
+
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TableToken(Enum):
    """Special tokens for an LLM-friendly representation of a table."""

    CELL_LABEL_COLUMN_HEADER = "<column_header>"
    CELL_LABEL_ROW_HEADER = "<row_header>"
    CELL_LABEL_SECTION_HEADERE = "<section_header>"
    # Correctly spelled alias of the member above. The misspelled name is
    # defined first so it stays the canonical member; enum aliases are not
    # yielded by iteration, so get_special_tokens() is unaffected.
    CELL_LABEL_SECTION_HEADER = "<section_header>"
    CELL_LABEL_DATA = "<data>"

    OTSL_ECEL = "<ecel>"  # empty cell
    OTSL_FCEL = "<fcel>"  # cell with content
    OTSL_LCEL = "<lcel>"  # left looking cell,
    OTSL_UCEL = "<ucel>"  # up looking cell,
    OTSL_XCEL = "<xcel>"  # 2d extension cell (cross cell),
    OTSL_NL = "<nl>"  # new line,
    OTSL_CHED = "<ched>"  # - column header cell,
    OTSL_RHED = "<rhed>"  # - row header cell,
    OTSL_SROW = "<srow>"  # - section row cell

    @classmethod
    def get_special_tokens(cls):
        """Return the string values of all table tokens, in definition order."""
        special_tokens = [token.value for token in cls]
        return special_tokens

    @staticmethod
    def is_known_token(label):
        """Return True if `label` equals the value of one of the table tokens."""
        return label in TableToken.get_special_tokens()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DocumentToken(Enum):
    """Special tokens for an LLM-friendly representation of a document.

    The enum members are the fixed begin/end tags. The static helpers build
    the parameterised tokens for rows, columns, pages and locations.
    """

    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ):
        """Return all special document tokens as a list of strings.

        The list starts with the enum values and is followed by the generated
        row, column, section-header, subtitle-level, page and location tokens.

        Args:
            max_rows: Highest row index (inclusive) to generate tokens for.
            max_cols: Highest column index (inclusive) to generate tokens for.
            max_pages: Highest page index (inclusive) to generate tokens for.
            page_dimension: Location tokens are generated for indices 0 up to
                and including the larger of the two values.
        """
        special_tokens = [token.value for token in cls]

        # Adding dynamically generated row and col tokens
        for i in range(0, max_rows + 1):
            special_tokens += [f"<row_{i}>", f"</row_{i}>"]

        for i in range(0, max_cols + 1):
            special_tokens += [f"<col_{i}>", f"</col_{i}>"]

        for i in range(6):
            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]

        # FIXME: this is synonym of section header
        for i in range(6):
            special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]

        # Adding dynamically generated page-tokens
        for i in range(0, max_pages + 1):
            special_tokens.append(f"<page_{i}>")
            special_tokens.append(f"</page_{i}>")

        # Adding dynamically generated location-tokens (opening form only)
        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
            special_tokens.append(f"<loc_{i}>")

        return special_tokens

    @staticmethod
    def is_known_token(label):
        """Return True if `label` is in the default special-token list."""
        return label in DocumentToken.get_special_tokens()

    @staticmethod
    def get_row_token(row: int, beg: bool = True) -> str:
        """Return the opening (`beg` true) or closing row token for `row`."""
        # `beg` used to default to the `bool` type itself, which is truthy;
        # defaulting to True keeps that behaviour with a proper annotation.
        if beg:
            return f"<row_{row}>"
        else:
            return f"</row_{row}>"

    @staticmethod
    def get_col_token(col: int, beg: bool = True) -> str:
        """Return the opening (`beg` true) or closing column token for `col`."""
        # Same default as get_row_token: truthy, i.e. the opening token.
        if beg:
            return f"<col_{col}>"
        else:
            return f"</col_{col}>"

    @staticmethod
    def get_page_token(page: int):
        """Return the opening page token for `page`."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100):
        """Return the location token for `val` scaled by `rnorm`.

        `val` is multiplied by `rnorm` and rounded; results below 0 map to
        `<loc_0>` and results above `rnorm` map to `<loc_{rnorm}>`.
        """
        val_ = round(rnorm * val)

        if val_ < 0:
            return "<loc_0>"

        if val_ > rnorm:
            return f"<loc_{rnorm}>"

        return f"<loc_{val_}>"

    @staticmethod
    def get_location(
        # bbox: Tuple[float, float, float, float],
        bbox: Annotated[list[float], Field(min_length=4, max_length=4)],
        page_w: float,
        page_h: float,
        xsize: int = 100,
        ysize: int = 100,
        page_i: int = -1,
    ):
        """Get the location string given a bbox and the page dimensions.

        Args:
            bbox: Four coordinates; indices 0 and 2 are divided by `page_w`,
                indices 1 and 3 by `page_h`.
            page_w: Page width used to normalise the x coordinates.
            page_h: Page height used to normalise the y coordinates.
            xsize: Normalisation range for the x location tokens.
            ysize: Normalisation range for the y location tokens.
            page_i: Page index; a page token is included unless it is -1.

        Returns:
            The location tag wrapping the optional page token and the four
            location tokens (min x, min y, max x, max y).

        Raises:
            AssertionError: If bbox[0] > bbox[2] or bbox[1] > bbox[3].
        """
        assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
        assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"

        x0 = bbox[0] / page_w
        y0 = bbox[1] / page_h
        x1 = bbox[2] / page_w
        y1 = bbox[3] / page_h

        page_tok = ""
        if page_i != -1:
            page_tok = DocumentToken.get_page_token(page=page_i)

        x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
        y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
        x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
        y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)

        loc_str = f"{DocumentToken.BEG_LOCATION.value}"
        loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
        loc_str += f"{DocumentToken.END_LOCATION.value}"

        return loc_str
|
|
@@ -15,6 +15,7 @@ import requests
|
|
|
15
15
|
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
16
16
|
from typing_extensions import deprecated
|
|
17
17
|
|
|
18
|
+
from docling_core.types.doc.utils import relative_path # noqa
|
|
18
19
|
from docling_core.types.io import DocumentStream
|
|
19
20
|
|
|
20
21
|
|
|
@@ -168,43 +169,3 @@ def resolve_file_source(
|
|
|
168
169
|
source=source,
|
|
169
170
|
headers=headers,
|
|
170
171
|
)
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
def relative_path(src: Path, target: Path) -> Path:
|
|
174
|
-
"""Compute the relative path from `src` to `target`.
|
|
175
|
-
|
|
176
|
-
Args:
|
|
177
|
-
src (str | Path): The source directory or file path (must be absolute).
|
|
178
|
-
target (str | Path): The target directory or file path (must be absolute).
|
|
179
|
-
|
|
180
|
-
Returns:
|
|
181
|
-
Path: The relative path from `src` to `target`.
|
|
182
|
-
|
|
183
|
-
Raises:
|
|
184
|
-
ValueError: If either `src` or `target` is not an absolute path.
|
|
185
|
-
"""
|
|
186
|
-
src = Path(src).resolve()
|
|
187
|
-
target = Path(target).resolve()
|
|
188
|
-
|
|
189
|
-
# Ensure both paths are absolute
|
|
190
|
-
if not src.is_absolute():
|
|
191
|
-
raise ValueError(f"The source path must be absolute: {src}")
|
|
192
|
-
if not target.is_absolute():
|
|
193
|
-
raise ValueError(f"The target path must be absolute: {target}")
|
|
194
|
-
|
|
195
|
-
# Find the common ancestor
|
|
196
|
-
common_parts = []
|
|
197
|
-
for src_part, target_part in zip(src.parts, target.parts):
|
|
198
|
-
if src_part == target_part:
|
|
199
|
-
common_parts.append(src_part)
|
|
200
|
-
else:
|
|
201
|
-
break
|
|
202
|
-
|
|
203
|
-
# Determine the path to go up from src to the common ancestor
|
|
204
|
-
up_segments = [".."] * (len(src.parts) - len(common_parts))
|
|
205
|
-
|
|
206
|
-
# Add the path from the common ancestor to the target
|
|
207
|
-
down_segments = target.parts[len(common_parts) :]
|
|
208
|
-
|
|
209
|
-
# Combine and return the result
|
|
210
|
-
return Path(*up_segments, *down_segments)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.6.0"
|
|
3
|
+
version = "2.7.0"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -48,7 +48,7 @@ generate_docs = "docling_core.utils.generate_docs:main"
|
|
|
48
48
|
[tool.poetry.dependencies]
|
|
49
49
|
python = "^3.9"
|
|
50
50
|
jsonschema = "^4.16.0"
|
|
51
|
-
pydantic = ">=2.6.0,<2.10"
|
|
51
|
+
pydantic = ">=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2"
|
|
52
52
|
jsonref = "^1.1.0"
|
|
53
53
|
tabulate = "^0.9.0"
|
|
54
54
|
pandas = "^2.1.4"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.6.0 → docling_core-2.7.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.6.0 → docling_core-2.7.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.6.0 → docling_core-2.7.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|