docling-core 2.23.3__py3-none-any.whl → 2.24.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -546,7 +546,7 @@ class SegmentedPdfPage(SegmentedPage):
546
546
 
547
547
  def save_as_json(
548
548
  self,
549
- filename: Path,
549
+ filename: Union[str, Path],
550
550
  indent: int = 2,
551
551
  ):
552
552
  """Save the page data as a JSON file.
@@ -555,12 +555,14 @@ class SegmentedPdfPage(SegmentedPage):
555
555
  filename: Path to save the JSON file
556
556
  indent: Indentation level for JSON formatting
557
557
  """
558
+ if isinstance(filename, str):
559
+ filename = Path(filename)
558
560
  out = self.export_to_dict()
559
561
  with open(filename, "w", encoding="utf-8") as fw:
560
562
  json.dump(out, fw, indent=indent)
561
563
 
562
564
  @classmethod
563
- def load_from_json(cls, filename: Path) -> "SegmentedPdfPage":
565
+ def load_from_json(cls, filename: Union[str, Path]) -> "SegmentedPdfPage":
564
566
  """Load page data from a JSON file.
565
567
 
566
568
  Args:
@@ -569,6 +571,8 @@ class SegmentedPdfPage(SegmentedPage):
569
571
  Returns:
570
572
  Instantiated SegmentedPdfPage object
571
573
  """
574
+ if isinstance(filename, str):
575
+ filename = Path(filename)
572
576
  with open(filename, "r", encoding="utf-8") as f:
573
577
  return cls.model_validate_json(f.read())
574
578
 
@@ -1155,19 +1159,21 @@ class PdfTableOfContents(BaseModel):
1155
1159
  """
1156
1160
  return self.model_dump(mode=mode, by_alias=True, exclude_none=True)
1157
1161
 
1158
- def save_as_json(self, filename: Path, indent: int = 2):
1162
+ def save_as_json(self, filename: Union[str, Path], indent: int = 2):
1159
1163
  """Save the table of contents as a JSON file.
1160
1164
 
1161
1165
  Args:
1162
1166
  filename: Path to save the JSON file
1163
1167
  indent: Indentation level for JSON formatting
1164
1168
  """
1169
+ if isinstance(filename, str):
1170
+ filename = Path(filename)
1165
1171
  out = self.export_to_dict()
1166
1172
  with open(filename, "w", encoding="utf-8") as fw:
1167
1173
  json.dump(out, fw, indent=indent)
1168
1174
 
1169
1175
  @classmethod
1170
- def load_from_json(cls, filename: Path) -> "PdfTableOfContents":
1176
+ def load_from_json(cls, filename: Union[str, Path]) -> "PdfTableOfContents":
1171
1177
  """Load table of contents from a JSON file.
1172
1178
 
1173
1179
  Args:
@@ -1176,6 +1182,8 @@ class PdfTableOfContents(BaseModel):
1176
1182
  Returns:
1177
1183
  Instantiated PdfTableOfContents object
1178
1184
  """
1185
+ if isinstance(filename, str):
1186
+ filename = Path(filename)
1179
1187
  with open(filename, "r", encoding="utf-8") as f:
1180
1188
  return cls.model_validate_json(f.read())
1181
1189
 
@@ -1213,19 +1221,21 @@ class ParsedPdfDocument(BaseModel):
1213
1221
  """
1214
1222
  return self.model_dump(mode=mode, by_alias=True, exclude_none=True)
1215
1223
 
1216
- def save_as_json(self, filename: Path, indent: int = 2):
1224
+ def save_as_json(self, filename: Union[str, Path], indent: int = 2):
1217
1225
  """Save the document as a JSON file.
1218
1226
 
1219
1227
  Args:
1220
1228
  filename: Path to save the JSON file
1221
1229
  indent: Indentation level for JSON formatting
1222
1230
  """
1231
+ if isinstance(filename, str):
1232
+ filename = Path(filename)
1223
1233
  out = self.export_to_dict()
1224
1234
  with open(filename, "w", encoding="utf-8") as fw:
1225
1235
  json.dump(out, fw, indent=indent)
1226
1236
 
1227
1237
  @classmethod
1228
- def load_from_json(cls, filename: Path) -> "ParsedPdfDocument":
1238
+ def load_from_json(cls, filename: Union[str, Path]) -> "ParsedPdfDocument":
1229
1239
  """Load document from a JSON file.
1230
1240
 
1231
1241
  Args:
@@ -1234,5 +1244,7 @@ class ParsedPdfDocument(BaseModel):
1234
1244
  Returns:
1235
1245
  Instantiated ParsedPdfDocument object
1236
1246
  """
1247
+ if isinstance(filename, str):
1248
+ filename = Path(filename)
1237
1249
  with open(filename, "r", encoding="utf-8") as f:
1238
1250
  return cls.model_validate_json(f.read())
@@ -8,10 +8,10 @@
8
8
  from enum import Enum
9
9
  from typing import Tuple
10
10
 
11
- from docling_core.types.doc.labels import PictureClassificationLabel
11
+ from docling_core.types.doc.labels import DocItemLabel
12
12
 
13
13
 
14
- class TableToken(Enum):
14
+ class TableToken(str, Enum):
15
15
  """Class to represent an LLM friendly representation of a Table."""
16
16
 
17
17
  CELL_LABEL_COLUMN_HEADER = "<column_header>"
@@ -41,41 +41,207 @@ class TableToken(Enum):
41
41
  return label in TableToken.get_special_tokens()
42
42
 
43
43
 
44
- class DocumentToken(Enum):
44
+ _LOC_PREFIX = "loc_"
45
+ _SECTION_HEADER_PREFIX = "section_header_level_"
46
+
47
+
48
+ class _PictureClassificationToken(str, Enum):
49
+ """PictureClassificationToken."""
50
+
51
+ OTHER = "<other>"
52
+
53
+ # If more than one picture is grouped together, it
54
+ # is generally not possible to assign a label
55
+ PICTURE_GROUP = "<picture_group>"
56
+
57
+ # General
58
+ PIE_CHART = "<pie_chart>"
59
+ BAR_CHART = "<bar_chart>"
60
+ LINE_CHART = "<line_chart>"
61
+ FLOW_CHART = "<flow_chart>"
62
+ SCATTER_CHART = "<scatter_chart>"
63
+ HEATMAP = "<heatmap>"
64
+ REMOTE_SENSING = "<remote_sensing>"
65
+
66
+ NATURAL_IMAGE = "<natural_image>"
67
+
68
+ # Chemistry
69
+ MOLECULAR_STRUCTURE = "<chemistry_molecular_structure>"
70
+ MARKUSH_STRUCTURE = "<chemistry_markush_structure>"
71
+
72
+ # Company
73
+ ICON = "<icon>"
74
+ LOGO = "<logo>"
75
+ SIGNATURE = "<signature>"
76
+ STAMP = "<stamp>"
77
+ QR_CODE = "<qr_code>"
78
+ BAR_CODE = "<bar_code>"
79
+ SCREENSHOT = "<screenshot>"
80
+
81
+ # Geology/Geography
82
+ GEOGRAPHIC_MAP = "<map>"
83
+ STRATIGRAPHIC_CHART = "<stratigraphic_chart>"
84
+
85
+ # Engineering
86
+ CAD_DRAWING = "<cad_drawing>"
87
+ ELECTRICAL_DIAGRAM = "<electrical_diagram>"
88
+
89
+
90
+ class _CodeLanguageToken(str, Enum):
91
+ """CodeLanguageToken."""
92
+
93
+ ADA = "<_Ada_>"
94
+ AWK = "<_Awk_>"
95
+ BASH = "<_Bash_>"
96
+ BC = "<_bc_>"
97
+ C = "<_C_>"
98
+ C_SHARP = "<_C#_>"
99
+ C_PLUS_PLUS = "<_C++_>"
100
+ CMAKE = "<_CMake_>"
101
+ COBOL = "<_COBOL_>"
102
+ CSS = "<_CSS_>"
103
+ CEYLON = "<_Ceylon_>"
104
+ CLOJURE = "<_Clojure_>"
105
+ CRYSTAL = "<_Crystal_>"
106
+ CUDA = "<_Cuda_>"
107
+ CYTHON = "<_Cython_>"
108
+ D = "<_D_>"
109
+ DART = "<_Dart_>"
110
+ DC = "<_dc_>"
111
+ DOCKERFILE = "<_Dockerfile_>"
112
+ ELIXIR = "<_Elixir_>"
113
+ ERLANG = "<_Erlang_>"
114
+ FORTRAN = "<_FORTRAN_>"
115
+ FORTH = "<_Forth_>"
116
+ GO = "<_Go_>"
117
+ HTML = "<_HTML_>"
118
+ HASKELL = "<_Haskell_>"
119
+ HAXE = "<_Haxe_>"
120
+ JAVA = "<_Java_>"
121
+ JAVASCRIPT = "<_JavaScript_>"
122
+ JULIA = "<_Julia_>"
123
+ KOTLIN = "<_Kotlin_>"
124
+ LISP = "<_Lisp_>"
125
+ LUA = "<_Lua_>"
126
+ MATLAB = "<_Matlab_>"
127
+ MOONSCRIPT = "<_MoonScript_>"
128
+ NIM = "<_Nim_>"
129
+ OCAML = "<_OCaml_>"
130
+ OBJECTIVEC = "<_ObjectiveC_>"
131
+ OCTAVE = "<_Octave_>"
132
+ PHP = "<_PHP_>"
133
+ PASCAL = "<_Pascal_>"
134
+ PERL = "<_Perl_>"
135
+ PROLOG = "<_Prolog_>"
136
+ PYTHON = "<_Python_>"
137
+ RACKET = "<_Racket_>"
138
+ RUBY = "<_Ruby_>"
139
+ RUST = "<_Rust_>"
140
+ SML = "<_SML_>"
141
+ SQL = "<_SQL_>"
142
+ SCALA = "<_Scala_>"
143
+ SCHEME = "<_Scheme_>"
144
+ SWIFT = "<_Swift_>"
145
+ TYPESCRIPT = "<_TypeScript_>"
146
+ UNKNOWN = "<_unknown_>"
147
+ VISUALBASIC = "<_VisualBasic_>"
148
+ XML = "<_XML_>"
149
+ YAML = "<_YAML_>"
150
+
151
+
152
+ class DocumentToken(str, Enum):
45
153
  """Class to represent an LLM friendly representation of a Document."""
46
154
 
47
155
  DOCUMENT = "doctag"
48
156
  OTSL = "otsl"
49
157
  ORDERED_LIST = "ordered_list"
50
158
  UNORDERED_LIST = "unordered_list"
51
- LOC = "loc_"
52
159
  PAGE_BREAK = "page_break"
53
160
  SMILES = "smiles"
161
+ INLINE = "inline"
162
+
163
+ CAPTION = "caption"
164
+ FOOTNOTE = "footnote"
165
+ FORMULA = "formula"
166
+ LIST_ITEM = "list_item"
167
+ PAGE_FOOTER = "page_footer"
168
+ PAGE_HEADER = "page_header"
169
+ PICTURE = "picture"
170
+ TABLE = "table"
171
+ TEXT = "text"
172
+ TITLE = "title"
173
+ DOCUMENT_INDEX = "document_index"
174
+ CODE = "code"
175
+ CHECKBOX_SELECTED = "checkbox_selected"
176
+ CHECKBOX_UNSELECTED = "checkbox_unselected"
177
+ FORM = "form"
178
+ KEY_VALUE_REGION = "key_value_region"
179
+
180
+ PARAGRAPH = "paragraph"
181
+ REFERENCE = "reference"
54
182
 
55
183
  @classmethod
56
184
  def get_special_tokens(
57
185
  cls,
58
- page_dimension: Tuple[int, int] = (100, 100),
186
+ page_dimension: Tuple[int, int] = (500, 500),
59
187
  ):
60
188
  """Function to get all special document tokens."""
61
- special_tokens = [token.value for token in cls]
189
+ special_tokens: list[str] = []
190
+ for token in cls:
191
+ special_tokens.append(f"<{token.value}>")
192
+ special_tokens.append(f"</{token.value}>")
62
193
 
63
194
  for i in range(6):
64
195
  special_tokens += [
65
- f"<section_header_level_{i}>",
66
- f"</section_header_level_{i}>",
196
+ f"<{_SECTION_HEADER_PREFIX}{i}>",
197
+ f"</{_SECTION_HEADER_PREFIX}{i}>",
67
198
  ]
68
199
 
69
- # Add dynamically picture classification tokens
70
- for _, member in PictureClassificationLabel.__members__.items():
71
- special_tokens.append(f"<{member}>")
200
+ special_tokens.extend([t.value for t in _PictureClassificationToken])
201
+ special_tokens.extend([t.value for t in _CodeLanguageToken])
202
+
203
+ special_tokens.extend(TableToken.get_special_tokens())
72
204
 
73
205
  # Adding dynamically generated location-tokens
74
- for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
75
- special_tokens.append(f"<loc_{i}>")
206
+ for i in range(0, max(page_dimension[0], page_dimension[1])):
207
+ special_tokens.append(f"<{_LOC_PREFIX}{i}>")
76
208
 
77
209
  return special_tokens
78
210
 
211
+ @classmethod
212
+ def create_token_name_from_doc_item_label(cls, label: str, level: int = 1) -> str:
213
+ """Get token corresponding to passed doc item label."""
214
+ doc_token_by_item_label = {
215
+ DocItemLabel.CAPTION: DocumentToken.CAPTION,
216
+ DocItemLabel.FOOTNOTE: DocumentToken.FOOTNOTE,
217
+ DocItemLabel.FORMULA: DocumentToken.FORMULA,
218
+ DocItemLabel.LIST_ITEM: DocumentToken.LIST_ITEM,
219
+ DocItemLabel.PAGE_FOOTER: DocumentToken.PAGE_FOOTER,
220
+ DocItemLabel.PAGE_HEADER: DocumentToken.PAGE_HEADER,
221
+ DocItemLabel.PICTURE: DocumentToken.PICTURE,
222
+ DocItemLabel.TABLE: DocumentToken.TABLE,
223
+ DocItemLabel.TEXT: DocumentToken.TEXT,
224
+ DocItemLabel.TITLE: DocumentToken.TITLE,
225
+ DocItemLabel.DOCUMENT_INDEX: DocumentToken.DOCUMENT_INDEX,
226
+ DocItemLabel.CODE: DocumentToken.CODE,
227
+ DocItemLabel.CHECKBOX_SELECTED: DocumentToken.CHECKBOX_SELECTED,
228
+ DocItemLabel.CHECKBOX_UNSELECTED: DocumentToken.CHECKBOX_UNSELECTED,
229
+ DocItemLabel.FORM: DocumentToken.FORM,
230
+ DocItemLabel.KEY_VALUE_REGION: DocumentToken.KEY_VALUE_REGION,
231
+ DocItemLabel.PARAGRAPH: DocumentToken.PARAGRAPH,
232
+ DocItemLabel.REFERENCE: DocumentToken.REFERENCE,
233
+ }
234
+
235
+ res: str
236
+ if label == DocItemLabel.SECTION_HEADER:
237
+ res = f"{_SECTION_HEADER_PREFIX}{level}"
238
+ else:
239
+ try:
240
+ res = doc_token_by_item_label[DocItemLabel(label)].value
241
+ except KeyError as e:
242
+ raise RuntimeError(f"Unexpected DocItemLabel: {label}") from e
243
+ return res
244
+
79
245
  @staticmethod
80
246
  def is_known_token(label):
81
247
  """Function to check if label is in tokens."""
@@ -83,29 +249,29 @@ class DocumentToken(Enum):
83
249
 
84
250
  @staticmethod
85
251
  def get_picture_classification_token(classification: str) -> str:
86
- """Function to get picture classification tokens."""
87
- return f"<{classification}>"
252
+ """Function to get the token for a given picture classification value."""
253
+ return _PictureClassificationToken(f"<{classification}>").value
254
+
255
+ @staticmethod
256
+ def get_code_language_token(code_language: str) -> str:
257
+ """Function to get the token for a given code language."""
258
+ return _CodeLanguageToken(f"<_{code_language}_>").value
88
259
 
89
260
  @staticmethod
90
- def get_location_token(val: float, rnorm: int = 100):
261
+ def get_location_token(val: float, rnorm: int = 500): # TODO review
91
262
  """Function to get location tokens."""
92
263
  val_ = round(rnorm * val)
93
-
94
- if val_ < 0:
95
- return "<loc_0>"
96
-
97
- if val_ > rnorm:
98
- return f"<loc_{rnorm}>"
99
-
100
- return f"<loc_{val_}>"
264
+ val_ = max(val_, 0)
265
+ val_ = min(val_, rnorm - 1)
266
+ return f"<{_LOC_PREFIX}{val_}>"
101
267
 
102
268
  @staticmethod
103
269
  def get_location(
104
270
  bbox: tuple[float, float, float, float],
105
271
  page_w: float,
106
272
  page_h: float,
107
- xsize: int = 100,
108
- ysize: int = 100,
273
+ xsize: int = 500, # TODO review
274
+ ysize: int = 500, # TODO review
109
275
  ):
110
276
  """Get the location string give bbox and page-dim."""
111
277
  assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.23.3
3
+ Version: 2.24.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -3,9 +3,10 @@ docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,
3
3
  docling_core/cli/view.py,sha256=gwxSBYhGqwznMR8pdXaEuAh2bjFD5X_g11xFYSgFgtM,1764
4
4
  docling_core/experimental/__init__.py,sha256=XnAVSUHbA6OFhNSpoYqSD3u83-xVaUaki1DIKFw69Ew,99
5
5
  docling_core/experimental/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
6
- docling_core/experimental/serializer/base.py,sha256=3rMQajYerAMMBJpW7dzzmRvGJ9LTdjpu0ucrK75KTVY,5142
7
- docling_core/experimental/serializer/common.py,sha256=AbIYG2Dh5C2KtAKaqLHfffOFlKa4MYNNxqVjO1rQx8o,11615
8
- docling_core/experimental/serializer/markdown.py,sha256=J0enJuW7oGVHs038CSME5KBSaFylrCCh1rdpS6EIfzc,14764
6
+ docling_core/experimental/serializer/base.py,sha256=avNYy8Lgv45Gm0jfO1OV4wSRsv-O9Eeow2PkUAPY1pA,5152
7
+ docling_core/experimental/serializer/common.py,sha256=g_o-wSQONXIZM7YJF_ghlwc3W3_VkePpM6pDS4ZjrhI,13701
8
+ docling_core/experimental/serializer/doctags.py,sha256=bNUd5vOj1JnvIYFfSc_TSzQKQ7eQ34TY7NAUNK3C604,15953
9
+ docling_core/experimental/serializer/markdown.py,sha256=oEzuPXiooJPVL7yTbXPPFhWF8Phstmzm3mev3yqcqbo,15950
9
10
  docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
11
  docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
11
12
  docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686
@@ -29,10 +30,10 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
29
30
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
30
31
  docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
31
32
  docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
32
- docling_core/types/doc/document.py,sha256=j3v1hL2O6_DzN9n8Ak0Ho46sRhElqmRXU_Gd4zqThLA,128422
33
+ docling_core/types/doc/document.py,sha256=_FJtmp0yh6F_3AVLVN4Xpo7E1hz50gvS_-HrJmp8FOA,128806
33
34
  docling_core/types/doc/labels.py,sha256=0J9Gsqz-jQ4FP2yxs9wOxoTr3qg97BniFX7MJVziUmk,5684
34
- docling_core/types/doc/page.py,sha256=8A9sM-6mNad_JzaoaIXlfsBoo6zbw29uk7fp6j24omg,39461
35
- docling_core/types/doc/tokens.py,sha256=Z2FuzHWinYQzWZdTvOBsEQACAKPcBiSf777w5S9NJms,3947
35
+ docling_core/types/doc/page.py,sha256=qCXp_s0cY3N1WWkICv6fjH52OVYYbjYiqRQit86FxG4,39989
36
+ docling_core/types/doc/tokens.py,sha256=fpPtVHfO5RXk8mkqZ7YrW5LyHipg697kbFBNqn6jXQU,9159
36
37
  docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
37
38
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
38
39
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -62,8 +63,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
62
63
  docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
63
64
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
64
65
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
65
- docling_core-2.23.3.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
66
- docling_core-2.23.3.dist-info/METADATA,sha256=JSY_qNdtZqYS_9pflWQncaxDlisQdzq_DtTiaCTfcWY,5843
67
- docling_core-2.23.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
68
- docling_core-2.23.3.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
69
- docling_core-2.23.3.dist-info/RECORD,,
66
+ docling_core-2.24.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
67
+ docling_core-2.24.0.dist-info/METADATA,sha256=ycw0ioISQ7Uv0rL9_RU5zpsimerhh35wfKv0bul1e9g,5843
68
+ docling_core-2.24.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
69
+ docling_core-2.24.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
70
+ docling_core-2.24.0.dist-info/RECORD,,