docling-core 2.21.1__py3-none-any.whl → 2.22.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of docling-core was flagged as potentially problematic.

docling_core/transforms/chunker/hybrid_chunker.py

@@ -73,7 +73,7 @@ class HybridChunker(BaseChunker):
             for t in text:
                 total += self._count_text_tokens(t)
             return total
-        return len(self._tokenizer.tokenize(text, max_length=None))
+        return len(self._tokenizer.tokenize(text))
 
     class _ChunkLengthInfo(BaseModel):
         total_len: int
@@ -82,7 +82,7 @@ class HybridChunker(BaseChunker):
 
     def _count_chunk_tokens(self, doc_chunk: DocChunk):
         ser_txt = self.serialize(chunk=doc_chunk)
-        return len(self._tokenizer.tokenize(text=ser_txt, max_length=None))
+        return len(self._tokenizer.tokenize(text=ser_txt))
 
     def _doc_chunk_length(self, doc_chunk: DocChunk):
         text_length = self._count_text_tokens(doc_chunk.text)
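Both hunks drop the max_length=None keyword from the tokenize() calls; for Hugging Face tokenizers, tokenize() does not truncate by default, so the counts are unchanged while the spurious keyword argument disappears. A minimal sketch of the counting pattern, assuming a transformers tokenizer (the model name is illustrative):

```python
# Minimal sketch of the token-counting pattern above, assuming a
# Hugging Face transformers tokenizer; the model name is illustrative.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def count_tokens(text) -> int:
    # Mirrors HybridChunker._count_text_tokens: recurse over lists,
    # otherwise count the tokens of a single string.
    if text is None:
        return 0
    if isinstance(text, list):
        return sum(count_tokens(t) for t in text)
    # tokenize() does not truncate, so no max_length argument is needed
    return len(tokenizer.tokenize(text))

print(count_tokens(["Docling converts documents.", "DocTags is its markup."]))
```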
docling_core/types/doc/document.py

@@ -4,6 +4,7 @@ import base64
 import copy
 import hashlib
 import html
+import itertools
 import json
 import logging
 import mimetypes
@@ -37,7 +38,7 @@ from pydantic import (
     model_validator,
 )
 from tabulate import tabulate
-from typing_extensions import Annotated, Self
+from typing_extensions import Annotated, Self, deprecated
 
 from docling_core.search.package import VERSION_PATTERN
 from docling_core.types.base import _JSON_POINTER_REGEX
@@ -522,6 +523,49 @@ class ImageRef(BaseModel):
         )
 
 
+class DocTagsPage(BaseModel):
+    """DocTagsPage."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    tokens: str
+    image: Optional[PILImage.Image] = None
+
+
+class DocTagsDocument(BaseModel):
+    """DocTagsDocument."""
+
+    pages: List[DocTagsPage] = []
+
+    @classmethod
+    def from_doctags_and_image_pairs(
+        cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
+    ):
+        """from_doctags_and_image_pairs."""
+        if len(doctags) != len(images):
+            raise ValueError("Number of page doctags must be equal to page images!")
+        doctags_doc = cls()
+
+        pages = []
+        for dt, img in zip(doctags, images):
+            if isinstance(dt, Path):
+                with dt.open("r") as fp:
+                    dt = fp.read()
+            elif isinstance(dt, str):
+                pass
+
+            if isinstance(img, Path):
+                img = PILImage.open(img)
+            elif isinstance(dt, PILImage.Image):
+                pass
+
+            page = DocTagsPage(tokens=dt, image=img)
+            pages.append(page)
+
+        doctags_doc.pages = pages
+        return doctags_doc
+
+
 class ProvenanceItem(BaseModel):
     """ProvenanceItem."""
 
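The new containers pair per-page DocTags markup with an optional page image. A hypothetical usage sketch; the file names are placeholders and not part of the package:

```python
# Hypothetical usage sketch for the new container classes; the file
# names are placeholders, not part of the package.
from pathlib import Path

from docling_core.types.doc.document import DocTagsDocument

doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
    doctags=[Path("page_1.dt"), Path("page_2.dt")],   # Path or str per page
    images=[Path("page_1.png"), Path("page_2.png")],  # Path or PIL image per page
)
print(len(doctags_doc.pages))  # 2
```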
@@ -800,7 +844,7 @@ class CodeItem(FloatingItem, TextItem):
         :param add_content: bool: (Default value = True)
 
         """
-        body = f"<{self.label.value}{new_line}"
+        body = f"<{self.label.value}>{new_line}"
 
         if add_location:
             body += self.get_location_tokens(
@@ -813,7 +857,7 @@ class CodeItem(FloatingItem, TextItem):
         if add_content and self.text is not None:
             body += f"<_{self.code_language.value}_>{self.text}{new_line}"
 
-        body += f"</{self.label.value}\n"
+        body += f"</{self.label.value}>\n"
 
         return body
 
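Both fixes add a missing closing angle bracket, so a code item now serializes with well-formed opening and closing tags. A self-contained illustration of the one-character change:

```python
# Self-contained illustration of the one-character fixes; label and
# new_line mirror the values used in CodeItem.export_to_document_tokens.
label, new_line = "code", "\n"

old_open, old_close = f"<{label}{new_line}", f"</{label}\n"    # "<code\n", "</code\n"
new_open, new_close = f"<{label}>{new_line}", f"</{label}>\n"  # "<code>\n", "</code>\n"

assert new_open + new_close == "<code>\n</code>\n"
```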
@@ -1003,6 +1047,20 @@ class PictureItem(FloatingItem):
             predicted_class = classifications[0].predicted_classes[0].class_name
             body += DocumentToken.get_picture_classification_token(predicted_class)
 
+        smiles_annotations = [
+            ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
+        ]
+        if len(smiles_annotations) > 0:
+            body += (
+                "<"
+                + DocumentToken.SMILES.value
+                + ">"
+                + smiles_annotations[0].smi
+                + "</"
+                + DocumentToken.SMILES.value
+                + ">"
+            )
+
         if add_caption and len(self.captions):
             text = self.caption_text(doc)
 
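Together with the new DocumentToken.SMILES member (see the tokens.py hunk below), a picture carrying a PictureMoleculeData annotation now serializes its SMILES string. A sketch of the emitted markup; the SMILES string (caffeine) is illustrative:

```python
# Sketch of the markup emitted for a molecule annotation; the SMILES
# string (caffeine) is illustrative.
smiles_token = "smiles"  # DocumentToken.SMILES.value
smi = "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"
body = "<" + smiles_token + ">" + smi + "</" + smiles_token + ">"
print(body)  # <smiles>CN1C=NC2=C1C(=O)N(C(=O)N2C)C</smiles>
```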
@@ -2487,7 +2545,6 @@ class DoclingDocument(BaseModel):
                     is_inline_scope=is_inline_scope,
                     visited=visited,
                 )
-                # NOTE: assumes unordered (flag & marker currently in ListItem)
                 indent_str = list_level * indent * " "
                 is_ol = item.label == GroupLabel.ORDERED_LIST
                 text = "\n".join(
@@ -2501,7 +2558,12 @@ class DoclingDocument(BaseModel):
                         for i, c in enumerate(comps)
                     ]
                 )
-                _ingest_text(text=text)
+                _ingest_text(
+                    text=text,
+                    # special chars have already been escaped as needed
+                    do_escape_html=False,
+                    do_escape_underscores=False,
+                )
             elif item.label == GroupLabel.INLINE:
                 comps = self._get_markdown_components(
                     node=item,
@@ -2520,7 +2582,13 @@ class DoclingDocument(BaseModel):
                     is_inline_scope=True,
                    visited=visited,
                 )
-                _ingest_text(" ".join(comps))
+                text = " ".join(comps)
+                _ingest_text(
+                    text=text,
+                    # special chars have already been escaped as needed
+                    do_escape_html=False,
+                    do_escape_underscores=False,
+                )
             else:
                 continue
 
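Both call sites now hand pre-escaped text to _ingest_text with escaping disabled; without the flags, the already-escaped list and inline components would be escaped a second time. A self-contained illustration of the double-escaping this avoids (plain html.escape, not package code):

```python
# Why escaping must happen exactly once: escaping already-escaped
# text corrupts it (self-contained illustration, not package code).
import html

text = "a < b & c"
once = html.escape(text)
twice = html.escape(once)
print(once)   # a &lt; b &amp; c
print(twice)  # a &amp;lt; b &amp;amp; c   <- double-escaped
```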
@@ -2838,7 +2906,7 @@ class DoclingDocument(BaseModel):
 
                 # Building a math equation in MathML format
                 # ref https://www.w3.org/TR/wai-aria-1.1/#math
-                elif formula_to_mathml:
+                elif formula_to_mathml and len(math_formula) > 0:
                     try:
                         mathml_element = latex2mathml.converter.convert_to_element(
                             math_formula, display="block"
@@ -2860,7 +2928,7 @@ class DoclingDocument(BaseModel):
                     and img_fallback is not None
                 ):
                     text = img_fallback
-                elif len(math_formula) > 0:
+                else:
                     text = f"<pre>{math_formula}</pre>"
 
             elif math_formula != "":
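The reordered branches keep an empty math_formula away from the MathML converter: only a non-empty formula reaches latex2mathml, and the remaining cases fall through to the <pre> fallback. A reduced, self-contained sketch of the new control flow, with the converter stubbed out:

```python
# Reduced, self-contained sketch of the new branch order; the converter
# call is a stub standing in for latex2mathml.
def to_mathml_stub(formula: str) -> str:
    return f"<math>{formula}</math>"

def render_formula(math_formula: str, formula_to_mathml: bool = True) -> str:
    # Empty formulas now skip the converter and fall back to <pre>
    if formula_to_mathml and len(math_formula) > 0:
        return to_mathml_stub(math_formula)
    return f"<pre>{math_formula}</pre>"

print(render_formula("E = mc^2"))  # <math>E = mc^2</math>
print(render_formula(""))          # <pre></pre>
```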
@@ -2926,7 +2994,378 @@ class DoclingDocument(BaseModel):
 
         return html_text
 
-    def save_as_document_tokens(
+    def load_from_doctags(  # noqa: C901
+        self,
+        doctag_document: DocTagsDocument,
+    ) -> "DoclingDocument":
+        r"""Load Docling document from lists of DocTags and Images."""
+        # Maps the recognized tag to a Docling label.
+        # Code items will be given DocItemLabel.CODE
+        tag_to_doclabel = {
+            "title": DocItemLabel.TITLE,
+            "document_index": DocItemLabel.DOCUMENT_INDEX,
+            "otsl": DocItemLabel.TABLE,
+            "section_header_level_1": DocItemLabel.SECTION_HEADER,
+            "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
+            "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
+            "text": DocItemLabel.TEXT,
+            "page_header": DocItemLabel.PAGE_HEADER,
+            "page_footer": DocItemLabel.PAGE_FOOTER,
+            "formula": DocItemLabel.FORMULA,
+            "caption": DocItemLabel.CAPTION,
+            "picture": DocItemLabel.PICTURE,
+            "list_item": DocItemLabel.LIST_ITEM,
+            "footnote": DocItemLabel.FOOTNOTE,
+            "code": DocItemLabel.CODE,
+        }
+
+        def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
+            """Extract <loc_...> coords from the chunk, normalized by / 500."""
+            coords = re.findall(r"<loc_(\d+)>", text_chunk)
+            if len(coords) == 4:
+                l, t, r, b = map(float, coords)
+                return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
+            return None
+
+        def extract_inner_text(text_chunk: str) -> str:
+            """Strip all <...> tags inside the chunk to get the raw text content."""
+            return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
+
+        def otsl_parse_texts(texts, tokens):
+            split_word = TableToken.OTSL_NL.value
+            split_row_tokens = [
+                list(y)
+                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
+                if not x
+            ]
+            table_cells = []
+            r_idx = 0
+            c_idx = 0
+
+            def count_right(tokens, c_idx, r_idx, which_tokens):
+                span = 0
+                c_idx_iter = c_idx
+                while tokens[r_idx][c_idx_iter] in which_tokens:
+                    c_idx_iter += 1
+                    span += 1
+                    if c_idx_iter >= len(tokens[r_idx]):
+                        return span
+                return span
+
+            def count_down(tokens, c_idx, r_idx, which_tokens):
+                span = 0
+                r_idx_iter = r_idx
+                while tokens[r_idx_iter][c_idx] in which_tokens:
+                    r_idx_iter += 1
+                    span += 1
+                    if r_idx_iter >= len(tokens):
+                        return span
+                return span
+
+            for i, text in enumerate(texts):
+                cell_text = ""
+                if text in [
+                    TableToken.OTSL_FCEL.value,
+                    TableToken.OTSL_ECEL.value,
+                    TableToken.OTSL_CHED.value,
+                    TableToken.OTSL_RHED.value,
+                    TableToken.OTSL_SROW.value,
+                ]:
+                    row_span = 1
+                    col_span = 1
+                    right_offset = 1
+                    if text != TableToken.OTSL_ECEL.value:
+                        cell_text = texts[i + 1]
+                        right_offset = 2
+
+                    # Check next element(s) for lcel / ucel / xcel,
+                    # set properly row_span, col_span
+                    next_right_cell = ""
+                    if i + right_offset < len(texts):
+                        next_right_cell = texts[i + right_offset]
+
+                    next_bottom_cell = ""
+                    if r_idx + 1 < len(split_row_tokens):
+                        if c_idx < len(split_row_tokens[r_idx + 1]):
+                            next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
+
+                    if next_right_cell in [
+                        TableToken.OTSL_LCEL.value,
+                        TableToken.OTSL_XCEL.value,
+                    ]:
+                        # we have a horizontal spanning cell or 2d spanning cell
+                        col_span += count_right(
+                            split_row_tokens,
+                            c_idx + 1,
+                            r_idx,
+                            [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
+                        )
+                    if next_bottom_cell in [
+                        TableToken.OTSL_UCEL.value,
+                        TableToken.OTSL_XCEL.value,
+                    ]:
+                        # we have a vertical spanning cell or 2d spanning cell
+                        row_span += count_down(
+                            split_row_tokens,
+                            c_idx,
+                            r_idx + 1,
+                            [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
+                        )
+
+                    table_cells.append(
+                        TableCell(
+                            text=cell_text.strip(),
+                            row_span=row_span,
+                            col_span=col_span,
+                            start_row_offset_idx=r_idx,
+                            end_row_offset_idx=r_idx + row_span,
+                            start_col_offset_idx=c_idx,
+                            end_col_offset_idx=c_idx + col_span,
+                        )
+                    )
+                if text in [
+                    TableToken.OTSL_FCEL.value,
+                    TableToken.OTSL_ECEL.value,
+                    TableToken.OTSL_CHED.value,
+                    TableToken.OTSL_RHED.value,
+                    TableToken.OTSL_SROW.value,
+                    TableToken.OTSL_LCEL.value,
+                    TableToken.OTSL_UCEL.value,
+                    TableToken.OTSL_XCEL.value,
+                ]:
+                    c_idx += 1
+                if text == TableToken.OTSL_NL.value:
+                    r_idx += 1
+                    c_idx = 0
+            return table_cells, split_row_tokens
+
+        def otsl_extract_tokens_and_text(s: str):
+            # Pattern to match anything enclosed by < >
+            # (including the angle brackets themselves)
+            pattern = r"(<[^>]+>)"
+            # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
+            tokens = re.findall(pattern, s)
+            # Remove any tokens that start with "<loc_"
+            tokens = [
+                token
+                for token in tokens
+                if not (
+                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    or token
+                    in [
+                        rf"<{DocumentToken.OTSL.value}>",
+                        rf"</{DocumentToken.OTSL.value}>",
+                    ]
+                )
+            ]
+            # Split the string by those tokens to get the in-between text
+            text_parts = re.split(pattern, s)
+            text_parts = [
+                token
+                for token in text_parts
+                if not (
+                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    or token
+                    in [
+                        rf"<{DocumentToken.OTSL.value}>",
+                        rf"</{DocumentToken.OTSL.value}>",
+                    ]
+                )
+            ]
+            # Remove any empty or purely whitespace strings from text_parts
+            text_parts = [part for part in text_parts if part.strip()]
+
+            return tokens, text_parts
+
+        def parse_table_content(otsl_content: str) -> TableData:
+            tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
+            table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
+
+            return TableData(
+                num_rows=len(split_row_tokens),
+                num_cols=(
+                    max(len(row) for row in split_row_tokens) if split_row_tokens else 0
+                ),
+                table_cells=table_cells,
+            )
+
+        # doc = DoclingDocument(name="Document")
+        for pg_idx, doctag_page in enumerate(doctag_document.pages):
+            page_doctags = doctag_page.tokens
+            image = doctag_page.image
+
+            page_no = pg_idx + 1
+            # bounding_boxes = []
+
+            if image is not None:
+                pg_width = image.width
+                pg_height = image.height
+            else:
+                pg_width = 1
+                pg_height = 1
+
+            """
+            1. Finds all <tag>...</tag>
+               blocks in the entire string (multi-line friendly)
+               in the order they appear.
+            2. For each chunk, extracts bounding box (if any) and inner text.
+            3. Adds the item to a DoclingDocument structure with the right label.
+            4. Tracks bounding boxes+color in a separate list for later visualization.
+            """
+
+            # Regex for root level recognized tags
+            tag_pattern = (
+                rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
+                rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
+                rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
+                rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
+                rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
+                rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
+                rf"{DocItemLabel.SECTION_HEADER}_level_1|"
+                rf"{DocumentToken.ORDERED_LIST.value}|"
+                rf"{DocumentToken.UNORDERED_LIST.value}|"
+                rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
+            )
+
+            # DocumentToken.OTSL
+            pattern = re.compile(tag_pattern, re.DOTALL)
+
+            # Go through each match in order
+            for match in pattern.finditer(page_doctags):
+                full_chunk = match.group(0)
+                tag_name = match.group("tag")
+
+                bbox = extract_bounding_box(full_chunk) if image else None
+                doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
+
+                if tag_name == DocumentToken.OTSL.value:
+                    table_data = parse_table_content(full_chunk)
+                    bbox = extract_bounding_box(full_chunk) if image else None
+
+                    if bbox:
+                        prov = ProvenanceItem(
+                            bbox=bbox.resize_by_scale(pg_width, pg_height),
+                            charspan=(0, 0),
+                            page_no=page_no,
+                        )
+                        self.add_table(data=table_data, prov=prov)
+                    else:
+                        self.add_table(data=table_data)
+
+                elif tag_name == DocItemLabel.PICTURE:
+                    text_caption_content = extract_inner_text(full_chunk)
+                    if image:
+                        if bbox:
+                            im_width, im_height = image.size
+
+                            crop_box = (
+                                int(bbox.l * im_width),
+                                int(bbox.t * im_height),
+                                int(bbox.r * im_width),
+                                int(bbox.b * im_height),
+                            )
+                            cropped_image = image.crop(crop_box)
+                            pic = self.add_picture(
+                                parent=None,
+                                image=ImageRef.from_pil(image=cropped_image, dpi=72),
+                                prov=(
+                                    ProvenanceItem(
+                                        bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                        charspan=(0, 0),
+                                        page_no=page_no,
+                                    )
+                                ),
+                            )
+                            # If there is a caption to an image, add it as well
+                            if len(text_caption_content) > 0:
+                                caption_item = self.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
+                    else:
+                        if bbox:
+                            # In case we don't have access to a binary of the image
+                            self.add_picture(
+                                parent=None,
+                                prov=ProvenanceItem(
+                                    bbox=bbox, charspan=(0, 0), page_no=page_no
+                                ),
+                            )
+                            # If there is a caption to an image, add it as well
+                            if len(text_caption_content) > 0:
+                                caption_item = self.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
+                elif tag_name in [
+                    DocumentToken.ORDERED_LIST.value,
+                    DocumentToken.UNORDERED_LIST.value,
+                ]:
+                    list_label = GroupLabel.LIST
+                    enum_marker = ""
+                    enum_value = 0
+                    if tag_name == DocumentToken.ORDERED_LIST.value:
+                        list_label = GroupLabel.ORDERED_LIST
+
+                    list_item_pattern = (
+                        rf"<(?P<tag>{DocItemLabel.LIST_ITEM})>.*?</(?P=tag)>"
+                    )
+                    li_pattern = re.compile(list_item_pattern, re.DOTALL)
+                    # Add list group:
+                    new_list = self.add_group(label=list_label, name="list")
+                    # Process list items
+                    for li_match in li_pattern.finditer(full_chunk):
+                        enum_value += 1
+                        if tag_name == DocumentToken.ORDERED_LIST.value:
+                            enum_marker = str(enum_value) + "."
+
+                        li_full_chunk = li_match.group(0)
+                        li_bbox = extract_bounding_box(li_full_chunk) if image else None
+                        text_content = extract_inner_text(li_full_chunk)
+                        # Add list item
+                        self.add_list_item(
+                            marker=enum_marker,
+                            enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
+                            parent=new_list,
+                            text=text_content,
+                            prov=(
+                                ProvenanceItem(
+                                    bbox=li_bbox.resize_by_scale(pg_width, pg_height),
+                                    charspan=(0, len(text_content)),
+                                    page_no=page_no,
+                                )
+                                if li_bbox
+                                else None
+                            ),
+                        )
+                else:
+                    # For everything else, treat as text
+                    text_content = extract_inner_text(full_chunk)
+                    self.add_text(
+                        label=doc_label,
+                        text=text_content,
+                        prov=(
+                            ProvenanceItem(
+                                bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                charspan=(0, len(text_content)),
+                                page_no=page_no,
+                            )
+                            if bbox
+                            else None
+                        ),
+                    )
+        return self
+
+    @deprecated("Use save_as_doctags instead.")
+    def save_as_document_tokens(self, *args, **kwargs):
+        r"""Save the document content to a DocumentToken format."""
+        return self.save_as_doctags(*args, **kwargs)
+
+    def save_as_doctags(
         self,
         filename: Path,
         delim: str = "",
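load_from_doctags is the inverse of the DocTags export: it parses per-page DocTags markup (tables, pictures, lists, plain text) back into a populated DoclingDocument, while save_as_document_tokens survives only as a deprecated alias for save_as_doctags. A hypothetical end-to-end sketch; the DocTags snippet and output file name are illustrative:

```python
# Hypothetical end-to-end sketch; the DocTags snippet and output file
# name are illustrative, not part of the package.
from pathlib import Path

from docling_core.types.doc.document import DoclingDocument, DocTagsDocument

tokens = "<title><loc_100><loc_50><loc_400><loc_80>Sample Title</title>"
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
    doctags=[tokens], images=[None]  # no page image; DocTagsPage.image is optional
)

doc = DoclingDocument(name="Document")
doc.load_from_doctags(doctags_doc)
print(doc.export_to_markdown())  # "# Sample Title"

doc.save_as_document_tokens(Path("out.dt"))  # warns; delegates to save_as_doctags
```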
@@ -2942,7 +3381,7 @@ class DoclingDocument(BaseModel):
         add_table_cell_location: bool = False,
         add_table_cell_text: bool = True,
     ):
-        r"""Save the document content to a DocumentToken format."""
+        r"""Save the document content to DocTags format."""
         out = self.export_to_document_tokens(
             delim=delim,
             from_element=from_element,
docling_core/types/doc/tokens.py

@@ -50,6 +50,7 @@ class DocumentToken(Enum):
     UNORDERED_LIST = "unordered_list"
     LOC = "loc_"
     PAGE_BREAK = "page_break"
+    SMILES = "smiles"
 
     @classmethod
     def get_special_tokens(
docling_core-2.21.1.dist-info/METADATA → docling_core-2.22.0.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.21.1
+Version: 2.22.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
docling_core-2.21.1.dist-info/RECORD → docling_core-2.22.0.dist-info/RECORD

@@ -19,14 +19,14 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
 docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
 docling_core/transforms/chunker/base.py,sha256=BSWTiFOsF5YaZaZJZY8nwIdOXb9uufJMRIds7LxRNh8,2546
 docling_core/transforms/chunker/hierarchical_chunker.py,sha256=MStDUDtzFGc6j8v9AkcAnnSHTDxdoiVrp8FTmRdGqU8,8138
-docling_core/transforms/chunker/hybrid_chunker.py,sha256=kokjDdxjc_gygOokQwYFVnHv2NjWTgf9uex8o0ole7w,9876
+docling_core/transforms/chunker/hybrid_chunker.py,sha256=v-HpFg-HvQLi0gQtHm-6KlMfcMWupkBjwr5qF-rfr4E,9842
 docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
 docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
 docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
-docling_core/types/doc/document.py,sha256=P8dx5lP3oVrdlrXJx-Y-nk-UM7llDF6ZwOqs046HAM4,110451
+docling_core/types/doc/document.py,sha256=CBNAWaR9UkwmU60N9MYJoygu8fNlTXT-taBRaAURyEo,128879
 docling_core/types/doc/labels.py,sha256=0J9Gsqz-jQ4FP2yxs9wOxoTr3qg97BniFX7MJVziUmk,5684
-docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
+docling_core/types/doc/tokens.py,sha256=Z2FuzHWinYQzWZdTvOBsEQACAKPcBiSf777w5S9NJms,3947
 docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
 docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
 docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
 docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
 docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.21.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-2.21.1.dist-info/METADATA,sha256=qz2AeXj0vfiBu24oWyMDiQSPvKM0yUn1Rj85JaUd7Yg,5803
-docling_core-2.21.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling_core-2.21.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
-docling_core-2.21.1.dist-info/RECORD,,
+docling_core-2.22.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.22.0.dist-info/METADATA,sha256=NlaL99G-0wcPzYLzYF6W3X-rgt0SXe1DeYuaCYDLQGU,5803
+docling_core-2.22.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling_core-2.22.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
+docling_core-2.22.0.dist-info/RECORD,,