docling-core 2.21.2__tar.gz → 2.22.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of docling-core might be problematic.

Files changed (62)
  1. {docling_core-2.21.2 → docling_core-2.22.0}/PKG-INFO +1 -1
  2. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/document.py +432 -3
  3. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/tokens.py +1 -0
  4. {docling_core-2.21.2 → docling_core-2.22.0}/pyproject.toml +1 -1
  5. {docling_core-2.21.2 → docling_core-2.22.0}/LICENSE +0 -0
  6. {docling_core-2.21.2 → docling_core-2.22.0}/README.md +0 -0
  7. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/__init__.py +0 -0
  8. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/cli/__init__.py +0 -0
  9. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/cli/view.py +0 -0
  10. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/py.typed +0 -0
  11. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  12. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  13. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  14. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  15. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  16. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  17. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  18. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  19. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/__init__.py +0 -0
  20. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  21. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/mapping.py +0 -0
  22. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/meta.py +0 -0
  23. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/package.py +0 -0
  24. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/__init__.py +0 -0
  25. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/chunker/__init__.py +0 -0
  26. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/chunker/base.py +0 -0
  27. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  28. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  29. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/__init__.py +0 -0
  30. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/base.py +0 -0
  31. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/__init__.py +0 -0
  32. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/base.py +0 -0
  33. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/labels.py +0 -0
  34. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/utils.py +0 -0
  35. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/base.py +0 -0
  40. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  41. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  42. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  43. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/document.py +0 -0
  44. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  45. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/nlp/__init__.py +0 -0
  46. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/nlp/qa.py +0 -0
  47. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/nlp/qa_labels.py +0 -0
  48. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/__init__.py +0 -0
  49. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/attribute.py +0 -0
  50. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/base.py +0 -0
  51. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/predicate.py +0 -0
  52. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/record.py +0 -0
  53. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/statement.py +0 -0
  54. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/subject.py +0 -0
  55. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/__init__.py +0 -0
  56. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/alias.py +0 -0
  57. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/file.py +0 -0
  58. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/generate_docs.py +0 -0
  59. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/generate_jsonschema.py +0 -0
  60. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/legacy.py +0 -0
  61. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/validate.py +0 -0
  62. {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/validators.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: docling-core
- Version: 2.21.2
+ Version: 2.22.0
  Summary: A python library to define and validate data types in Docling.
  Home-page: https://ds4sd.github.io/
  License: MIT
docling_core/types/doc/document.py
@@ -4,6 +4,7 @@ import base64
  import copy
  import hashlib
  import html
+ import itertools
  import json
  import logging
  import mimetypes
@@ -37,7 +38,7 @@ from pydantic import (
      model_validator,
  )
  from tabulate import tabulate
- from typing_extensions import Annotated, Self
+ from typing_extensions import Annotated, Self, deprecated

  from docling_core.search.package import VERSION_PATTERN
  from docling_core.types.base import _JSON_POINTER_REGEX
@@ -522,6 +523,49 @@ class ImageRef(BaseModel):
      )


+ class DocTagsPage(BaseModel):
+     """DocTagsPage."""
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     tokens: str
+     image: Optional[PILImage.Image] = None
+
+
+ class DocTagsDocument(BaseModel):
+     """DocTagsDocument."""
+
+     pages: List[DocTagsPage] = []
+
+     @classmethod
+     def from_doctags_and_image_pairs(
+         cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
+     ):
+         """from_doctags_and_image_pairs."""
+         if len(doctags) != len(images):
+             raise ValueError("Number of page doctags must be equal to page images!")
+         doctags_doc = cls()
+
+         pages = []
+         for dt, img in zip(doctags, images):
+             if isinstance(dt, Path):
+                 with dt.open("r") as fp:
+                     dt = fp.read()
+             elif isinstance(dt, str):
+                 pass
+
+             if isinstance(img, Path):
+                 img = PILImage.open(img)
+             elif isinstance(dt, PILImage.Image):
+                 pass
+
+             page = DocTagsPage(tokens=dt, image=img)
+             pages.append(page)
+
+         doctags_doc.pages = pages
+         return doctags_doc
+
+
  class ProvenanceItem(BaseModel):
      """ProvenanceItem."""

@@ -1003,6 +1047,20 @@ class PictureItem(FloatingItem):
              predicted_class = classifications[0].predicted_classes[0].class_name
              body += DocumentToken.get_picture_classification_token(predicted_class)

+         smiles_annotations = [
+             ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
+         ]
+         if len(smiles_annotations) > 0:
+             body += (
+                 "<"
+                 + DocumentToken.SMILES.value
+                 + ">"
+                 + smiles_annotations[0].smi
+                 + "</"
+                 + DocumentToken.SMILES.value
+                 + ">"
+             )
+
          if add_caption and len(self.captions):
              text = self.caption_text(doc)

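To illustrate the new branch: if a picture carries a PictureMoleculeData annotation whose smi field is, for example, "CCO" (a placeholder SMILES string), the exported body gains a fragment of the form

    <smiles>CCO</smiles>

using the SMILES token that the tokens.py hunk further down adds to DocumentToken.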
@@ -2936,7 +2994,378 @@ class DoclingDocument(BaseModel):

          return html_text

-     def save_as_document_tokens(
+     def load_from_doctags(  # noqa: C901
+         self,
+         doctag_document: DocTagsDocument,
+     ) -> "DoclingDocument":
+         r"""Load Docling document from lists of DocTags and Images."""
+         # Maps the recognized tag to a Docling label.
+         # Code items will be given DocItemLabel.CODE
+         tag_to_doclabel = {
+             "title": DocItemLabel.TITLE,
+             "document_index": DocItemLabel.DOCUMENT_INDEX,
+             "otsl": DocItemLabel.TABLE,
+             "section_header_level_1": DocItemLabel.SECTION_HEADER,
+             "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
+             "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
+             "text": DocItemLabel.TEXT,
+             "page_header": DocItemLabel.PAGE_HEADER,
+             "page_footer": DocItemLabel.PAGE_FOOTER,
+             "formula": DocItemLabel.FORMULA,
+             "caption": DocItemLabel.CAPTION,
+             "picture": DocItemLabel.PICTURE,
+             "list_item": DocItemLabel.LIST_ITEM,
+             "footnote": DocItemLabel.FOOTNOTE,
+             "code": DocItemLabel.CODE,
+         }
+
+         def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
+             """Extract <loc_...> coords from the chunk, normalized by / 500."""
+             coords = re.findall(r"<loc_(\d+)>", text_chunk)
+             if len(coords) == 4:
+                 l, t, r, b = map(float, coords)
+                 return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
+             return None
+
+         def extract_inner_text(text_chunk: str) -> str:
+             """Strip all <...> tags inside the chunk to get the raw text content."""
+             return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
+
+         def otsl_parse_texts(texts, tokens):
+             split_word = TableToken.OTSL_NL.value
+             split_row_tokens = [
+                 list(y)
+                 for x, y in itertools.groupby(tokens, lambda z: z == split_word)
+                 if not x
+             ]
+             table_cells = []
+             r_idx = 0
+             c_idx = 0
+
+             def count_right(tokens, c_idx, r_idx, which_tokens):
+                 span = 0
+                 c_idx_iter = c_idx
+                 while tokens[r_idx][c_idx_iter] in which_tokens:
+                     c_idx_iter += 1
+                     span += 1
+                     if c_idx_iter >= len(tokens[r_idx]):
+                         return span
+                 return span
+
+             def count_down(tokens, c_idx, r_idx, which_tokens):
+                 span = 0
+                 r_idx_iter = r_idx
+                 while tokens[r_idx_iter][c_idx] in which_tokens:
+                     r_idx_iter += 1
+                     span += 1
+                     if r_idx_iter >= len(tokens):
+                         return span
+                 return span
+
+             for i, text in enumerate(texts):
+                 cell_text = ""
+                 if text in [
+                     TableToken.OTSL_FCEL.value,
+                     TableToken.OTSL_ECEL.value,
+                     TableToken.OTSL_CHED.value,
+                     TableToken.OTSL_RHED.value,
+                     TableToken.OTSL_SROW.value,
+                 ]:
+                     row_span = 1
+                     col_span = 1
+                     right_offset = 1
+                     if text != TableToken.OTSL_ECEL.value:
+                         cell_text = texts[i + 1]
+                         right_offset = 2
+
+                     # Check next element(s) for lcel / ucel / xcel,
+                     # set properly row_span, col_span
+                     next_right_cell = ""
+                     if i + right_offset < len(texts):
+                         next_right_cell = texts[i + right_offset]
+
+                     next_bottom_cell = ""
+                     if r_idx + 1 < len(split_row_tokens):
+                         if c_idx < len(split_row_tokens[r_idx + 1]):
+                             next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
+
+                     if next_right_cell in [
+                         TableToken.OTSL_LCEL.value,
+                         TableToken.OTSL_XCEL.value,
+                     ]:
+                         # we have horisontal spanning cell or 2d spanning cell
+                         col_span += count_right(
+                             split_row_tokens,
+                             c_idx + 1,
+                             r_idx,
+                             [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
+                         )
+                     if next_bottom_cell in [
+                         TableToken.OTSL_UCEL.value,
+                         TableToken.OTSL_XCEL.value,
+                     ]:
+                         # we have a vertical spanning cell or 2d spanning cell
+                         row_span += count_down(
+                             split_row_tokens,
+                             c_idx,
+                             r_idx + 1,
+                             [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
+                         )
+
+                     table_cells.append(
+                         TableCell(
+                             text=cell_text.strip(),
+                             row_span=row_span,
+                             col_span=col_span,
+                             start_row_offset_idx=r_idx,
+                             end_row_offset_idx=r_idx + row_span,
+                             start_col_offset_idx=c_idx,
+                             end_col_offset_idx=c_idx + col_span,
+                         )
+                     )
+                 if text in [
+                     TableToken.OTSL_FCEL.value,
+                     TableToken.OTSL_ECEL.value,
+                     TableToken.OTSL_CHED.value,
+                     TableToken.OTSL_RHED.value,
+                     TableToken.OTSL_SROW.value,
+                     TableToken.OTSL_LCEL.value,
+                     TableToken.OTSL_UCEL.value,
+                     TableToken.OTSL_XCEL.value,
+                 ]:
+                     c_idx += 1
+                 if text == TableToken.OTSL_NL.value:
+                     r_idx += 1
+                     c_idx = 0
+             return table_cells, split_row_tokens
+
+         def otsl_extract_tokens_and_text(s: str):
+             # Pattern to match anything enclosed by < >
+             # (including the angle brackets themselves)
+             pattern = r"(<[^>]+>)"
+             # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
+             tokens = re.findall(pattern, s)
+             # Remove any tokens that start with "<loc_"
+             tokens = [
+                 token
+                 for token in tokens
+                 if not (
+                     token.startswith(rf"<{DocumentToken.LOC.value}")
+                     or token
+                     in [
+                         rf"<{DocumentToken.OTSL.value}>",
+                         rf"</{DocumentToken.OTSL.value}>",
+                     ]
+                 )
+             ]
+             # Split the string by those tokens to get the in-between text
+             text_parts = re.split(pattern, s)
+             text_parts = [
+                 token
+                 for token in text_parts
+                 if not (
+                     token.startswith(rf"<{DocumentToken.LOC.value}")
+                     or token
+                     in [
+                         rf"<{DocumentToken.OTSL.value}>",
+                         rf"</{DocumentToken.OTSL.value}>",
+                     ]
+                 )
+             ]
+             # Remove any empty or purely whitespace strings from text_parts
+             text_parts = [part for part in text_parts if part.strip()]
+
+             return tokens, text_parts
+
+         def parse_table_content(otsl_content: str) -> TableData:
+             tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
+             table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
+
+             return TableData(
+                 num_rows=len(split_row_tokens),
+                 num_cols=(
+                     max(len(row) for row in split_row_tokens) if split_row_tokens else 0
+                 ),
+                 table_cells=table_cells,
+             )
+
+         # doc = DoclingDocument(name="Document")
+         for pg_idx, doctag_page in enumerate(doctag_document.pages):
+             page_doctags = doctag_page.tokens
+             image = doctag_page.image
+
+             page_no = pg_idx + 1
+             # bounding_boxes = []
+
+             if image is not None:
+                 pg_width = image.width
+                 pg_height = image.height
+             else:
+                 pg_width = 1
+                 pg_height = 1
+
+             """
+             1. Finds all <tag>...</tag>
+             blocks in the entire string (multi-line friendly)
+             in the order they appear.
+             2. For each chunk, extracts bounding box (if any) and inner text.
+             3. Adds the item to a DoclingDocument structure with the right label.
+             4. Tracks bounding boxes+color in a separate list for later visualization.
+             """
+
+             # Regex for root level recognized tags
+             tag_pattern = (
+                 rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
+                 rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
+                 rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
+                 rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
+                 rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
+                 rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
+                 rf"{DocItemLabel.SECTION_HEADER}_level_1|"
+                 rf"{DocumentToken.ORDERED_LIST.value}|"
+                 rf"{DocumentToken.UNORDERED_LIST.value}|"
+                 rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
+             )
+
+             # DocumentToken.OTSL
+             pattern = re.compile(tag_pattern, re.DOTALL)
+
+             # Go through each match in order
+             for match in pattern.finditer(page_doctags):
+                 full_chunk = match.group(0)
+                 tag_name = match.group("tag")
+
+                 bbox = extract_bounding_box(full_chunk) if image else None
+                 doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
+
+                 if tag_name == DocumentToken.OTSL.value:
+                     table_data = parse_table_content(full_chunk)
+                     bbox = extract_bounding_box(full_chunk) if image else None
+
+                     if bbox:
+                         prov = ProvenanceItem(
+                             bbox=bbox.resize_by_scale(pg_width, pg_height),
+                             charspan=(0, 0),
+                             page_no=page_no,
+                         )
+                         self.add_table(data=table_data, prov=prov)
+                     else:
+                         self.add_table(data=table_data)
+
+                 elif tag_name == DocItemLabel.PICTURE:
+                     text_caption_content = extract_inner_text(full_chunk)
+                     if image:
+                         if bbox:
+                             im_width, im_height = image.size
+
+                             crop_box = (
+                                 int(bbox.l * im_width),
+                                 int(bbox.t * im_height),
+                                 int(bbox.r * im_width),
+                                 int(bbox.b * im_height),
+                             )
+                             cropped_image = image.crop(crop_box)
+                             pic = self.add_picture(
+                                 parent=None,
+                                 image=ImageRef.from_pil(image=cropped_image, dpi=72),
+                                 prov=(
+                                     ProvenanceItem(
+                                         bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                         charspan=(0, 0),
+                                         page_no=page_no,
+                                     )
+                                 ),
+                             )
+                             # If there is a caption to an image, add it as well
+                             if len(text_caption_content) > 0:
+                                 caption_item = self.add_text(
+                                     label=DocItemLabel.CAPTION,
+                                     text=text_caption_content,
+                                     parent=None,
+                                 )
+                                 pic.captions.append(caption_item.get_ref())
+                     else:
+                         if bbox:
+                             # In case we don't have access to an binary of an image
+                             self.add_picture(
+                                 parent=None,
+                                 prov=ProvenanceItem(
+                                     bbox=bbox, charspan=(0, 0), page_no=page_no
+                                 ),
+                             )
+                             # If there is a caption to an image, add it as well
+                             if len(text_caption_content) > 0:
+                                 caption_item = self.add_text(
+                                     label=DocItemLabel.CAPTION,
+                                     text=text_caption_content,
+                                     parent=None,
+                                 )
+                                 pic.captions.append(caption_item.get_ref())
+                 elif tag_name in [
+                     DocumentToken.ORDERED_LIST.value,
+                     DocumentToken.UNORDERED_LIST.value,
+                 ]:
+                     list_label = GroupLabel.LIST
+                     enum_marker = ""
+                     enum_value = 0
+                     if tag_name == DocumentToken.ORDERED_LIST.value:
+                         list_label = GroupLabel.ORDERED_LIST
+
+                     list_item_pattern = (
+                         rf"<(?P<tag>{DocItemLabel.LIST_ITEM})>.*?</(?P=tag)>"
+                     )
+                     li_pattern = re.compile(list_item_pattern, re.DOTALL)
+                     # Add list group:
+                     new_list = self.add_group(label=list_label, name="list")
+                     # Pricess list items
+                     for li_match in li_pattern.finditer(full_chunk):
+                         enum_value += 1
+                         if tag_name == DocumentToken.ORDERED_LIST.value:
+                             enum_marker = str(enum_value) + "."
+
+                         li_full_chunk = li_match.group(0)
+                         li_bbox = extract_bounding_box(li_full_chunk) if image else None
+                         text_content = extract_inner_text(li_full_chunk)
+                         # Add list item
+                         self.add_list_item(
+                             marker=enum_marker,
+                             enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
+                             parent=new_list,
+                             text=text_content,
+                             prov=(
+                                 ProvenanceItem(
+                                     bbox=li_bbox.resize_by_scale(pg_width, pg_height),
+                                     charspan=(0, len(text_content)),
+                                     page_no=page_no,
+                                 )
+                                 if li_bbox
+                                 else None
+                             ),
+                         )
+                 else:
+                     # For everything else, treat as text
+                     text_content = extract_inner_text(full_chunk)
+                     self.add_text(
+                         label=doc_label,
+                         text=text_content,
+                         prov=(
+                             ProvenanceItem(
+                                 bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                 charspan=(0, len(text_content)),
+                                 page_no=page_no,
+                             )
+                             if bbox
+                             else None
+                         ),
+                     )
+         return self
+
+     @deprecated("Use save_as_doctags instead.")
+     def save_as_document_tokens(self, *args, **kwargs):
+         r"""Save the document content to a DocumentToken format."""
+         return self.save_as_doctags(*args, **kwargs)
+
+     def save_as_doctags(
          self,
          filename: Path,
          delim: str = "",
@@ -2952,7 +3381,7 @@ class DoclingDocument(BaseModel):
          add_table_cell_location: bool = False,
          add_table_cell_text: bool = True,
      ):
-         r"""Save the document content to a DocumentToken format."""
+         r"""Save the document content to DocTags format."""
          out = self.export_to_document_tokens(
              delim=delim,
              from_element=from_element,
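Taken together, the document.py changes allow a round trip along these lines (a sketch with placeholder paths; per the code above, load_from_doctags populates the document in place and returns it, and save_as_document_tokens is kept only as a deprecated alias):

    from pathlib import Path

    from docling_core.types.doc.document import DoclingDocument, DocTagsDocument

    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
        doctags=[Path("page_1.dt")], images=[Path("page_1.png")]
    )

    doc = DoclingDocument(name="Document")
    doc.load_from_doctags(doctags_doc)   # parses tags, tables, pictures and lists per page
    doc.save_as_doctags(Path("out.dt"))  # new name; save_as_document_tokens now delegates here

Note that <loc_...> coordinates are divided by 500 and then rescaled against the page image size, so bounding boxes (and picture crops) are only recovered when page images are supplied.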
docling_core/types/doc/tokens.py
@@ -50,6 +50,7 @@ class DocumentToken(Enum):
      UNORDERED_LIST = "unordered_list"
      LOC = "loc_"
      PAGE_BREAK = "page_break"
+     SMILES = "smiles"

      @classmethod
      def get_special_tokens(
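For completeness, a quick check of the new enum member (a sketch; the import path follows the tokens.py file in the list above):

    from docling_core.types.doc.tokens import DocumentToken

    assert DocumentToken.SMILES.value == "smiles"  # wraps molecule strings as <smiles>...</smiles>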
pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "docling-core"
- version = "2.21.2"
+ version = "2.22.0"
  description = "A python library to define and validate data types in Docling."
  license = "MIT"
  authors = [