docling-core 2.21.2__tar.gz → 2.22.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.21.2 → docling_core-2.22.0}/PKG-INFO +1 -1
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/document.py +432 -3
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/tokens.py +1 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/pyproject.toml +1 -1
- {docling_core-2.21.2 → docling_core-2.22.0}/LICENSE +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/README.md +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/py.typed +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/package.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/base.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.21.2 → docling_core-2.22.0}/docling_core/utils/validators.py +0 -0
|
@@ -4,6 +4,7 @@ import base64
|
|
|
4
4
|
import copy
|
|
5
5
|
import hashlib
|
|
6
6
|
import html
|
|
7
|
+
import itertools
|
|
7
8
|
import json
|
|
8
9
|
import logging
|
|
9
10
|
import mimetypes
|
|
@@ -37,7 +38,7 @@ from pydantic import (
|
|
|
37
38
|
model_validator,
|
|
38
39
|
)
|
|
39
40
|
from tabulate import tabulate
|
|
40
|
-
from typing_extensions import Annotated, Self
|
|
41
|
+
from typing_extensions import Annotated, Self, deprecated
|
|
41
42
|
|
|
42
43
|
from docling_core.search.package import VERSION_PATTERN
|
|
43
44
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
@@ -522,6 +523,49 @@ class ImageRef(BaseModel):
|
|
|
522
523
|
)
|
|
523
524
|
|
|
524
525
|
|
|
526
|
+
class DocTagsPage(BaseModel):
|
|
527
|
+
"""DocTagsPage."""
|
|
528
|
+
|
|
529
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
530
|
+
|
|
531
|
+
tokens: str
|
|
532
|
+
image: Optional[PILImage.Image] = None
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
class DocTagsDocument(BaseModel):
|
|
536
|
+
"""DocTagsDocument."""
|
|
537
|
+
|
|
538
|
+
pages: List[DocTagsPage] = []
|
|
539
|
+
|
|
540
|
+
@classmethod
|
|
541
|
+
def from_doctags_and_image_pairs(
|
|
542
|
+
cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
|
|
543
|
+
):
|
|
544
|
+
"""from_doctags_and_image_pairs."""
|
|
545
|
+
if len(doctags) != len(images):
|
|
546
|
+
raise ValueError("Number of page doctags must be equal to page images!")
|
|
547
|
+
doctags_doc = cls()
|
|
548
|
+
|
|
549
|
+
pages = []
|
|
550
|
+
for dt, img in zip(doctags, images):
|
|
551
|
+
if isinstance(dt, Path):
|
|
552
|
+
with dt.open("r") as fp:
|
|
553
|
+
dt = fp.read()
|
|
554
|
+
elif isinstance(dt, str):
|
|
555
|
+
pass
|
|
556
|
+
|
|
557
|
+
if isinstance(img, Path):
|
|
558
|
+
img = PILImage.open(img)
|
|
559
|
+
elif isinstance(dt, PILImage.Image):
|
|
560
|
+
pass
|
|
561
|
+
|
|
562
|
+
page = DocTagsPage(tokens=dt, image=img)
|
|
563
|
+
pages.append(page)
|
|
564
|
+
|
|
565
|
+
doctags_doc.pages = pages
|
|
566
|
+
return doctags_doc
|
|
567
|
+
|
|
568
|
+
|
|
525
569
|
class ProvenanceItem(BaseModel):
|
|
526
570
|
"""ProvenanceItem."""
|
|
527
571
|
|
|
@@ -1003,6 +1047,20 @@ class PictureItem(FloatingItem):
|
|
|
1003
1047
|
predicted_class = classifications[0].predicted_classes[0].class_name
|
|
1004
1048
|
body += DocumentToken.get_picture_classification_token(predicted_class)
|
|
1005
1049
|
|
|
1050
|
+
smiles_annotations = [
|
|
1051
|
+
ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
|
|
1052
|
+
]
|
|
1053
|
+
if len(smiles_annotations) > 0:
|
|
1054
|
+
body += (
|
|
1055
|
+
"<"
|
|
1056
|
+
+ DocumentToken.SMILES.value
|
|
1057
|
+
+ ">"
|
|
1058
|
+
+ smiles_annotations[0].smi
|
|
1059
|
+
+ "</"
|
|
1060
|
+
+ DocumentToken.SMILES.value
|
|
1061
|
+
+ ">"
|
|
1062
|
+
)
|
|
1063
|
+
|
|
1006
1064
|
if add_caption and len(self.captions):
|
|
1007
1065
|
text = self.caption_text(doc)
|
|
1008
1066
|
|
|
@@ -2936,7 +2994,378 @@ class DoclingDocument(BaseModel):
|
|
|
2936
2994
|
|
|
2937
2995
|
return html_text
|
|
2938
2996
|
|
|
2939
|
-
def save_as_document_tokens(
|
|
2997
|
+
def load_from_doctags( # noqa: C901
|
|
2998
|
+
self,
|
|
2999
|
+
doctag_document: DocTagsDocument,
|
|
3000
|
+
) -> "DoclingDocument":
|
|
3001
|
+
r"""Load Docling document from lists of DocTags and Images."""
|
|
3002
|
+
# Maps the recognized tag to a Docling label.
|
|
3003
|
+
# Code items will be given DocItemLabel.CODE
|
|
3004
|
+
tag_to_doclabel = {
|
|
3005
|
+
"title": DocItemLabel.TITLE,
|
|
3006
|
+
"document_index": DocItemLabel.DOCUMENT_INDEX,
|
|
3007
|
+
"otsl": DocItemLabel.TABLE,
|
|
3008
|
+
"section_header_level_1": DocItemLabel.SECTION_HEADER,
|
|
3009
|
+
"checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
|
|
3010
|
+
"checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
|
|
3011
|
+
"text": DocItemLabel.TEXT,
|
|
3012
|
+
"page_header": DocItemLabel.PAGE_HEADER,
|
|
3013
|
+
"page_footer": DocItemLabel.PAGE_FOOTER,
|
|
3014
|
+
"formula": DocItemLabel.FORMULA,
|
|
3015
|
+
"caption": DocItemLabel.CAPTION,
|
|
3016
|
+
"picture": DocItemLabel.PICTURE,
|
|
3017
|
+
"list_item": DocItemLabel.LIST_ITEM,
|
|
3018
|
+
"footnote": DocItemLabel.FOOTNOTE,
|
|
3019
|
+
"code": DocItemLabel.CODE,
|
|
3020
|
+
}
|
|
3021
|
+
|
|
3022
|
+
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
|
|
3023
|
+
"""Extract <loc_...> coords from the chunk, normalized by / 500."""
|
|
3024
|
+
coords = re.findall(r"<loc_(\d+)>", text_chunk)
|
|
3025
|
+
if len(coords) == 4:
|
|
3026
|
+
l, t, r, b = map(float, coords)
|
|
3027
|
+
return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
|
|
3028
|
+
return None
|
|
3029
|
+
|
|
3030
|
+
def extract_inner_text(text_chunk: str) -> str:
|
|
3031
|
+
"""Strip all <...> tags inside the chunk to get the raw text content."""
|
|
3032
|
+
return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
|
|
3033
|
+
|
|
3034
|
+
def otsl_parse_texts(texts, tokens):
|
|
3035
|
+
split_word = TableToken.OTSL_NL.value
|
|
3036
|
+
split_row_tokens = [
|
|
3037
|
+
list(y)
|
|
3038
|
+
for x, y in itertools.groupby(tokens, lambda z: z == split_word)
|
|
3039
|
+
if not x
|
|
3040
|
+
]
|
|
3041
|
+
table_cells = []
|
|
3042
|
+
r_idx = 0
|
|
3043
|
+
c_idx = 0
|
|
3044
|
+
|
|
3045
|
+
def count_right(tokens, c_idx, r_idx, which_tokens):
|
|
3046
|
+
span = 0
|
|
3047
|
+
c_idx_iter = c_idx
|
|
3048
|
+
while tokens[r_idx][c_idx_iter] in which_tokens:
|
|
3049
|
+
c_idx_iter += 1
|
|
3050
|
+
span += 1
|
|
3051
|
+
if c_idx_iter >= len(tokens[r_idx]):
|
|
3052
|
+
return span
|
|
3053
|
+
return span
|
|
3054
|
+
|
|
3055
|
+
def count_down(tokens, c_idx, r_idx, which_tokens):
|
|
3056
|
+
span = 0
|
|
3057
|
+
r_idx_iter = r_idx
|
|
3058
|
+
while tokens[r_idx_iter][c_idx] in which_tokens:
|
|
3059
|
+
r_idx_iter += 1
|
|
3060
|
+
span += 1
|
|
3061
|
+
if r_idx_iter >= len(tokens):
|
|
3062
|
+
return span
|
|
3063
|
+
return span
|
|
3064
|
+
|
|
3065
|
+
for i, text in enumerate(texts):
|
|
3066
|
+
cell_text = ""
|
|
3067
|
+
if text in [
|
|
3068
|
+
TableToken.OTSL_FCEL.value,
|
|
3069
|
+
TableToken.OTSL_ECEL.value,
|
|
3070
|
+
TableToken.OTSL_CHED.value,
|
|
3071
|
+
TableToken.OTSL_RHED.value,
|
|
3072
|
+
TableToken.OTSL_SROW.value,
|
|
3073
|
+
]:
|
|
3074
|
+
row_span = 1
|
|
3075
|
+
col_span = 1
|
|
3076
|
+
right_offset = 1
|
|
3077
|
+
if text != TableToken.OTSL_ECEL.value:
|
|
3078
|
+
cell_text = texts[i + 1]
|
|
3079
|
+
right_offset = 2
|
|
3080
|
+
|
|
3081
|
+
# Check next element(s) for lcel / ucel / xcel,
|
|
3082
|
+
# set properly row_span, col_span
|
|
3083
|
+
next_right_cell = ""
|
|
3084
|
+
if i + right_offset < len(texts):
|
|
3085
|
+
next_right_cell = texts[i + right_offset]
|
|
3086
|
+
|
|
3087
|
+
next_bottom_cell = ""
|
|
3088
|
+
if r_idx + 1 < len(split_row_tokens):
|
|
3089
|
+
if c_idx < len(split_row_tokens[r_idx + 1]):
|
|
3090
|
+
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
|
|
3091
|
+
|
|
3092
|
+
if next_right_cell in [
|
|
3093
|
+
TableToken.OTSL_LCEL.value,
|
|
3094
|
+
TableToken.OTSL_XCEL.value,
|
|
3095
|
+
]:
|
|
3096
|
+
# we have a horizontal spanning cell or 2d spanning cell
|
|
3097
|
+
col_span += count_right(
|
|
3098
|
+
split_row_tokens,
|
|
3099
|
+
c_idx + 1,
|
|
3100
|
+
r_idx,
|
|
3101
|
+
[TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
|
|
3102
|
+
)
|
|
3103
|
+
if next_bottom_cell in [
|
|
3104
|
+
TableToken.OTSL_UCEL.value,
|
|
3105
|
+
TableToken.OTSL_XCEL.value,
|
|
3106
|
+
]:
|
|
3107
|
+
# we have a vertical spanning cell or 2d spanning cell
|
|
3108
|
+
row_span += count_down(
|
|
3109
|
+
split_row_tokens,
|
|
3110
|
+
c_idx,
|
|
3111
|
+
r_idx + 1,
|
|
3112
|
+
[TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
|
|
3113
|
+
)
|
|
3114
|
+
|
|
3115
|
+
table_cells.append(
|
|
3116
|
+
TableCell(
|
|
3117
|
+
text=cell_text.strip(),
|
|
3118
|
+
row_span=row_span,
|
|
3119
|
+
col_span=col_span,
|
|
3120
|
+
start_row_offset_idx=r_idx,
|
|
3121
|
+
end_row_offset_idx=r_idx + row_span,
|
|
3122
|
+
start_col_offset_idx=c_idx,
|
|
3123
|
+
end_col_offset_idx=c_idx + col_span,
|
|
3124
|
+
)
|
|
3125
|
+
)
|
|
3126
|
+
if text in [
|
|
3127
|
+
TableToken.OTSL_FCEL.value,
|
|
3128
|
+
TableToken.OTSL_ECEL.value,
|
|
3129
|
+
TableToken.OTSL_CHED.value,
|
|
3130
|
+
TableToken.OTSL_RHED.value,
|
|
3131
|
+
TableToken.OTSL_SROW.value,
|
|
3132
|
+
TableToken.OTSL_LCEL.value,
|
|
3133
|
+
TableToken.OTSL_UCEL.value,
|
|
3134
|
+
TableToken.OTSL_XCEL.value,
|
|
3135
|
+
]:
|
|
3136
|
+
c_idx += 1
|
|
3137
|
+
if text == TableToken.OTSL_NL.value:
|
|
3138
|
+
r_idx += 1
|
|
3139
|
+
c_idx = 0
|
|
3140
|
+
return table_cells, split_row_tokens
|
|
3141
|
+
|
|
3142
|
+
def otsl_extract_tokens_and_text(s: str):
|
|
3143
|
+
# Pattern to match anything enclosed by < >
|
|
3144
|
+
# (including the angle brackets themselves)
|
|
3145
|
+
pattern = r"(<[^>]+>)"
|
|
3146
|
+
# Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
|
|
3147
|
+
tokens = re.findall(pattern, s)
|
|
3148
|
+
# Remove any tokens that start with "<loc_"
|
|
3149
|
+
tokens = [
|
|
3150
|
+
token
|
|
3151
|
+
for token in tokens
|
|
3152
|
+
if not (
|
|
3153
|
+
token.startswith(rf"<{DocumentToken.LOC.value}")
|
|
3154
|
+
or token
|
|
3155
|
+
in [
|
|
3156
|
+
rf"<{DocumentToken.OTSL.value}>",
|
|
3157
|
+
rf"</{DocumentToken.OTSL.value}>",
|
|
3158
|
+
]
|
|
3159
|
+
)
|
|
3160
|
+
]
|
|
3161
|
+
# Split the string by those tokens to get the in-between text
|
|
3162
|
+
text_parts = re.split(pattern, s)
|
|
3163
|
+
text_parts = [
|
|
3164
|
+
token
|
|
3165
|
+
for token in text_parts
|
|
3166
|
+
if not (
|
|
3167
|
+
token.startswith(rf"<{DocumentToken.LOC.value}")
|
|
3168
|
+
or token
|
|
3169
|
+
in [
|
|
3170
|
+
rf"<{DocumentToken.OTSL.value}>",
|
|
3171
|
+
rf"</{DocumentToken.OTSL.value}>",
|
|
3172
|
+
]
|
|
3173
|
+
)
|
|
3174
|
+
]
|
|
3175
|
+
# Remove any empty or purely whitespace strings from text_parts
|
|
3176
|
+
text_parts = [part for part in text_parts if part.strip()]
|
|
3177
|
+
|
|
3178
|
+
return tokens, text_parts
|
|
3179
|
+
|
|
3180
|
+
def parse_table_content(otsl_content: str) -> TableData:
|
|
3181
|
+
tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
|
|
3182
|
+
table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
|
|
3183
|
+
|
|
3184
|
+
return TableData(
|
|
3185
|
+
num_rows=len(split_row_tokens),
|
|
3186
|
+
num_cols=(
|
|
3187
|
+
max(len(row) for row in split_row_tokens) if split_row_tokens else 0
|
|
3188
|
+
),
|
|
3189
|
+
table_cells=table_cells,
|
|
3190
|
+
)
|
|
3191
|
+
|
|
3192
|
+
# doc = DoclingDocument(name="Document")
|
|
3193
|
+
for pg_idx, doctag_page in enumerate(doctag_document.pages):
|
|
3194
|
+
page_doctags = doctag_page.tokens
|
|
3195
|
+
image = doctag_page.image
|
|
3196
|
+
|
|
3197
|
+
page_no = pg_idx + 1
|
|
3198
|
+
# bounding_boxes = []
|
|
3199
|
+
|
|
3200
|
+
if image is not None:
|
|
3201
|
+
pg_width = image.width
|
|
3202
|
+
pg_height = image.height
|
|
3203
|
+
else:
|
|
3204
|
+
pg_width = 1
|
|
3205
|
+
pg_height = 1
|
|
3206
|
+
|
|
3207
|
+
"""
|
|
3208
|
+
1. Finds all <tag>...</tag>
|
|
3209
|
+
blocks in the entire string (multi-line friendly)
|
|
3210
|
+
in the order they appear.
|
|
3211
|
+
2. For each chunk, extracts bounding box (if any) and inner text.
|
|
3212
|
+
3. Adds the item to a DoclingDocument structure with the right label.
|
|
3213
|
+
4. Tracks bounding boxes+color in a separate list for later visualization.
|
|
3214
|
+
"""
|
|
3215
|
+
|
|
3216
|
+
# Regex for root level recognized tags
|
|
3217
|
+
tag_pattern = (
|
|
3218
|
+
rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
|
|
3219
|
+
rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
|
|
3220
|
+
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
|
|
3221
|
+
rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
|
|
3222
|
+
rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
|
|
3223
|
+
rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
|
|
3224
|
+
rf"{DocItemLabel.SECTION_HEADER}_level_1|"
|
|
3225
|
+
rf"{DocumentToken.ORDERED_LIST.value}|"
|
|
3226
|
+
rf"{DocumentToken.UNORDERED_LIST.value}|"
|
|
3227
|
+
rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
|
|
3228
|
+
)
|
|
3229
|
+
|
|
3230
|
+
# DocumentToken.OTSL
|
|
3231
|
+
pattern = re.compile(tag_pattern, re.DOTALL)
|
|
3232
|
+
|
|
3233
|
+
# Go through each match in order
|
|
3234
|
+
for match in pattern.finditer(page_doctags):
|
|
3235
|
+
full_chunk = match.group(0)
|
|
3236
|
+
tag_name = match.group("tag")
|
|
3237
|
+
|
|
3238
|
+
bbox = extract_bounding_box(full_chunk) if image else None
|
|
3239
|
+
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
|
|
3240
|
+
|
|
3241
|
+
if tag_name == DocumentToken.OTSL.value:
|
|
3242
|
+
table_data = parse_table_content(full_chunk)
|
|
3243
|
+
bbox = extract_bounding_box(full_chunk) if image else None
|
|
3244
|
+
|
|
3245
|
+
if bbox:
|
|
3246
|
+
prov = ProvenanceItem(
|
|
3247
|
+
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3248
|
+
charspan=(0, 0),
|
|
3249
|
+
page_no=page_no,
|
|
3250
|
+
)
|
|
3251
|
+
self.add_table(data=table_data, prov=prov)
|
|
3252
|
+
else:
|
|
3253
|
+
self.add_table(data=table_data)
|
|
3254
|
+
|
|
3255
|
+
elif tag_name == DocItemLabel.PICTURE:
|
|
3256
|
+
text_caption_content = extract_inner_text(full_chunk)
|
|
3257
|
+
if image:
|
|
3258
|
+
if bbox:
|
|
3259
|
+
im_width, im_height = image.size
|
|
3260
|
+
|
|
3261
|
+
crop_box = (
|
|
3262
|
+
int(bbox.l * im_width),
|
|
3263
|
+
int(bbox.t * im_height),
|
|
3264
|
+
int(bbox.r * im_width),
|
|
3265
|
+
int(bbox.b * im_height),
|
|
3266
|
+
)
|
|
3267
|
+
cropped_image = image.crop(crop_box)
|
|
3268
|
+
pic = self.add_picture(
|
|
3269
|
+
parent=None,
|
|
3270
|
+
image=ImageRef.from_pil(image=cropped_image, dpi=72),
|
|
3271
|
+
prov=(
|
|
3272
|
+
ProvenanceItem(
|
|
3273
|
+
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3274
|
+
charspan=(0, 0),
|
|
3275
|
+
page_no=page_no,
|
|
3276
|
+
)
|
|
3277
|
+
),
|
|
3278
|
+
)
|
|
3279
|
+
# If there is a caption to an image, add it as well
|
|
3280
|
+
if len(text_caption_content) > 0:
|
|
3281
|
+
caption_item = self.add_text(
|
|
3282
|
+
label=DocItemLabel.CAPTION,
|
|
3283
|
+
text=text_caption_content,
|
|
3284
|
+
parent=None,
|
|
3285
|
+
)
|
|
3286
|
+
pic.captions.append(caption_item.get_ref())
|
|
3287
|
+
else:
|
|
3288
|
+
if bbox:
|
|
3289
|
+
# In case we don't have access to the binary of an image
|
|
3290
|
+
self.add_picture(
|
|
3291
|
+
parent=None,
|
|
3292
|
+
prov=ProvenanceItem(
|
|
3293
|
+
bbox=bbox, charspan=(0, 0), page_no=page_no
|
|
3294
|
+
),
|
|
3295
|
+
)
|
|
3296
|
+
# If there is a caption to an image, add it as well
|
|
3297
|
+
if len(text_caption_content) > 0:
|
|
3298
|
+
caption_item = self.add_text(
|
|
3299
|
+
label=DocItemLabel.CAPTION,
|
|
3300
|
+
text=text_caption_content,
|
|
3301
|
+
parent=None,
|
|
3302
|
+
)
|
|
3303
|
+
pic.captions.append(caption_item.get_ref())
|
|
3304
|
+
elif tag_name in [
|
|
3305
|
+
DocumentToken.ORDERED_LIST.value,
|
|
3306
|
+
DocumentToken.UNORDERED_LIST.value,
|
|
3307
|
+
]:
|
|
3308
|
+
list_label = GroupLabel.LIST
|
|
3309
|
+
enum_marker = ""
|
|
3310
|
+
enum_value = 0
|
|
3311
|
+
if tag_name == DocumentToken.ORDERED_LIST.value:
|
|
3312
|
+
list_label = GroupLabel.ORDERED_LIST
|
|
3313
|
+
|
|
3314
|
+
list_item_pattern = (
|
|
3315
|
+
rf"<(?P<tag>{DocItemLabel.LIST_ITEM})>.*?</(?P=tag)>"
|
|
3316
|
+
)
|
|
3317
|
+
li_pattern = re.compile(list_item_pattern, re.DOTALL)
|
|
3318
|
+
# Add list group:
|
|
3319
|
+
new_list = self.add_group(label=list_label, name="list")
|
|
3320
|
+
# Process list items
|
|
3321
|
+
for li_match in li_pattern.finditer(full_chunk):
|
|
3322
|
+
enum_value += 1
|
|
3323
|
+
if tag_name == DocumentToken.ORDERED_LIST.value:
|
|
3324
|
+
enum_marker = str(enum_value) + "."
|
|
3325
|
+
|
|
3326
|
+
li_full_chunk = li_match.group(0)
|
|
3327
|
+
li_bbox = extract_bounding_box(li_full_chunk) if image else None
|
|
3328
|
+
text_content = extract_inner_text(li_full_chunk)
|
|
3329
|
+
# Add list item
|
|
3330
|
+
self.add_list_item(
|
|
3331
|
+
marker=enum_marker,
|
|
3332
|
+
enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
|
|
3333
|
+
parent=new_list,
|
|
3334
|
+
text=text_content,
|
|
3335
|
+
prov=(
|
|
3336
|
+
ProvenanceItem(
|
|
3337
|
+
bbox=li_bbox.resize_by_scale(pg_width, pg_height),
|
|
3338
|
+
charspan=(0, len(text_content)),
|
|
3339
|
+
page_no=page_no,
|
|
3340
|
+
)
|
|
3341
|
+
if li_bbox
|
|
3342
|
+
else None
|
|
3343
|
+
),
|
|
3344
|
+
)
|
|
3345
|
+
else:
|
|
3346
|
+
# For everything else, treat as text
|
|
3347
|
+
text_content = extract_inner_text(full_chunk)
|
|
3348
|
+
self.add_text(
|
|
3349
|
+
label=doc_label,
|
|
3350
|
+
text=text_content,
|
|
3351
|
+
prov=(
|
|
3352
|
+
ProvenanceItem(
|
|
3353
|
+
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3354
|
+
charspan=(0, len(text_content)),
|
|
3355
|
+
page_no=page_no,
|
|
3356
|
+
)
|
|
3357
|
+
if bbox
|
|
3358
|
+
else None
|
|
3359
|
+
),
|
|
3360
|
+
)
|
|
3361
|
+
return self
|
|
3362
|
+
|
|
3363
|
+
@deprecated("Use save_as_doctags instead.")
|
|
3364
|
+
def save_as_document_tokens(self, *args, **kwargs):
|
|
3365
|
+
r"""Save the document content to a DocumentToken format."""
|
|
3366
|
+
return self.save_as_doctags(*args, **kwargs)
|
|
3367
|
+
|
|
3368
|
+
def save_as_doctags(
|
|
2940
3369
|
self,
|
|
2941
3370
|
filename: Path,
|
|
2942
3371
|
delim: str = "",
|
|
@@ -2952,7 +3381,7 @@ class DoclingDocument(BaseModel):
|
|
|
2952
3381
|
add_table_cell_location: bool = False,
|
|
2953
3382
|
add_table_cell_text: bool = True,
|
|
2954
3383
|
):
|
|
2955
|
-
r"""Save the document content to a DocumentToken format."""
|
|
3384
|
+
r"""Save the document content to DocTags format."""
|
|
2956
3385
|
out = self.export_to_document_tokens(
|
|
2957
3386
|
delim=delim,
|
|
2958
3387
|
from_element=from_element,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.21.2 → docling_core-2.22.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.21.2 → docling_core-2.22.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.21.2 → docling_core-2.22.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|