docling-core 2.21.1__tar.gz → 2.22.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.21.1 → docling_core-2.22.0}/PKG-INFO +1 -1
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/transforms/chunker/hybrid_chunker.py +2 -2
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/doc/document.py +449 -10
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/doc/tokens.py +1 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/pyproject.toml +1 -1
- {docling_core-2.21.1 → docling_core-2.22.0}/LICENSE +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/README.md +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/py.typed +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/search/package.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/base.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.21.1 → docling_core-2.22.0}/docling_core/utils/validators.py +0 -0
{docling_core-2.21.1 → docling_core-2.22.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
@@ -73,7 +73,7 @@ class HybridChunker(BaseChunker):
|
|
|
73
73
|
for t in text:
|
|
74
74
|
total += self._count_text_tokens(t)
|
|
75
75
|
return total
|
|
76
|
-
return len(self._tokenizer.tokenize(text
|
|
76
|
+
return len(self._tokenizer.tokenize(text))
|
|
77
77
|
|
|
78
78
|
class _ChunkLengthInfo(BaseModel):
|
|
79
79
|
total_len: int
|
|
@@ -82,7 +82,7 @@ class HybridChunker(BaseChunker):
|
|
|
82
82
|
|
|
83
83
|
def _count_chunk_tokens(self, doc_chunk: DocChunk):
|
|
84
84
|
ser_txt = self.serialize(chunk=doc_chunk)
|
|
85
|
-
return len(self._tokenizer.tokenize(text=ser_txt
|
|
85
|
+
return len(self._tokenizer.tokenize(text=ser_txt))
|
|
86
86
|
|
|
87
87
|
def _doc_chunk_length(self, doc_chunk: DocChunk):
|
|
88
88
|
text_length = self._count_text_tokens(doc_chunk.text)
|
|
{docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/doc/document.py
RENAMED
|
@@ -4,6 +4,7 @@ import base64
|
|
|
4
4
|
import copy
|
|
5
5
|
import hashlib
|
|
6
6
|
import html
|
|
7
|
+
import itertools
|
|
7
8
|
import json
|
|
8
9
|
import logging
|
|
9
10
|
import mimetypes
|
|
@@ -37,7 +38,7 @@ from pydantic import (
|
|
|
37
38
|
model_validator,
|
|
38
39
|
)
|
|
39
40
|
from tabulate import tabulate
|
|
40
|
-
from typing_extensions import Annotated, Self
|
|
41
|
+
from typing_extensions import Annotated, Self, deprecated
|
|
41
42
|
|
|
42
43
|
from docling_core.search.package import VERSION_PATTERN
|
|
43
44
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
@@ -522,6 +523,49 @@ class ImageRef(BaseModel):
|
|
|
522
523
|
)
|
|
523
524
|
|
|
524
525
|
|
|
526
|
+
class DocTagsPage(BaseModel):
|
|
527
|
+
"""DocTagsPage."""
|
|
528
|
+
|
|
529
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
530
|
+
|
|
531
|
+
tokens: str
|
|
532
|
+
image: Optional[PILImage.Image] = None
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
class DocTagsDocument(BaseModel):
|
|
536
|
+
"""DocTagsDocument."""
|
|
537
|
+
|
|
538
|
+
pages: List[DocTagsPage] = []
|
|
539
|
+
|
|
540
|
+
@classmethod
|
|
541
|
+
def from_doctags_and_image_pairs(
|
|
542
|
+
cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
|
|
543
|
+
):
|
|
544
|
+
"""from_doctags_and_image_pairs."""
|
|
545
|
+
if len(doctags) != len(images):
|
|
546
|
+
raise ValueError("Number of page doctags must be equal to page images!")
|
|
547
|
+
doctags_doc = cls()
|
|
548
|
+
|
|
549
|
+
pages = []
|
|
550
|
+
for dt, img in zip(doctags, images):
|
|
551
|
+
if isinstance(dt, Path):
|
|
552
|
+
with dt.open("r") as fp:
|
|
553
|
+
dt = fp.read()
|
|
554
|
+
elif isinstance(dt, str):
|
|
555
|
+
pass
|
|
556
|
+
|
|
557
|
+
if isinstance(img, Path):
|
|
558
|
+
img = PILImage.open(img)
|
|
559
|
+
elif isinstance(dt, PILImage.Image):
|
|
560
|
+
pass
|
|
561
|
+
|
|
562
|
+
page = DocTagsPage(tokens=dt, image=img)
|
|
563
|
+
pages.append(page)
|
|
564
|
+
|
|
565
|
+
doctags_doc.pages = pages
|
|
566
|
+
return doctags_doc
|
|
567
|
+
|
|
568
|
+
|
|
525
569
|
class ProvenanceItem(BaseModel):
|
|
526
570
|
"""ProvenanceItem."""
|
|
527
571
|
|
|
@@ -800,7 +844,7 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
800
844
|
:param add_content: bool: (Default value = True)
|
|
801
845
|
|
|
802
846
|
"""
|
|
803
|
-
body = f"<{self.label.value}{new_line}"
|
|
847
|
+
body = f"<{self.label.value}>{new_line}"
|
|
804
848
|
|
|
805
849
|
if add_location:
|
|
806
850
|
body += self.get_location_tokens(
|
|
@@ -813,7 +857,7 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
813
857
|
if add_content and self.text is not None:
|
|
814
858
|
body += f"<_{self.code_language.value}_>{self.text}{new_line}"
|
|
815
859
|
|
|
816
|
-
body += f"</{self.label.value}
|
|
860
|
+
body += f"</{self.label.value}>\n"
|
|
817
861
|
|
|
818
862
|
return body
|
|
819
863
|
|
|
@@ -1003,6 +1047,20 @@ class PictureItem(FloatingItem):
|
|
|
1003
1047
|
predicted_class = classifications[0].predicted_classes[0].class_name
|
|
1004
1048
|
body += DocumentToken.get_picture_classification_token(predicted_class)
|
|
1005
1049
|
|
|
1050
|
+
smiles_annotations = [
|
|
1051
|
+
ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
|
|
1052
|
+
]
|
|
1053
|
+
if len(smiles_annotations) > 0:
|
|
1054
|
+
body += (
|
|
1055
|
+
"<"
|
|
1056
|
+
+ DocumentToken.SMILES.value
|
|
1057
|
+
+ ">"
|
|
1058
|
+
+ smiles_annotations[0].smi
|
|
1059
|
+
+ "</"
|
|
1060
|
+
+ DocumentToken.SMILES.value
|
|
1061
|
+
+ ">"
|
|
1062
|
+
)
|
|
1063
|
+
|
|
1006
1064
|
if add_caption and len(self.captions):
|
|
1007
1065
|
text = self.caption_text(doc)
|
|
1008
1066
|
|
|
@@ -2487,7 +2545,6 @@ class DoclingDocument(BaseModel):
|
|
|
2487
2545
|
is_inline_scope=is_inline_scope,
|
|
2488
2546
|
visited=visited,
|
|
2489
2547
|
)
|
|
2490
|
-
# NOTE: assumes unordered (flag & marker currently in ListItem)
|
|
2491
2548
|
indent_str = list_level * indent * " "
|
|
2492
2549
|
is_ol = item.label == GroupLabel.ORDERED_LIST
|
|
2493
2550
|
text = "\n".join(
|
|
@@ -2501,7 +2558,12 @@ class DoclingDocument(BaseModel):
|
|
|
2501
2558
|
for i, c in enumerate(comps)
|
|
2502
2559
|
]
|
|
2503
2560
|
)
|
|
2504
|
-
_ingest_text(
|
|
2561
|
+
_ingest_text(
|
|
2562
|
+
text=text,
|
|
2563
|
+
# special chars have already been escaped as needed
|
|
2564
|
+
do_escape_html=False,
|
|
2565
|
+
do_escape_underscores=False,
|
|
2566
|
+
)
|
|
2505
2567
|
elif item.label == GroupLabel.INLINE:
|
|
2506
2568
|
comps = self._get_markdown_components(
|
|
2507
2569
|
node=item,
|
|
@@ -2520,7 +2582,13 @@ class DoclingDocument(BaseModel):
|
|
|
2520
2582
|
is_inline_scope=True,
|
|
2521
2583
|
visited=visited,
|
|
2522
2584
|
)
|
|
2523
|
-
|
|
2585
|
+
text = " ".join(comps)
|
|
2586
|
+
_ingest_text(
|
|
2587
|
+
text=text,
|
|
2588
|
+
# special chars have already been escaped as needed
|
|
2589
|
+
do_escape_html=False,
|
|
2590
|
+
do_escape_underscores=False,
|
|
2591
|
+
)
|
|
2524
2592
|
else:
|
|
2525
2593
|
continue
|
|
2526
2594
|
|
|
@@ -2838,7 +2906,7 @@ class DoclingDocument(BaseModel):
|
|
|
2838
2906
|
|
|
2839
2907
|
# Building a math equation in MathML format
|
|
2840
2908
|
# ref https://www.w3.org/TR/wai-aria-1.1/#math
|
|
2841
|
-
elif formula_to_mathml:
|
|
2909
|
+
elif formula_to_mathml and len(math_formula) > 0:
|
|
2842
2910
|
try:
|
|
2843
2911
|
mathml_element = latex2mathml.converter.convert_to_element(
|
|
2844
2912
|
math_formula, display="block"
|
|
@@ -2860,7 +2928,7 @@ class DoclingDocument(BaseModel):
|
|
|
2860
2928
|
and img_fallback is not None
|
|
2861
2929
|
):
|
|
2862
2930
|
text = img_fallback
|
|
2863
|
-
|
|
2931
|
+
else:
|
|
2864
2932
|
text = f"<pre>{math_formula}</pre>"
|
|
2865
2933
|
|
|
2866
2934
|
elif math_formula != "":
|
|
@@ -2926,7 +2994,378 @@ class DoclingDocument(BaseModel):
|
|
|
2926
2994
|
|
|
2927
2995
|
return html_text
|
|
2928
2996
|
|
|
2929
|
-
def
|
|
2997
|
+
def load_from_doctags( # noqa: C901
|
|
2998
|
+
self,
|
|
2999
|
+
doctag_document: DocTagsDocument,
|
|
3000
|
+
) -> "DoclingDocument":
|
|
3001
|
+
r"""Load Docling document from lists of DocTags and Images."""
|
|
3002
|
+
# Maps the recognized tag to a Docling label.
|
|
3003
|
+
# Code items will be given DocItemLabel.CODE
|
|
3004
|
+
tag_to_doclabel = {
|
|
3005
|
+
"title": DocItemLabel.TITLE,
|
|
3006
|
+
"document_index": DocItemLabel.DOCUMENT_INDEX,
|
|
3007
|
+
"otsl": DocItemLabel.TABLE,
|
|
3008
|
+
"section_header_level_1": DocItemLabel.SECTION_HEADER,
|
|
3009
|
+
"checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
|
|
3010
|
+
"checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
|
|
3011
|
+
"text": DocItemLabel.TEXT,
|
|
3012
|
+
"page_header": DocItemLabel.PAGE_HEADER,
|
|
3013
|
+
"page_footer": DocItemLabel.PAGE_FOOTER,
|
|
3014
|
+
"formula": DocItemLabel.FORMULA,
|
|
3015
|
+
"caption": DocItemLabel.CAPTION,
|
|
3016
|
+
"picture": DocItemLabel.PICTURE,
|
|
3017
|
+
"list_item": DocItemLabel.LIST_ITEM,
|
|
3018
|
+
"footnote": DocItemLabel.FOOTNOTE,
|
|
3019
|
+
"code": DocItemLabel.CODE,
|
|
3020
|
+
}
|
|
3021
|
+
|
|
3022
|
+
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
|
|
3023
|
+
"""Extract <loc_...> coords from the chunk, normalized by / 500."""
|
|
3024
|
+
coords = re.findall(r"<loc_(\d+)>", text_chunk)
|
|
3025
|
+
if len(coords) == 4:
|
|
3026
|
+
l, t, r, b = map(float, coords)
|
|
3027
|
+
return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
|
|
3028
|
+
return None
|
|
3029
|
+
|
|
3030
|
+
def extract_inner_text(text_chunk: str) -> str:
|
|
3031
|
+
"""Strip all <...> tags inside the chunk to get the raw text content."""
|
|
3032
|
+
return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
|
|
3033
|
+
|
|
3034
|
+
def otsl_parse_texts(texts, tokens):
|
|
3035
|
+
split_word = TableToken.OTSL_NL.value
|
|
3036
|
+
split_row_tokens = [
|
|
3037
|
+
list(y)
|
|
3038
|
+
for x, y in itertools.groupby(tokens, lambda z: z == split_word)
|
|
3039
|
+
if not x
|
|
3040
|
+
]
|
|
3041
|
+
table_cells = []
|
|
3042
|
+
r_idx = 0
|
|
3043
|
+
c_idx = 0
|
|
3044
|
+
|
|
3045
|
+
def count_right(tokens, c_idx, r_idx, which_tokens):
|
|
3046
|
+
span = 0
|
|
3047
|
+
c_idx_iter = c_idx
|
|
3048
|
+
while tokens[r_idx][c_idx_iter] in which_tokens:
|
|
3049
|
+
c_idx_iter += 1
|
|
3050
|
+
span += 1
|
|
3051
|
+
if c_idx_iter >= len(tokens[r_idx]):
|
|
3052
|
+
return span
|
|
3053
|
+
return span
|
|
3054
|
+
|
|
3055
|
+
def count_down(tokens, c_idx, r_idx, which_tokens):
|
|
3056
|
+
span = 0
|
|
3057
|
+
r_idx_iter = r_idx
|
|
3058
|
+
while tokens[r_idx_iter][c_idx] in which_tokens:
|
|
3059
|
+
r_idx_iter += 1
|
|
3060
|
+
span += 1
|
|
3061
|
+
if r_idx_iter >= len(tokens):
|
|
3062
|
+
return span
|
|
3063
|
+
return span
|
|
3064
|
+
|
|
3065
|
+
for i, text in enumerate(texts):
|
|
3066
|
+
cell_text = ""
|
|
3067
|
+
if text in [
|
|
3068
|
+
TableToken.OTSL_FCEL.value,
|
|
3069
|
+
TableToken.OTSL_ECEL.value,
|
|
3070
|
+
TableToken.OTSL_CHED.value,
|
|
3071
|
+
TableToken.OTSL_RHED.value,
|
|
3072
|
+
TableToken.OTSL_SROW.value,
|
|
3073
|
+
]:
|
|
3074
|
+
row_span = 1
|
|
3075
|
+
col_span = 1
|
|
3076
|
+
right_offset = 1
|
|
3077
|
+
if text != TableToken.OTSL_ECEL.value:
|
|
3078
|
+
cell_text = texts[i + 1]
|
|
3079
|
+
right_offset = 2
|
|
3080
|
+
|
|
3081
|
+
# Check next element(s) for lcel / ucel / xcel,
|
|
3082
|
+
# set properly row_span, col_span
|
|
3083
|
+
next_right_cell = ""
|
|
3084
|
+
if i + right_offset < len(texts):
|
|
3085
|
+
next_right_cell = texts[i + right_offset]
|
|
3086
|
+
|
|
3087
|
+
next_bottom_cell = ""
|
|
3088
|
+
if r_idx + 1 < len(split_row_tokens):
|
|
3089
|
+
if c_idx < len(split_row_tokens[r_idx + 1]):
|
|
3090
|
+
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
|
|
3091
|
+
|
|
3092
|
+
if next_right_cell in [
|
|
3093
|
+
TableToken.OTSL_LCEL.value,
|
|
3094
|
+
TableToken.OTSL_XCEL.value,
|
|
3095
|
+
]:
|
|
3096
|
+
# we have a horizontal spanning cell or 2d spanning cell
|
|
3097
|
+
col_span += count_right(
|
|
3098
|
+
split_row_tokens,
|
|
3099
|
+
c_idx + 1,
|
|
3100
|
+
r_idx,
|
|
3101
|
+
[TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
|
|
3102
|
+
)
|
|
3103
|
+
if next_bottom_cell in [
|
|
3104
|
+
TableToken.OTSL_UCEL.value,
|
|
3105
|
+
TableToken.OTSL_XCEL.value,
|
|
3106
|
+
]:
|
|
3107
|
+
# we have a vertical spanning cell or 2d spanning cell
|
|
3108
|
+
row_span += count_down(
|
|
3109
|
+
split_row_tokens,
|
|
3110
|
+
c_idx,
|
|
3111
|
+
r_idx + 1,
|
|
3112
|
+
[TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
|
|
3113
|
+
)
|
|
3114
|
+
|
|
3115
|
+
table_cells.append(
|
|
3116
|
+
TableCell(
|
|
3117
|
+
text=cell_text.strip(),
|
|
3118
|
+
row_span=row_span,
|
|
3119
|
+
col_span=col_span,
|
|
3120
|
+
start_row_offset_idx=r_idx,
|
|
3121
|
+
end_row_offset_idx=r_idx + row_span,
|
|
3122
|
+
start_col_offset_idx=c_idx,
|
|
3123
|
+
end_col_offset_idx=c_idx + col_span,
|
|
3124
|
+
)
|
|
3125
|
+
)
|
|
3126
|
+
if text in [
|
|
3127
|
+
TableToken.OTSL_FCEL.value,
|
|
3128
|
+
TableToken.OTSL_ECEL.value,
|
|
3129
|
+
TableToken.OTSL_CHED.value,
|
|
3130
|
+
TableToken.OTSL_RHED.value,
|
|
3131
|
+
TableToken.OTSL_SROW.value,
|
|
3132
|
+
TableToken.OTSL_LCEL.value,
|
|
3133
|
+
TableToken.OTSL_UCEL.value,
|
|
3134
|
+
TableToken.OTSL_XCEL.value,
|
|
3135
|
+
]:
|
|
3136
|
+
c_idx += 1
|
|
3137
|
+
if text == TableToken.OTSL_NL.value:
|
|
3138
|
+
r_idx += 1
|
|
3139
|
+
c_idx = 0
|
|
3140
|
+
return table_cells, split_row_tokens
|
|
3141
|
+
|
|
3142
|
+
def otsl_extract_tokens_and_text(s: str):
|
|
3143
|
+
# Pattern to match anything enclosed by < >
|
|
3144
|
+
# (including the angle brackets themselves)
|
|
3145
|
+
pattern = r"(<[^>]+>)"
|
|
3146
|
+
# Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
|
|
3147
|
+
tokens = re.findall(pattern, s)
|
|
3148
|
+
# Remove any tokens that start with "<loc_"
|
|
3149
|
+
tokens = [
|
|
3150
|
+
token
|
|
3151
|
+
for token in tokens
|
|
3152
|
+
if not (
|
|
3153
|
+
token.startswith(rf"<{DocumentToken.LOC.value}")
|
|
3154
|
+
or token
|
|
3155
|
+
in [
|
|
3156
|
+
rf"<{DocumentToken.OTSL.value}>",
|
|
3157
|
+
rf"</{DocumentToken.OTSL.value}>",
|
|
3158
|
+
]
|
|
3159
|
+
)
|
|
3160
|
+
]
|
|
3161
|
+
# Split the string by those tokens to get the in-between text
|
|
3162
|
+
text_parts = re.split(pattern, s)
|
|
3163
|
+
text_parts = [
|
|
3164
|
+
token
|
|
3165
|
+
for token in text_parts
|
|
3166
|
+
if not (
|
|
3167
|
+
token.startswith(rf"<{DocumentToken.LOC.value}")
|
|
3168
|
+
or token
|
|
3169
|
+
in [
|
|
3170
|
+
rf"<{DocumentToken.OTSL.value}>",
|
|
3171
|
+
rf"</{DocumentToken.OTSL.value}>",
|
|
3172
|
+
]
|
|
3173
|
+
)
|
|
3174
|
+
]
|
|
3175
|
+
# Remove any empty or purely whitespace strings from text_parts
|
|
3176
|
+
text_parts = [part for part in text_parts if part.strip()]
|
|
3177
|
+
|
|
3178
|
+
return tokens, text_parts
|
|
3179
|
+
|
|
3180
|
+
def parse_table_content(otsl_content: str) -> TableData:
|
|
3181
|
+
tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
|
|
3182
|
+
table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
|
|
3183
|
+
|
|
3184
|
+
return TableData(
|
|
3185
|
+
num_rows=len(split_row_tokens),
|
|
3186
|
+
num_cols=(
|
|
3187
|
+
max(len(row) for row in split_row_tokens) if split_row_tokens else 0
|
|
3188
|
+
),
|
|
3189
|
+
table_cells=table_cells,
|
|
3190
|
+
)
|
|
3191
|
+
|
|
3192
|
+
# doc = DoclingDocument(name="Document")
|
|
3193
|
+
for pg_idx, doctag_page in enumerate(doctag_document.pages):
|
|
3194
|
+
page_doctags = doctag_page.tokens
|
|
3195
|
+
image = doctag_page.image
|
|
3196
|
+
|
|
3197
|
+
page_no = pg_idx + 1
|
|
3198
|
+
# bounding_boxes = []
|
|
3199
|
+
|
|
3200
|
+
if image is not None:
|
|
3201
|
+
pg_width = image.width
|
|
3202
|
+
pg_height = image.height
|
|
3203
|
+
else:
|
|
3204
|
+
pg_width = 1
|
|
3205
|
+
pg_height = 1
|
|
3206
|
+
|
|
3207
|
+
"""
|
|
3208
|
+
1. Finds all <tag>...</tag>
|
|
3209
|
+
blocks in the entire string (multi-line friendly)
|
|
3210
|
+
in the order they appear.
|
|
3211
|
+
2. For each chunk, extracts bounding box (if any) and inner text.
|
|
3212
|
+
3. Adds the item to a DoclingDocument structure with the right label.
|
|
3213
|
+
4. Tracks bounding boxes+color in a separate list for later visualization.
|
|
3214
|
+
"""
|
|
3215
|
+
|
|
3216
|
+
# Regex for root level recognized tags
|
|
3217
|
+
tag_pattern = (
|
|
3218
|
+
rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
|
|
3219
|
+
rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
|
|
3220
|
+
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
|
|
3221
|
+
rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
|
|
3222
|
+
rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
|
|
3223
|
+
rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
|
|
3224
|
+
rf"{DocItemLabel.SECTION_HEADER}_level_1|"
|
|
3225
|
+
rf"{DocumentToken.ORDERED_LIST.value}|"
|
|
3226
|
+
rf"{DocumentToken.UNORDERED_LIST.value}|"
|
|
3227
|
+
rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
|
|
3228
|
+
)
|
|
3229
|
+
|
|
3230
|
+
# DocumentToken.OTSL
|
|
3231
|
+
pattern = re.compile(tag_pattern, re.DOTALL)
|
|
3232
|
+
|
|
3233
|
+
# Go through each match in order
|
|
3234
|
+
for match in pattern.finditer(page_doctags):
|
|
3235
|
+
full_chunk = match.group(0)
|
|
3236
|
+
tag_name = match.group("tag")
|
|
3237
|
+
|
|
3238
|
+
bbox = extract_bounding_box(full_chunk) if image else None
|
|
3239
|
+
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
|
|
3240
|
+
|
|
3241
|
+
if tag_name == DocumentToken.OTSL.value:
|
|
3242
|
+
table_data = parse_table_content(full_chunk)
|
|
3243
|
+
bbox = extract_bounding_box(full_chunk) if image else None
|
|
3244
|
+
|
|
3245
|
+
if bbox:
|
|
3246
|
+
prov = ProvenanceItem(
|
|
3247
|
+
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3248
|
+
charspan=(0, 0),
|
|
3249
|
+
page_no=page_no,
|
|
3250
|
+
)
|
|
3251
|
+
self.add_table(data=table_data, prov=prov)
|
|
3252
|
+
else:
|
|
3253
|
+
self.add_table(data=table_data)
|
|
3254
|
+
|
|
3255
|
+
elif tag_name == DocItemLabel.PICTURE:
|
|
3256
|
+
text_caption_content = extract_inner_text(full_chunk)
|
|
3257
|
+
if image:
|
|
3258
|
+
if bbox:
|
|
3259
|
+
im_width, im_height = image.size
|
|
3260
|
+
|
|
3261
|
+
crop_box = (
|
|
3262
|
+
int(bbox.l * im_width),
|
|
3263
|
+
int(bbox.t * im_height),
|
|
3264
|
+
int(bbox.r * im_width),
|
|
3265
|
+
int(bbox.b * im_height),
|
|
3266
|
+
)
|
|
3267
|
+
cropped_image = image.crop(crop_box)
|
|
3268
|
+
pic = self.add_picture(
|
|
3269
|
+
parent=None,
|
|
3270
|
+
image=ImageRef.from_pil(image=cropped_image, dpi=72),
|
|
3271
|
+
prov=(
|
|
3272
|
+
ProvenanceItem(
|
|
3273
|
+
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3274
|
+
charspan=(0, 0),
|
|
3275
|
+
page_no=page_no,
|
|
3276
|
+
)
|
|
3277
|
+
),
|
|
3278
|
+
)
|
|
3279
|
+
# If there is a caption to an image, add it as well
|
|
3280
|
+
if len(text_caption_content) > 0:
|
|
3281
|
+
caption_item = self.add_text(
|
|
3282
|
+
label=DocItemLabel.CAPTION,
|
|
3283
|
+
text=text_caption_content,
|
|
3284
|
+
parent=None,
|
|
3285
|
+
)
|
|
3286
|
+
pic.captions.append(caption_item.get_ref())
|
|
3287
|
+
else:
|
|
3288
|
+
if bbox:
|
|
3289
|
+
# In case we don't have access to the binary of an image
|
|
3290
|
+
self.add_picture(
|
|
3291
|
+
parent=None,
|
|
3292
|
+
prov=ProvenanceItem(
|
|
3293
|
+
bbox=bbox, charspan=(0, 0), page_no=page_no
|
|
3294
|
+
),
|
|
3295
|
+
)
|
|
3296
|
+
# If there is a caption to an image, add it as well
|
|
3297
|
+
if len(text_caption_content) > 0:
|
|
3298
|
+
caption_item = self.add_text(
|
|
3299
|
+
label=DocItemLabel.CAPTION,
|
|
3300
|
+
text=text_caption_content,
|
|
3301
|
+
parent=None,
|
|
3302
|
+
)
|
|
3303
|
+
pic.captions.append(caption_item.get_ref())
|
|
3304
|
+
elif tag_name in [
|
|
3305
|
+
DocumentToken.ORDERED_LIST.value,
|
|
3306
|
+
DocumentToken.UNORDERED_LIST.value,
|
|
3307
|
+
]:
|
|
3308
|
+
list_label = GroupLabel.LIST
|
|
3309
|
+
enum_marker = ""
|
|
3310
|
+
enum_value = 0
|
|
3311
|
+
if tag_name == DocumentToken.ORDERED_LIST.value:
|
|
3312
|
+
list_label = GroupLabel.ORDERED_LIST
|
|
3313
|
+
|
|
3314
|
+
list_item_pattern = (
|
|
3315
|
+
rf"<(?P<tag>{DocItemLabel.LIST_ITEM})>.*?</(?P=tag)>"
|
|
3316
|
+
)
|
|
3317
|
+
li_pattern = re.compile(list_item_pattern, re.DOTALL)
|
|
3318
|
+
# Add list group:
|
|
3319
|
+
new_list = self.add_group(label=list_label, name="list")
|
|
3320
|
+
# Process list items
|
|
3321
|
+
for li_match in li_pattern.finditer(full_chunk):
|
|
3322
|
+
enum_value += 1
|
|
3323
|
+
if tag_name == DocumentToken.ORDERED_LIST.value:
|
|
3324
|
+
enum_marker = str(enum_value) + "."
|
|
3325
|
+
|
|
3326
|
+
li_full_chunk = li_match.group(0)
|
|
3327
|
+
li_bbox = extract_bounding_box(li_full_chunk) if image else None
|
|
3328
|
+
text_content = extract_inner_text(li_full_chunk)
|
|
3329
|
+
# Add list item
|
|
3330
|
+
self.add_list_item(
|
|
3331
|
+
marker=enum_marker,
|
|
3332
|
+
enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
|
|
3333
|
+
parent=new_list,
|
|
3334
|
+
text=text_content,
|
|
3335
|
+
prov=(
|
|
3336
|
+
ProvenanceItem(
|
|
3337
|
+
bbox=li_bbox.resize_by_scale(pg_width, pg_height),
|
|
3338
|
+
charspan=(0, len(text_content)),
|
|
3339
|
+
page_no=page_no,
|
|
3340
|
+
)
|
|
3341
|
+
if li_bbox
|
|
3342
|
+
else None
|
|
3343
|
+
),
|
|
3344
|
+
)
|
|
3345
|
+
else:
|
|
3346
|
+
# For everything else, treat as text
|
|
3347
|
+
text_content = extract_inner_text(full_chunk)
|
|
3348
|
+
self.add_text(
|
|
3349
|
+
label=doc_label,
|
|
3350
|
+
text=text_content,
|
|
3351
|
+
prov=(
|
|
3352
|
+
ProvenanceItem(
|
|
3353
|
+
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3354
|
+
charspan=(0, len(text_content)),
|
|
3355
|
+
page_no=page_no,
|
|
3356
|
+
)
|
|
3357
|
+
if bbox
|
|
3358
|
+
else None
|
|
3359
|
+
),
|
|
3360
|
+
)
|
|
3361
|
+
return self
|
|
3362
|
+
|
|
3363
|
+
@deprecated("Use save_as_doctags instead.")
|
|
3364
|
+
def save_as_document_tokens(self, *args, **kwargs):
|
|
3365
|
+
r"""Save the document content to a DocumentToken format."""
|
|
3366
|
+
return self.save_as_doctags(*args, **kwargs)
|
|
3367
|
+
|
|
3368
|
+
def save_as_doctags(
|
|
2930
3369
|
self,
|
|
2931
3370
|
filename: Path,
|
|
2932
3371
|
delim: str = "",
|
|
@@ -2942,7 +3381,7 @@ class DoclingDocument(BaseModel):
|
|
|
2942
3381
|
add_table_cell_location: bool = False,
|
|
2943
3382
|
add_table_cell_text: bool = True,
|
|
2944
3383
|
):
|
|
2945
|
-
r"""Save the document content to
|
|
3384
|
+
r"""Save the document content to DocTags format."""
|
|
2946
3385
|
out = self.export_to_document_tokens(
|
|
2947
3386
|
delim=delim,
|
|
2948
3387
|
from_element=from_element,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.21.1 → docling_core-2.22.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.21.1 → docling_core-2.22.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.21.1 → docling_core-2.22.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|