docling-core 2.44.2__tar.gz → 2.45.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.44.2 → docling_core-2.45.0}/PKG-INFO +1 -1
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/html.py +2 -2
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/document.py +165 -236
- docling_core-2.45.0/docling_core/types/doc/utils.py +282 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.44.2 → docling_core-2.45.0}/pyproject.toml +1 -1
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_docling_doc.py +119 -1
- docling_core-2.44.2/docling_core/types/doc/utils.py +0 -86
- {docling_core-2.44.2 → docling_core-2.45.0}/LICENSE +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/README.md +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/py.typed +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/search/package.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/page_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/common.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/markdown.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/validators.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/SOURCES.txt +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/setup.cfg +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_collection.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_doc_base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_doc_schema.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_doctags_load.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_nlp_qa.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_page.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_page_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_rec_schema.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_search_meta.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_serialization.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_utils.py +0 -0
- {docling_core-2.44.2 → docling_core-2.45.0}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.44.2
|
|
3
|
+
Version: 2.45.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -1057,7 +1057,7 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
1057
1057
|
if self.params.html_head is not None:
|
|
1058
1058
|
return self.params.html_head
|
|
1059
1059
|
|
|
1060
|
-
head_parts = ["<head>", '<meta charset="UTF-8">']
|
|
1060
|
+
head_parts = ["<head>", '<meta charset="UTF-8"/>']
|
|
1061
1061
|
|
|
1062
1062
|
# Add metadata if requested
|
|
1063
1063
|
if params.add_document_metadata:
|
|
@@ -1067,7 +1067,7 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
1067
1067
|
head_parts.append("<title>Docling Document</title>")
|
|
1068
1068
|
|
|
1069
1069
|
head_parts.append(
|
|
1070
|
-
'<meta name="generator" content="Docling HTML Serializer">'
|
|
1070
|
+
'<meta name="generator" content="Docling HTML Serializer"/>'
|
|
1071
1071
|
)
|
|
1072
1072
|
|
|
1073
1073
|
# Add default styles or custom CSS
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
import base64
|
|
4
4
|
import copy
|
|
5
5
|
import hashlib
|
|
6
|
-
import itertools
|
|
7
6
|
import json
|
|
8
7
|
import logging
|
|
9
8
|
import mimetypes
|
|
@@ -54,8 +53,8 @@ from docling_core.types.doc.labels import (
|
|
|
54
53
|
GroupLabel,
|
|
55
54
|
PictureClassificationLabel,
|
|
56
55
|
)
|
|
57
|
-
from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
|
|
58
|
-
from docling_core.types.doc.utils import relative_path
|
|
56
|
+
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
57
|
+
from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
|
|
59
58
|
|
|
60
59
|
_logger = logging.getLogger(__name__)
|
|
61
60
|
|
|
@@ -4688,181 +4687,6 @@ class DoclingDocument(BaseModel):
|
|
|
4688
4687
|
bbox = None
|
|
4689
4688
|
return caption_item, bbox
|
|
4690
4689
|
|
|
4691
|
-
def otsl_parse_texts(texts, tokens):
|
|
4692
|
-
split_word = TableToken.OTSL_NL.value
|
|
4693
|
-
# CLEAN tokens from extra tags, only structural OTSL allowed
|
|
4694
|
-
clean_tokens = []
|
|
4695
|
-
for t in tokens:
|
|
4696
|
-
if t in [
|
|
4697
|
-
TableToken.OTSL_ECEL.value,
|
|
4698
|
-
TableToken.OTSL_FCEL.value,
|
|
4699
|
-
TableToken.OTSL_LCEL.value,
|
|
4700
|
-
TableToken.OTSL_UCEL.value,
|
|
4701
|
-
TableToken.OTSL_XCEL.value,
|
|
4702
|
-
TableToken.OTSL_NL.value,
|
|
4703
|
-
TableToken.OTSL_CHED.value,
|
|
4704
|
-
TableToken.OTSL_RHED.value,
|
|
4705
|
-
TableToken.OTSL_SROW.value,
|
|
4706
|
-
]:
|
|
4707
|
-
clean_tokens.append(t)
|
|
4708
|
-
tokens = clean_tokens
|
|
4709
|
-
split_row_tokens = [
|
|
4710
|
-
list(y)
|
|
4711
|
-
for x, y in itertools.groupby(tokens, lambda z: z == split_word)
|
|
4712
|
-
if not x
|
|
4713
|
-
]
|
|
4714
|
-
|
|
4715
|
-
table_cells = []
|
|
4716
|
-
r_idx = 0
|
|
4717
|
-
c_idx = 0
|
|
4718
|
-
|
|
4719
|
-
def count_right(tokens, c_idx, r_idx, which_tokens):
|
|
4720
|
-
span = 0
|
|
4721
|
-
c_idx_iter = c_idx
|
|
4722
|
-
while tokens[r_idx][c_idx_iter] in which_tokens:
|
|
4723
|
-
c_idx_iter += 1
|
|
4724
|
-
span += 1
|
|
4725
|
-
if c_idx_iter >= len(tokens[r_idx]):
|
|
4726
|
-
return span
|
|
4727
|
-
return span
|
|
4728
|
-
|
|
4729
|
-
def count_down(tokens, c_idx, r_idx, which_tokens):
|
|
4730
|
-
span = 0
|
|
4731
|
-
r_idx_iter = r_idx
|
|
4732
|
-
while tokens[r_idx_iter][c_idx] in which_tokens:
|
|
4733
|
-
r_idx_iter += 1
|
|
4734
|
-
span += 1
|
|
4735
|
-
if r_idx_iter >= len(tokens):
|
|
4736
|
-
return span
|
|
4737
|
-
return span
|
|
4738
|
-
|
|
4739
|
-
for i, text in enumerate(texts):
|
|
4740
|
-
cell_text = ""
|
|
4741
|
-
if text in [
|
|
4742
|
-
TableToken.OTSL_FCEL.value,
|
|
4743
|
-
TableToken.OTSL_ECEL.value,
|
|
4744
|
-
TableToken.OTSL_CHED.value,
|
|
4745
|
-
TableToken.OTSL_RHED.value,
|
|
4746
|
-
TableToken.OTSL_SROW.value,
|
|
4747
|
-
]:
|
|
4748
|
-
row_span = 1
|
|
4749
|
-
col_span = 1
|
|
4750
|
-
right_offset = 1
|
|
4751
|
-
if text != TableToken.OTSL_ECEL.value:
|
|
4752
|
-
cell_text = texts[i + 1]
|
|
4753
|
-
right_offset = 2
|
|
4754
|
-
|
|
4755
|
-
# Check next element(s) for lcel / ucel / xcel,
|
|
4756
|
-
# set properly row_span, col_span
|
|
4757
|
-
next_right_cell = ""
|
|
4758
|
-
if i + right_offset < len(texts):
|
|
4759
|
-
next_right_cell = texts[i + right_offset]
|
|
4760
|
-
|
|
4761
|
-
next_bottom_cell = ""
|
|
4762
|
-
if r_idx + 1 < len(split_row_tokens):
|
|
4763
|
-
if c_idx < len(split_row_tokens[r_idx + 1]):
|
|
4764
|
-
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
|
|
4765
|
-
|
|
4766
|
-
if next_right_cell in [
|
|
4767
|
-
TableToken.OTSL_LCEL.value,
|
|
4768
|
-
TableToken.OTSL_XCEL.value,
|
|
4769
|
-
]:
|
|
4770
|
-
# we have horisontal spanning cell or 2d spanning cell
|
|
4771
|
-
col_span += count_right(
|
|
4772
|
-
split_row_tokens,
|
|
4773
|
-
c_idx + 1,
|
|
4774
|
-
r_idx,
|
|
4775
|
-
[TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
|
|
4776
|
-
)
|
|
4777
|
-
if next_bottom_cell in [
|
|
4778
|
-
TableToken.OTSL_UCEL.value,
|
|
4779
|
-
TableToken.OTSL_XCEL.value,
|
|
4780
|
-
]:
|
|
4781
|
-
# we have a vertical spanning cell or 2d spanning cell
|
|
4782
|
-
row_span += count_down(
|
|
4783
|
-
split_row_tokens,
|
|
4784
|
-
c_idx,
|
|
4785
|
-
r_idx + 1,
|
|
4786
|
-
[TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
|
|
4787
|
-
)
|
|
4788
|
-
|
|
4789
|
-
table_cells.append(
|
|
4790
|
-
TableCell(
|
|
4791
|
-
text=cell_text.strip(),
|
|
4792
|
-
row_span=row_span,
|
|
4793
|
-
col_span=col_span,
|
|
4794
|
-
start_row_offset_idx=r_idx,
|
|
4795
|
-
end_row_offset_idx=r_idx + row_span,
|
|
4796
|
-
start_col_offset_idx=c_idx,
|
|
4797
|
-
end_col_offset_idx=c_idx + col_span,
|
|
4798
|
-
)
|
|
4799
|
-
)
|
|
4800
|
-
if text in [
|
|
4801
|
-
TableToken.OTSL_FCEL.value,
|
|
4802
|
-
TableToken.OTSL_ECEL.value,
|
|
4803
|
-
TableToken.OTSL_CHED.value,
|
|
4804
|
-
TableToken.OTSL_RHED.value,
|
|
4805
|
-
TableToken.OTSL_SROW.value,
|
|
4806
|
-
TableToken.OTSL_LCEL.value,
|
|
4807
|
-
TableToken.OTSL_UCEL.value,
|
|
4808
|
-
TableToken.OTSL_XCEL.value,
|
|
4809
|
-
]:
|
|
4810
|
-
c_idx += 1
|
|
4811
|
-
if text == TableToken.OTSL_NL.value:
|
|
4812
|
-
r_idx += 1
|
|
4813
|
-
c_idx = 0
|
|
4814
|
-
return table_cells, split_row_tokens
|
|
4815
|
-
|
|
4816
|
-
def otsl_extract_tokens_and_text(s: str):
|
|
4817
|
-
# Pattern to match anything enclosed by < >
|
|
4818
|
-
# (including the angle brackets themselves)
|
|
4819
|
-
pattern = r"(<[^>]+>)"
|
|
4820
|
-
# Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
|
|
4821
|
-
tokens = re.findall(pattern, s)
|
|
4822
|
-
# Remove any tokens that start with "<loc_"
|
|
4823
|
-
tokens = [
|
|
4824
|
-
token
|
|
4825
|
-
for token in tokens
|
|
4826
|
-
if not (
|
|
4827
|
-
token.startswith(rf"<{_LOC_PREFIX}")
|
|
4828
|
-
or token
|
|
4829
|
-
in [
|
|
4830
|
-
rf"<{DocumentToken.OTSL.value}>",
|
|
4831
|
-
rf"</{DocumentToken.OTSL.value}>",
|
|
4832
|
-
]
|
|
4833
|
-
)
|
|
4834
|
-
]
|
|
4835
|
-
# Split the string by those tokens to get the in-between text
|
|
4836
|
-
text_parts = re.split(pattern, s)
|
|
4837
|
-
text_parts = [
|
|
4838
|
-
token
|
|
4839
|
-
for token in text_parts
|
|
4840
|
-
if not (
|
|
4841
|
-
token.startswith(rf"<{_LOC_PREFIX}")
|
|
4842
|
-
or token
|
|
4843
|
-
in [
|
|
4844
|
-
rf"<{DocumentToken.OTSL.value}>",
|
|
4845
|
-
rf"</{DocumentToken.OTSL.value}>",
|
|
4846
|
-
]
|
|
4847
|
-
)
|
|
4848
|
-
]
|
|
4849
|
-
# Remove any empty or purely whitespace strings from text_parts
|
|
4850
|
-
text_parts = [part for part in text_parts if part.strip()]
|
|
4851
|
-
|
|
4852
|
-
return tokens, text_parts
|
|
4853
|
-
|
|
4854
|
-
def parse_table_content(otsl_content: str) -> TableData:
|
|
4855
|
-
tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
|
|
4856
|
-
table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
|
|
4857
|
-
|
|
4858
|
-
return TableData(
|
|
4859
|
-
num_rows=len(split_row_tokens),
|
|
4860
|
-
num_cols=(
|
|
4861
|
-
max(len(row) for row in split_row_tokens) if split_row_tokens else 0
|
|
4862
|
-
),
|
|
4863
|
-
table_cells=table_cells,
|
|
4864
|
-
)
|
|
4865
|
-
|
|
4866
4690
|
def extract_chart_type(text_chunk: str):
|
|
4867
4691
|
label = None
|
|
4868
4692
|
chart_labels = [
|
|
@@ -5094,7 +4918,7 @@ class DoclingDocument(BaseModel):
|
|
|
5094
4918
|
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)
|
|
5095
4919
|
|
|
5096
4920
|
if tag_name == DocumentToken.OTSL.value:
|
|
5097
|
-
table_data = parse_table_content(full_chunk)
|
|
4921
|
+
table_data = parse_otsl_table_content(full_chunk)
|
|
5098
4922
|
caption, caption_bbox = extract_caption(full_chunk)
|
|
5099
4923
|
if caption is not None and caption_bbox is not None:
|
|
5100
4924
|
caption.prov.append(
|
|
@@ -5137,7 +4961,7 @@ class DoclingDocument(BaseModel):
|
|
|
5137
4961
|
table_data = None
|
|
5138
4962
|
chart_type = None
|
|
5139
4963
|
if tag_name == DocumentToken.CHART.value:
|
|
5140
|
-
table_data = parse_table_content(full_chunk)
|
|
4964
|
+
table_data = parse_otsl_table_content(full_chunk)
|
|
5141
4965
|
chart_type = extract_chart_type(full_chunk)
|
|
5142
4966
|
if image:
|
|
5143
4967
|
if bbox:
|
|
@@ -5683,69 +5507,174 @@ class DoclingDocument(BaseModel):
|
|
|
5683
5507
|
)
|
|
5684
5508
|
return self
|
|
5685
5509
|
|
|
5510
|
+
class _DocIndex(BaseModel):
|
|
5511
|
+
"""A document merge buffer."""
|
|
5512
|
+
|
|
5513
|
+
groups: list[GroupItem] = []
|
|
5514
|
+
texts: list[TextItem] = []
|
|
5515
|
+
pictures: list[PictureItem] = []
|
|
5516
|
+
tables: list[TableItem] = []
|
|
5517
|
+
key_value_items: list[KeyValueItem] = []
|
|
5518
|
+
form_items: list[FormItem] = []
|
|
5519
|
+
|
|
5520
|
+
pages: dict[int, PageItem] = {}
|
|
5521
|
+
|
|
5522
|
+
_body: Optional[GroupItem] = None
|
|
5523
|
+
_max_page: int = 0
|
|
5524
|
+
_names: list[str] = []
|
|
5525
|
+
|
|
5526
|
+
def get_item_list(self, key: str) -> list[NodeItem]:
|
|
5527
|
+
return getattr(self, key)
|
|
5528
|
+
|
|
5529
|
+
def index(self, doc: "DoclingDocument") -> None:
|
|
5530
|
+
|
|
5531
|
+
orig_ref_to_new_ref: dict[str, str] = {}
|
|
5532
|
+
page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
|
|
5533
|
+
|
|
5534
|
+
if self._body is None:
|
|
5535
|
+
self._body = GroupItem(**doc.body.model_dump(exclude={"children"}))
|
|
5536
|
+
|
|
5537
|
+
self._names.append(doc.name)
|
|
5538
|
+
|
|
5539
|
+
# collect items in traversal order
|
|
5540
|
+
for item, _ in doc.iterate_items(
|
|
5541
|
+
with_groups=True,
|
|
5542
|
+
traverse_pictures=True,
|
|
5543
|
+
included_content_layers={c for c in ContentLayer},
|
|
5544
|
+
):
|
|
5545
|
+
key = item.self_ref.split("/")[1]
|
|
5546
|
+
is_body = key == "body"
|
|
5547
|
+
new_cref = (
|
|
5548
|
+
"#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}"
|
|
5549
|
+
)
|
|
5550
|
+
# register cref mapping:
|
|
5551
|
+
orig_ref_to_new_ref[item.self_ref] = new_cref
|
|
5552
|
+
|
|
5553
|
+
if not is_body:
|
|
5554
|
+
new_item = copy.deepcopy(item)
|
|
5555
|
+
new_item.children = []
|
|
5556
|
+
|
|
5557
|
+
# put item in the right list
|
|
5558
|
+
self.get_item_list(key).append(new_item)
|
|
5559
|
+
|
|
5560
|
+
# update item's self reference
|
|
5561
|
+
new_item.self_ref = new_cref
|
|
5562
|
+
|
|
5563
|
+
if isinstance(new_item, DocItem):
|
|
5564
|
+
# update page numbers
|
|
5565
|
+
# NOTE other prov sources (e.g. GraphCell) currently not covered
|
|
5566
|
+
for prov in new_item.prov:
|
|
5567
|
+
prov.page_no += page_delta
|
|
5568
|
+
|
|
5569
|
+
if item.parent:
|
|
5570
|
+
# set item's parent
|
|
5571
|
+
new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
|
|
5572
|
+
new_item.parent = RefItem(cref=new_parent_cref)
|
|
5573
|
+
|
|
5574
|
+
# add item to parent's children
|
|
5575
|
+
path_components = new_parent_cref.split("/")
|
|
5576
|
+
num_components = len(path_components)
|
|
5577
|
+
if num_components == 3:
|
|
5578
|
+
_, parent_key, parent_index_str = path_components
|
|
5579
|
+
parent_index = int(parent_index_str)
|
|
5580
|
+
parent_item = self.get_item_list(parent_key)[parent_index]
|
|
5581
|
+
|
|
5582
|
+
# update captions field (not possible in iterate_items order):
|
|
5583
|
+
if isinstance(parent_item, FloatingItem):
|
|
5584
|
+
for cap_it, cap in enumerate(parent_item.captions):
|
|
5585
|
+
if cap.cref == item.self_ref:
|
|
5586
|
+
parent_item.captions[cap_it] = RefItem(
|
|
5587
|
+
cref=new_cref
|
|
5588
|
+
)
|
|
5589
|
+
break
|
|
5590
|
+
|
|
5591
|
+
elif num_components == 2 and path_components[1] == "body":
|
|
5592
|
+
parent_item = self._body
|
|
5593
|
+
else:
|
|
5594
|
+
raise RuntimeError(
|
|
5595
|
+
f"Unsupported ref format: {new_parent_cref}"
|
|
5596
|
+
)
|
|
5597
|
+
parent_item.children.append(RefItem(cref=new_cref))
|
|
5598
|
+
|
|
5599
|
+
# update pages
|
|
5600
|
+
new_max_page = None
|
|
5601
|
+
for page_nr in doc.pages:
|
|
5602
|
+
new_page = copy.deepcopy(doc.pages[page_nr])
|
|
5603
|
+
new_page_nr = page_nr + page_delta
|
|
5604
|
+
new_page.page_no = new_page_nr
|
|
5605
|
+
self.pages[new_page_nr] = new_page
|
|
5606
|
+
if new_max_page is None or new_page_nr > new_max_page:
|
|
5607
|
+
new_max_page = new_page_nr
|
|
5608
|
+
if new_max_page is not None:
|
|
5609
|
+
self._max_page = new_max_page
|
|
5610
|
+
|
|
5611
|
+
def get_name(self) -> str:
|
|
5612
|
+
return " + ".join(self._names)
|
|
5613
|
+
|
|
5614
|
+
def _update_from_index(self, doc_index: "_DocIndex") -> None:
|
|
5615
|
+
if doc_index._body is not None:
|
|
5616
|
+
self.body = doc_index._body
|
|
5617
|
+
self.groups = doc_index.groups
|
|
5618
|
+
self.texts = doc_index.texts
|
|
5619
|
+
self.pictures = doc_index.pictures
|
|
5620
|
+
self.tables = doc_index.tables
|
|
5621
|
+
self.key_value_items = doc_index.key_value_items
|
|
5622
|
+
self.form_items = doc_index.form_items
|
|
5623
|
+
self.pages = doc_index.pages
|
|
5624
|
+
self.name = doc_index.get_name()
|
|
5625
|
+
|
|
5686
5626
|
def _normalize_references(self) -> None:
|
|
5687
|
-
|
|
5688
|
-
|
|
5689
|
-
|
|
5690
|
-
|
|
5691
|
-
|
|
5692
|
-
|
|
5693
|
-
|
|
5694
|
-
|
|
5695
|
-
|
|
5696
|
-
|
|
5697
|
-
|
|
5698
|
-
|
|
5627
|
+
doc_index = DoclingDocument._DocIndex()
|
|
5628
|
+
doc_index.index(doc=self)
|
|
5629
|
+
self._update_from_index(doc_index)
|
|
5630
|
+
|
|
5631
|
+
@classmethod
|
|
5632
|
+
def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
|
|
5633
|
+
"""Concatenate multiple documents into a single document."""
|
|
5634
|
+
doc_index = DoclingDocument._DocIndex()
|
|
5635
|
+
for doc in docs:
|
|
5636
|
+
doc_index.index(doc=doc)
|
|
5637
|
+
|
|
5638
|
+
res_doc = DoclingDocument(name=" + ".join([doc.name for doc in docs]))
|
|
5639
|
+
res_doc._update_from_index(doc_index)
|
|
5640
|
+
return res_doc
|
|
5641
|
+
|
|
5642
|
+
def _validate_rules(self):
|
|
5643
|
+
def validate_list_group(doc: DoclingDocument, item: ListGroup):
|
|
5644
|
+
for ref in item.children:
|
|
5645
|
+
child = ref.resolve(doc)
|
|
5646
|
+
if not isinstance(child, ListItem):
|
|
5647
|
+
raise ValueError(
|
|
5648
|
+
f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
|
|
5649
|
+
)
|
|
5650
|
+
|
|
5651
|
+
def validate_list_item(doc: DoclingDocument, item: ListItem):
|
|
5652
|
+
if item.parent is None:
|
|
5653
|
+
raise ValueError(f"ListItem {item.self_ref} has no parent")
|
|
5654
|
+
if not isinstance(item.parent.resolve(doc), ListGroup):
|
|
5655
|
+
raise ValueError(
|
|
5656
|
+
f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
|
|
5657
|
+
)
|
|
5658
|
+
|
|
5659
|
+
def validate_group(doc: DoclingDocument, item: GroupItem):
|
|
5660
|
+
if (
|
|
5661
|
+
item.parent and not item.children
|
|
5662
|
+
): # tolerate empty body, but not other groups
|
|
5663
|
+
raise ValueError(f"Group {item.self_ref} has no children")
|
|
5699
5664
|
|
|
5700
|
-
# collect items in traversal order
|
|
5701
5665
|
for item, _ in self.iterate_items(
|
|
5702
5666
|
with_groups=True,
|
|
5703
5667
|
traverse_pictures=True,
|
|
5704
5668
|
included_content_layers={c for c in ContentLayer},
|
|
5705
5669
|
):
|
|
5706
|
-
|
|
5707
|
-
|
|
5708
|
-
|
|
5709
|
-
|
|
5710
|
-
|
|
5711
|
-
|
|
5712
|
-
|
|
5713
|
-
|
|
5714
|
-
new_item.children = []
|
|
5715
|
-
|
|
5716
|
-
# put item in the right list
|
|
5717
|
-
item_lists[key].append(new_item)
|
|
5718
|
-
|
|
5719
|
-
# update item's self reference
|
|
5720
|
-
new_item.self_ref = new_cref
|
|
5721
|
-
|
|
5722
|
-
if item.parent:
|
|
5723
|
-
# set item's parent
|
|
5724
|
-
new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
|
|
5725
|
-
new_item.parent = RefItem(cref=new_parent_cref)
|
|
5726
|
-
|
|
5727
|
-
# add item to parent's children
|
|
5728
|
-
path_components = new_parent_cref.split("/")
|
|
5729
|
-
num_components = len(path_components)
|
|
5730
|
-
parent_node: NodeItem
|
|
5731
|
-
if num_components == 3:
|
|
5732
|
-
_, parent_key, parent_index_str = path_components
|
|
5733
|
-
parent_index = int(parent_index_str)
|
|
5734
|
-
parent_node = item_lists[parent_key][parent_index]
|
|
5735
|
-
elif num_components == 2 and path_components[1] == "body":
|
|
5736
|
-
parent_node = new_body
|
|
5737
|
-
else:
|
|
5738
|
-
raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
|
|
5739
|
-
parent_node.children.append(RefItem(cref=new_cref))
|
|
5740
|
-
|
|
5741
|
-
# update document
|
|
5742
|
-
self.groups = item_lists["groups"] # type: ignore
|
|
5743
|
-
self.texts = item_lists["texts"] # type: ignore
|
|
5744
|
-
self.pictures = item_lists["pictures"] # type: ignore
|
|
5745
|
-
self.tables = item_lists["tables"] # type: ignore
|
|
5746
|
-
self.key_value_items = item_lists["key_value_items"] # type: ignore
|
|
5747
|
-
self.form_items = item_lists["form_items"] # type: ignore
|
|
5748
|
-
self.body = new_body
|
|
5670
|
+
if isinstance(item, ListGroup):
|
|
5671
|
+
validate_list_group(self, item)
|
|
5672
|
+
|
|
5673
|
+
elif isinstance(item, GroupItem):
|
|
5674
|
+
validate_group(self, item)
|
|
5675
|
+
|
|
5676
|
+
elif isinstance(item, ListItem):
|
|
5677
|
+
validate_list_item(self, item)
|
|
5749
5678
|
|
|
5750
5679
|
|
|
5751
5680
|
# deprecated aliases (kept for backwards compatibility):
|