docling-core 2.44.1__tar.gz → 2.45.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (111)
  1. {docling_core-2.44.1 → docling_core-2.45.0}/PKG-INFO +1 -1
  2. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/html.py +34 -74
  3. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/document.py +165 -236
  4. docling_core-2.45.0/docling_core/types/doc/utils.py +282 -0
  5. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/PKG-INFO +1 -1
  6. {docling_core-2.44.1 → docling_core-2.45.0}/pyproject.toml +1 -1
  7. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_docling_doc.py +119 -1
  8. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_serialization.py +18 -0
  9. docling_core-2.44.1/docling_core/types/doc/utils.py +0 -86
  10. {docling_core-2.44.1 → docling_core-2.45.0}/LICENSE +0 -0
  11. {docling_core-2.44.1 → docling_core-2.45.0}/README.md +0 -0
  12. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/__init__.py +0 -0
  13. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/cli/__init__.py +0 -0
  14. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/cli/view.py +0 -0
  15. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/experimental/__init__.py +0 -0
  16. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/py.typed +0 -0
  17. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  18. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  19. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  20. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  21. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  22. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  23. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  24. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  25. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/search/__init__.py +0 -0
  26. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  27. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/search/mapping.py +0 -0
  28. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/search/meta.py +0 -0
  29. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/search/package.py +0 -0
  30. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/__init__.py +0 -0
  31. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/__init__.py +0 -0
  32. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/base.py +0 -0
  33. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  34. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  35. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/page_chunker.py +0 -0
  36. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  37. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  38. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  39. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  40. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/__init__.py +0 -0
  41. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/base.py +0 -0
  42. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/common.py +0 -0
  43. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/doctags.py +0 -0
  44. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/html_styles.py +0 -0
  45. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/markdown.py +0 -0
  46. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  47. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/base.py +0 -0
  48. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
  49. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  50. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  51. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
  52. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/__init__.py +0 -0
  53. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/base.py +0 -0
  54. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/__init__.py +0 -0
  55. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/base.py +0 -0
  56. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/labels.py +0 -0
  57. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/page.py +0 -0
  58. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/tokens.py +0 -0
  59. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/gen/__init__.py +0 -0
  60. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/gen/generic.py +0 -0
  61. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/io/__init__.py +0 -0
  62. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  63. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/base.py +0 -0
  64. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  65. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  66. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  67. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/document.py +0 -0
  68. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  69. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/nlp/__init__.py +0 -0
  70. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/nlp/qa.py +0 -0
  71. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/nlp/qa_labels.py +0 -0
  72. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/__init__.py +0 -0
  73. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/attribute.py +0 -0
  74. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/base.py +0 -0
  75. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/predicate.py +0 -0
  76. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/record.py +0 -0
  77. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/statement.py +0 -0
  78. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/subject.py +0 -0
  79. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/__init__.py +0 -0
  80. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/alias.py +0 -0
  81. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/file.py +0 -0
  82. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/generate_docs.py +0 -0
  83. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/generate_jsonschema.py +0 -0
  84. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/legacy.py +0 -0
  85. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/validate.py +0 -0
  86. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/validators.py +0 -0
  87. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/SOURCES.txt +0 -0
  88. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/dependency_links.txt +0 -0
  89. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/entry_points.txt +0 -0
  90. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/requires.txt +0 -0
  91. {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/top_level.txt +0 -0
  92. {docling_core-2.44.1 → docling_core-2.45.0}/setup.cfg +0 -0
  93. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_base.py +0 -0
  94. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_collection.py +0 -0
  95. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_data_gen_flag.py +0 -0
  96. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_doc_base.py +0 -0
  97. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_doc_legacy_convert.py +0 -0
  98. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_doc_schema.py +0 -0
  99. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_doc_schema_extractor.py +0 -0
  100. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_doctags_load.py +0 -0
  101. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_hierarchical_chunker.py +0 -0
  102. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_hybrid_chunker.py +0 -0
  103. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_json_schema_to_search_mapper.py +0 -0
  104. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_nlp_qa.py +0 -0
  105. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_otsl_table_export.py +0 -0
  106. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_page.py +0 -0
  107. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_page_chunker.py +0 -0
  108. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_rec_schema.py +0 -0
  109. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_search_meta.py +0 -0
  110. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_utils.py +0 -0
  111. {docling_core-2.44.1 → docling_core-2.45.0}/test/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.44.1
3
+ Version: 2.45.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -130,11 +130,14 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
130
130
  doc_serializer: BaseDocSerializer,
131
131
  doc: DoclingDocument,
132
132
  is_inline_scope: bool = False,
133
+ visited: Optional[set[str]] = None,
133
134
  **kwargs: Any,
134
135
  ) -> SerializationResult:
135
136
  """Serializes the passed text item to HTML."""
136
137
  params = HTMLParams(**kwargs)
138
+ my_visited: set[str] = visited if visited is not None else set()
137
139
  res_parts: list[SerializationResult] = []
140
+ post_processed = False
138
141
 
139
142
  # Prepare the HTML based on item type
140
143
  if isinstance(item, TitleItem):
@@ -162,7 +165,28 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
162
165
 
163
166
  elif isinstance(item, ListItem):
164
167
  # List items are handled by list serializer
165
- text_inner = self._prepare_content(item.text)
168
+ text_parts: list[str] = []
169
+ if item_text := self._prepare_content(item.text):
170
+ item_text = doc_serializer.post_process(
171
+ text=item_text,
172
+ formatting=item.formatting,
173
+ hyperlink=item.hyperlink,
174
+ )
175
+ post_processed = True
176
+ text_parts.append(item_text)
177
+ nested_parts = [
178
+ r.text
179
+ for r in doc_serializer.get_parts(
180
+ item=item,
181
+ is_inline_scope=is_inline_scope,
182
+ visited=my_visited,
183
+ **kwargs,
184
+ )
185
+ ]
186
+ text_parts.extend(nested_parts)
187
+ text_inner = "\n".join(text_parts)
188
+ if nested_parts:
189
+ text_inner = f"\n{text_inner}\n"
166
190
  text = (
167
191
  get_html_tag_with_text_direction(
168
192
  html_tag="li",
@@ -185,11 +209,12 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
185
209
  text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
186
210
 
187
211
  # Apply formatting and hyperlinks
188
- text = doc_serializer.post_process(
189
- text=text,
190
- formatting=item.formatting,
191
- hyperlink=item.hyperlink,
192
- )
212
+ if not post_processed:
213
+ text = doc_serializer.post_process(
214
+ text=text,
215
+ formatting=item.formatting,
216
+ hyperlink=item.hyperlink,
217
+ )
193
218
 
194
219
  if text:
195
220
  text_res = create_ser_result(text=text, span_source=item)
@@ -703,7 +728,6 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
703
728
  ) -> SerializationResult:
704
729
  """Serializes a list to HTML."""
705
730
  my_visited: set[str] = visited if visited is not None else set()
706
- params = HTMLParams(**kwargs)
707
731
  # Get all child parts
708
732
  parts = doc_serializer.get_parts(
709
733
  item=item,
@@ -713,72 +737,8 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
713
737
  **kwargs,
714
738
  )
715
739
 
716
- # Append nested list to parent list item:
717
- i = 0
718
- while i < len(parts):
719
- prt = parts[i]
720
- if prt.text.startswith(("<ul>", "<ol>")):
721
- for j in range(i - 1, -1, -1):
722
- if parts[j].text.startswith(("<li>", "<li ")) and parts[
723
- j
724
- ].text.endswith("</li>"):
725
- before, _, _ = parts[j].text.rpartition("</li>")
726
- parts[j].text = f"{before}\n{prt.text}\n</li>"
727
- break
728
- if j > -1:
729
- parts.pop(i)
730
- else:
731
- i += 1
732
-
733
740
  # Add all child parts
734
- text_res = "\n".join(
735
- [
736
- (
737
- p.text
738
- if (
739
- (
740
- p.text.startswith(("<li>", "<li "))
741
- and p.text.endswith("</li>")
742
- )
743
- or (
744
- p.text.startswith(("<ol>", "<ol "))
745
- and p.text.endswith("</ol>")
746
- )
747
- or (
748
- p.text.startswith(("<ul>", "<ul "))
749
- and p.text.endswith("</ul>")
750
- )
751
- )
752
- else (
753
- get_html_tag_with_text_direction(
754
- html_tag="li",
755
- text=p.text,
756
- attrs=(
757
- {
758
- "style": f"list-style-type: '{grandparent_item.marker} ';"
759
- }
760
- if params.show_original_list_item_marker
761
- and grandparent_item.marker
762
- else {}
763
- ),
764
- )
765
- if p.spans
766
- and p.spans[0].item.parent
767
- and isinstance(
768
- (parent_item := p.spans[0].item.parent.resolve(doc)),
769
- InlineGroup,
770
- )
771
- and parent_item.parent
772
- and isinstance(
773
- (grandparent_item := parent_item.parent.resolve(doc)),
774
- ListItem,
775
- )
776
- else f"<li>{p.text}</li>"
777
- )
778
- )
779
- for p in parts
780
- ]
781
- )
741
+ text_res = "\n".join(p.text for p in parts if p.text)
782
742
  if text_res:
783
743
  tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
784
744
  text_res = f"<{tag}>\n{text_res}\n</{tag}>"
@@ -1097,7 +1057,7 @@ class HTMLDocSerializer(DocSerializer):
1097
1057
  if self.params.html_head is not None:
1098
1058
  return self.params.html_head
1099
1059
 
1100
- head_parts = ["<head>", '<meta charset="UTF-8">']
1060
+ head_parts = ["<head>", '<meta charset="UTF-8"/>']
1101
1061
 
1102
1062
  # Add metadata if requested
1103
1063
  if params.add_document_metadata:
@@ -1107,7 +1067,7 @@ class HTMLDocSerializer(DocSerializer):
1107
1067
  head_parts.append("<title>Docling Document</title>")
1108
1068
 
1109
1069
  head_parts.append(
1110
- '<meta name="generator" content="Docling HTML Serializer">'
1070
+ '<meta name="generator" content="Docling HTML Serializer"/>'
1111
1071
  )
1112
1072
 
1113
1073
  # Add default styles or custom CSS
@@ -3,7 +3,6 @@
3
3
  import base64
4
4
  import copy
5
5
  import hashlib
6
- import itertools
7
6
  import json
8
7
  import logging
9
8
  import mimetypes
@@ -54,8 +53,8 @@ from docling_core.types.doc.labels import (
54
53
  GroupLabel,
55
54
  PictureClassificationLabel,
56
55
  )
57
- from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
58
- from docling_core.types.doc.utils import relative_path
56
+ from docling_core.types.doc.tokens import DocumentToken, TableToken
57
+ from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
59
58
 
60
59
  _logger = logging.getLogger(__name__)
61
60
 
@@ -4688,181 +4687,6 @@ class DoclingDocument(BaseModel):
4688
4687
  bbox = None
4689
4688
  return caption_item, bbox
4690
4689
 
4691
- def otsl_parse_texts(texts, tokens):
4692
- split_word = TableToken.OTSL_NL.value
4693
- # CLEAN tokens from extra tags, only structural OTSL allowed
4694
- clean_tokens = []
4695
- for t in tokens:
4696
- if t in [
4697
- TableToken.OTSL_ECEL.value,
4698
- TableToken.OTSL_FCEL.value,
4699
- TableToken.OTSL_LCEL.value,
4700
- TableToken.OTSL_UCEL.value,
4701
- TableToken.OTSL_XCEL.value,
4702
- TableToken.OTSL_NL.value,
4703
- TableToken.OTSL_CHED.value,
4704
- TableToken.OTSL_RHED.value,
4705
- TableToken.OTSL_SROW.value,
4706
- ]:
4707
- clean_tokens.append(t)
4708
- tokens = clean_tokens
4709
- split_row_tokens = [
4710
- list(y)
4711
- for x, y in itertools.groupby(tokens, lambda z: z == split_word)
4712
- if not x
4713
- ]
4714
-
4715
- table_cells = []
4716
- r_idx = 0
4717
- c_idx = 0
4718
-
4719
- def count_right(tokens, c_idx, r_idx, which_tokens):
4720
- span = 0
4721
- c_idx_iter = c_idx
4722
- while tokens[r_idx][c_idx_iter] in which_tokens:
4723
- c_idx_iter += 1
4724
- span += 1
4725
- if c_idx_iter >= len(tokens[r_idx]):
4726
- return span
4727
- return span
4728
-
4729
- def count_down(tokens, c_idx, r_idx, which_tokens):
4730
- span = 0
4731
- r_idx_iter = r_idx
4732
- while tokens[r_idx_iter][c_idx] in which_tokens:
4733
- r_idx_iter += 1
4734
- span += 1
4735
- if r_idx_iter >= len(tokens):
4736
- return span
4737
- return span
4738
-
4739
- for i, text in enumerate(texts):
4740
- cell_text = ""
4741
- if text in [
4742
- TableToken.OTSL_FCEL.value,
4743
- TableToken.OTSL_ECEL.value,
4744
- TableToken.OTSL_CHED.value,
4745
- TableToken.OTSL_RHED.value,
4746
- TableToken.OTSL_SROW.value,
4747
- ]:
4748
- row_span = 1
4749
- col_span = 1
4750
- right_offset = 1
4751
- if text != TableToken.OTSL_ECEL.value:
4752
- cell_text = texts[i + 1]
4753
- right_offset = 2
4754
-
4755
- # Check next element(s) for lcel / ucel / xcel,
4756
- # set properly row_span, col_span
4757
- next_right_cell = ""
4758
- if i + right_offset < len(texts):
4759
- next_right_cell = texts[i + right_offset]
4760
-
4761
- next_bottom_cell = ""
4762
- if r_idx + 1 < len(split_row_tokens):
4763
- if c_idx < len(split_row_tokens[r_idx + 1]):
4764
- next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
4765
-
4766
- if next_right_cell in [
4767
- TableToken.OTSL_LCEL.value,
4768
- TableToken.OTSL_XCEL.value,
4769
- ]:
4770
- # we have horisontal spanning cell or 2d spanning cell
4771
- col_span += count_right(
4772
- split_row_tokens,
4773
- c_idx + 1,
4774
- r_idx,
4775
- [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
4776
- )
4777
- if next_bottom_cell in [
4778
- TableToken.OTSL_UCEL.value,
4779
- TableToken.OTSL_XCEL.value,
4780
- ]:
4781
- # we have a vertical spanning cell or 2d spanning cell
4782
- row_span += count_down(
4783
- split_row_tokens,
4784
- c_idx,
4785
- r_idx + 1,
4786
- [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
4787
- )
4788
-
4789
- table_cells.append(
4790
- TableCell(
4791
- text=cell_text.strip(),
4792
- row_span=row_span,
4793
- col_span=col_span,
4794
- start_row_offset_idx=r_idx,
4795
- end_row_offset_idx=r_idx + row_span,
4796
- start_col_offset_idx=c_idx,
4797
- end_col_offset_idx=c_idx + col_span,
4798
- )
4799
- )
4800
- if text in [
4801
- TableToken.OTSL_FCEL.value,
4802
- TableToken.OTSL_ECEL.value,
4803
- TableToken.OTSL_CHED.value,
4804
- TableToken.OTSL_RHED.value,
4805
- TableToken.OTSL_SROW.value,
4806
- TableToken.OTSL_LCEL.value,
4807
- TableToken.OTSL_UCEL.value,
4808
- TableToken.OTSL_XCEL.value,
4809
- ]:
4810
- c_idx += 1
4811
- if text == TableToken.OTSL_NL.value:
4812
- r_idx += 1
4813
- c_idx = 0
4814
- return table_cells, split_row_tokens
4815
-
4816
- def otsl_extract_tokens_and_text(s: str):
4817
- # Pattern to match anything enclosed by < >
4818
- # (including the angle brackets themselves)
4819
- pattern = r"(<[^>]+>)"
4820
- # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
4821
- tokens = re.findall(pattern, s)
4822
- # Remove any tokens that start with "<loc_"
4823
- tokens = [
4824
- token
4825
- for token in tokens
4826
- if not (
4827
- token.startswith(rf"<{_LOC_PREFIX}")
4828
- or token
4829
- in [
4830
- rf"<{DocumentToken.OTSL.value}>",
4831
- rf"</{DocumentToken.OTSL.value}>",
4832
- ]
4833
- )
4834
- ]
4835
- # Split the string by those tokens to get the in-between text
4836
- text_parts = re.split(pattern, s)
4837
- text_parts = [
4838
- token
4839
- for token in text_parts
4840
- if not (
4841
- token.startswith(rf"<{_LOC_PREFIX}")
4842
- or token
4843
- in [
4844
- rf"<{DocumentToken.OTSL.value}>",
4845
- rf"</{DocumentToken.OTSL.value}>",
4846
- ]
4847
- )
4848
- ]
4849
- # Remove any empty or purely whitespace strings from text_parts
4850
- text_parts = [part for part in text_parts if part.strip()]
4851
-
4852
- return tokens, text_parts
4853
-
4854
- def parse_table_content(otsl_content: str) -> TableData:
4855
- tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
4856
- table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
4857
-
4858
- return TableData(
4859
- num_rows=len(split_row_tokens),
4860
- num_cols=(
4861
- max(len(row) for row in split_row_tokens) if split_row_tokens else 0
4862
- ),
4863
- table_cells=table_cells,
4864
- )
4865
-
4866
4690
  def extract_chart_type(text_chunk: str):
4867
4691
  label = None
4868
4692
  chart_labels = [
@@ -5094,7 +4918,7 @@ class DoclingDocument(BaseModel):
5094
4918
  doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)
5095
4919
 
5096
4920
  if tag_name == DocumentToken.OTSL.value:
5097
- table_data = parse_table_content(full_chunk)
4921
+ table_data = parse_otsl_table_content(full_chunk)
5098
4922
  caption, caption_bbox = extract_caption(full_chunk)
5099
4923
  if caption is not None and caption_bbox is not None:
5100
4924
  caption.prov.append(
@@ -5137,7 +4961,7 @@ class DoclingDocument(BaseModel):
5137
4961
  table_data = None
5138
4962
  chart_type = None
5139
4963
  if tag_name == DocumentToken.CHART.value:
5140
- table_data = parse_table_content(full_chunk)
4964
+ table_data = parse_otsl_table_content(full_chunk)
5141
4965
  chart_type = extract_chart_type(full_chunk)
5142
4966
  if image:
5143
4967
  if bbox:
@@ -5683,69 +5507,174 @@ class DoclingDocument(BaseModel):
5683
5507
  )
5684
5508
  return self
5685
5509
 
5510
+ class _DocIndex(BaseModel):
5511
+ """A document merge buffer."""
5512
+
5513
+ groups: list[GroupItem] = []
5514
+ texts: list[TextItem] = []
5515
+ pictures: list[PictureItem] = []
5516
+ tables: list[TableItem] = []
5517
+ key_value_items: list[KeyValueItem] = []
5518
+ form_items: list[FormItem] = []
5519
+
5520
+ pages: dict[int, PageItem] = {}
5521
+
5522
+ _body: Optional[GroupItem] = None
5523
+ _max_page: int = 0
5524
+ _names: list[str] = []
5525
+
5526
+ def get_item_list(self, key: str) -> list[NodeItem]:
5527
+ return getattr(self, key)
5528
+
5529
+ def index(self, doc: "DoclingDocument") -> None:
5530
+
5531
+ orig_ref_to_new_ref: dict[str, str] = {}
5532
+ page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
5533
+
5534
+ if self._body is None:
5535
+ self._body = GroupItem(**doc.body.model_dump(exclude={"children"}))
5536
+
5537
+ self._names.append(doc.name)
5538
+
5539
+ # collect items in traversal order
5540
+ for item, _ in doc.iterate_items(
5541
+ with_groups=True,
5542
+ traverse_pictures=True,
5543
+ included_content_layers={c for c in ContentLayer},
5544
+ ):
5545
+ key = item.self_ref.split("/")[1]
5546
+ is_body = key == "body"
5547
+ new_cref = (
5548
+ "#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}"
5549
+ )
5550
+ # register cref mapping:
5551
+ orig_ref_to_new_ref[item.self_ref] = new_cref
5552
+
5553
+ if not is_body:
5554
+ new_item = copy.deepcopy(item)
5555
+ new_item.children = []
5556
+
5557
+ # put item in the right list
5558
+ self.get_item_list(key).append(new_item)
5559
+
5560
+ # update item's self reference
5561
+ new_item.self_ref = new_cref
5562
+
5563
+ if isinstance(new_item, DocItem):
5564
+ # update page numbers
5565
+ # NOTE other prov sources (e.g. GraphCell) currently not covered
5566
+ for prov in new_item.prov:
5567
+ prov.page_no += page_delta
5568
+
5569
+ if item.parent:
5570
+ # set item's parent
5571
+ new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
5572
+ new_item.parent = RefItem(cref=new_parent_cref)
5573
+
5574
+ # add item to parent's children
5575
+ path_components = new_parent_cref.split("/")
5576
+ num_components = len(path_components)
5577
+ if num_components == 3:
5578
+ _, parent_key, parent_index_str = path_components
5579
+ parent_index = int(parent_index_str)
5580
+ parent_item = self.get_item_list(parent_key)[parent_index]
5581
+
5582
+ # update captions field (not possible in iterate_items order):
5583
+ if isinstance(parent_item, FloatingItem):
5584
+ for cap_it, cap in enumerate(parent_item.captions):
5585
+ if cap.cref == item.self_ref:
5586
+ parent_item.captions[cap_it] = RefItem(
5587
+ cref=new_cref
5588
+ )
5589
+ break
5590
+
5591
+ elif num_components == 2 and path_components[1] == "body":
5592
+ parent_item = self._body
5593
+ else:
5594
+ raise RuntimeError(
5595
+ f"Unsupported ref format: {new_parent_cref}"
5596
+ )
5597
+ parent_item.children.append(RefItem(cref=new_cref))
5598
+
5599
+ # update pages
5600
+ new_max_page = None
5601
+ for page_nr in doc.pages:
5602
+ new_page = copy.deepcopy(doc.pages[page_nr])
5603
+ new_page_nr = page_nr + page_delta
5604
+ new_page.page_no = new_page_nr
5605
+ self.pages[new_page_nr] = new_page
5606
+ if new_max_page is None or new_page_nr > new_max_page:
5607
+ new_max_page = new_page_nr
5608
+ if new_max_page is not None:
5609
+ self._max_page = new_max_page
5610
+
5611
+ def get_name(self) -> str:
5612
+ return " + ".join(self._names)
5613
+
5614
+ def _update_from_index(self, doc_index: "_DocIndex") -> None:
5615
+ if doc_index._body is not None:
5616
+ self.body = doc_index._body
5617
+ self.groups = doc_index.groups
5618
+ self.texts = doc_index.texts
5619
+ self.pictures = doc_index.pictures
5620
+ self.tables = doc_index.tables
5621
+ self.key_value_items = doc_index.key_value_items
5622
+ self.form_items = doc_index.form_items
5623
+ self.pages = doc_index.pages
5624
+ self.name = doc_index.get_name()
5625
+
5686
5626
  def _normalize_references(self) -> None:
5687
- """Normalize ref numbering by ordering node items as per iterate_items()."""
5688
- new_body = GroupItem(**self.body.model_dump(exclude={"children"}))
5689
-
5690
- item_lists: dict[str, list[NodeItem]] = {
5691
- "groups": [],
5692
- "texts": [],
5693
- "pictures": [],
5694
- "tables": [],
5695
- "key_value_items": [],
5696
- "form_items": [],
5697
- }
5698
- orig_ref_to_new_ref: dict[str, str] = {}
5627
+ doc_index = DoclingDocument._DocIndex()
5628
+ doc_index.index(doc=self)
5629
+ self._update_from_index(doc_index)
5630
+
5631
+ @classmethod
5632
+ def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
5633
+ """Concatenate multiple documents into a single document."""
5634
+ doc_index = DoclingDocument._DocIndex()
5635
+ for doc in docs:
5636
+ doc_index.index(doc=doc)
5637
+
5638
+ res_doc = DoclingDocument(name=" + ".join([doc.name for doc in docs]))
5639
+ res_doc._update_from_index(doc_index)
5640
+ return res_doc
5641
+
5642
+ def _validate_rules(self):
5643
+ def validate_list_group(doc: DoclingDocument, item: ListGroup):
5644
+ for ref in item.children:
5645
+ child = ref.resolve(doc)
5646
+ if not isinstance(child, ListItem):
5647
+ raise ValueError(
5648
+ f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
5649
+ )
5650
+
5651
+ def validate_list_item(doc: DoclingDocument, item: ListItem):
5652
+ if item.parent is None:
5653
+ raise ValueError(f"ListItem {item.self_ref} has no parent")
5654
+ if not isinstance(item.parent.resolve(doc), ListGroup):
5655
+ raise ValueError(
5656
+ f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
5657
+ )
5658
+
5659
+ def validate_group(doc: DoclingDocument, item: GroupItem):
5660
+ if (
5661
+ item.parent and not item.children
5662
+ ): # tolerate empty body, but not other groups
5663
+ raise ValueError(f"Group {item.self_ref} has no children")
5699
5664
 
5700
- # collect items in traversal order
5701
5665
  for item, _ in self.iterate_items(
5702
5666
  with_groups=True,
5703
5667
  traverse_pictures=True,
5704
5668
  included_content_layers={c for c in ContentLayer},
5705
5669
  ):
5706
- key = item.self_ref.split("/")[1]
5707
- is_body = key == "body"
5708
- new_cref = "#/body" if is_body else f"#/{key}/{len(item_lists[key])}"
5709
- # register cref mapping:
5710
- orig_ref_to_new_ref[item.self_ref] = new_cref
5711
-
5712
- if not is_body:
5713
- new_item = copy.deepcopy(item)
5714
- new_item.children = []
5715
-
5716
- # put item in the right list
5717
- item_lists[key].append(new_item)
5718
-
5719
- # update item's self reference
5720
- new_item.self_ref = new_cref
5721
-
5722
- if item.parent:
5723
- # set item's parent
5724
- new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
5725
- new_item.parent = RefItem(cref=new_parent_cref)
5726
-
5727
- # add item to parent's children
5728
- path_components = new_parent_cref.split("/")
5729
- num_components = len(path_components)
5730
- parent_node: NodeItem
5731
- if num_components == 3:
5732
- _, parent_key, parent_index_str = path_components
5733
- parent_index = int(parent_index_str)
5734
- parent_node = item_lists[parent_key][parent_index]
5735
- elif num_components == 2 and path_components[1] == "body":
5736
- parent_node = new_body
5737
- else:
5738
- raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
5739
- parent_node.children.append(RefItem(cref=new_cref))
5740
-
5741
- # update document
5742
- self.groups = item_lists["groups"] # type: ignore
5743
- self.texts = item_lists["texts"] # type: ignore
5744
- self.pictures = item_lists["pictures"] # type: ignore
5745
- self.tables = item_lists["tables"] # type: ignore
5746
- self.key_value_items = item_lists["key_value_items"] # type: ignore
5747
- self.form_items = item_lists["form_items"] # type: ignore
5748
- self.body = new_body
5670
+ if isinstance(item, ListGroup):
5671
+ validate_list_group(self, item)
5672
+
5673
+ elif isinstance(item, GroupItem):
5674
+ validate_group(self, item)
5675
+
5676
+ elif isinstance(item, ListItem):
5677
+ validate_list_item(self, item)
5749
5678
 
5750
5679
 
5751
5680
  # deprecated aliases (kept for backwards compatibility):