docling-core 2.44.2__tar.gz → 2.45.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (111)
  1. {docling_core-2.44.2 → docling_core-2.45.0}/PKG-INFO +1 -1
  2. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/html.py +2 -2
  3. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/document.py +165 -236
  4. docling_core-2.45.0/docling_core/types/doc/utils.py +282 -0
  5. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/PKG-INFO +1 -1
  6. {docling_core-2.44.2 → docling_core-2.45.0}/pyproject.toml +1 -1
  7. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_docling_doc.py +119 -1
  8. docling_core-2.44.2/docling_core/types/doc/utils.py +0 -86
  9. {docling_core-2.44.2 → docling_core-2.45.0}/LICENSE +0 -0
  10. {docling_core-2.44.2 → docling_core-2.45.0}/README.md +0 -0
  11. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/__init__.py +0 -0
  12. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/cli/__init__.py +0 -0
  13. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/cli/view.py +0 -0
  14. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/experimental/__init__.py +0 -0
  15. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/py.typed +0 -0
  16. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  17. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  18. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  19. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  20. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  21. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  22. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  23. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  24. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/search/__init__.py +0 -0
  25. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  26. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/search/mapping.py +0 -0
  27. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/search/meta.py +0 -0
  28. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/search/package.py +0 -0
  29. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/__init__.py +0 -0
  30. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/__init__.py +0 -0
  31. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/base.py +0 -0
  32. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  33. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  34. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/page_chunker.py +0 -0
  35. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  36. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  37. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  38. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  39. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/__init__.py +0 -0
  40. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/base.py +0 -0
  41. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/common.py +0 -0
  42. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/doctags.py +0 -0
  43. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/html_styles.py +0 -0
  44. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/serializer/markdown.py +0 -0
  45. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  46. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/base.py +0 -0
  47. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
  48. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  49. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  50. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
  51. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/__init__.py +0 -0
  52. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/base.py +0 -0
  53. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/__init__.py +0 -0
  54. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/base.py +0 -0
  55. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/labels.py +0 -0
  56. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/page.py +0 -0
  57. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/doc/tokens.py +0 -0
  58. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/gen/__init__.py +0 -0
  59. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/gen/generic.py +0 -0
  60. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/io/__init__.py +0 -0
  61. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  62. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/base.py +0 -0
  63. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  64. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  65. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  66. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/document.py +0 -0
  67. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  68. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/nlp/__init__.py +0 -0
  69. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/nlp/qa.py +0 -0
  70. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/nlp/qa_labels.py +0 -0
  71. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/__init__.py +0 -0
  72. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/attribute.py +0 -0
  73. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/base.py +0 -0
  74. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/predicate.py +0 -0
  75. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/record.py +0 -0
  76. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/statement.py +0 -0
  77. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/types/rec/subject.py +0 -0
  78. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/__init__.py +0 -0
  79. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/alias.py +0 -0
  80. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/file.py +0 -0
  81. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/generate_docs.py +0 -0
  82. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/generate_jsonschema.py +0 -0
  83. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/legacy.py +0 -0
  84. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/validate.py +0 -0
  85. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core/utils/validators.py +0 -0
  86. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/SOURCES.txt +0 -0
  87. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/dependency_links.txt +0 -0
  88. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/entry_points.txt +0 -0
  89. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/requires.txt +0 -0
  90. {docling_core-2.44.2 → docling_core-2.45.0}/docling_core.egg-info/top_level.txt +0 -0
  91. {docling_core-2.44.2 → docling_core-2.45.0}/setup.cfg +0 -0
  92. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_base.py +0 -0
  93. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_collection.py +0 -0
  94. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_data_gen_flag.py +0 -0
  95. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_doc_base.py +0 -0
  96. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_doc_legacy_convert.py +0 -0
  97. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_doc_schema.py +0 -0
  98. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_doc_schema_extractor.py +0 -0
  99. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_doctags_load.py +0 -0
  100. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_hierarchical_chunker.py +0 -0
  101. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_hybrid_chunker.py +0 -0
  102. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_json_schema_to_search_mapper.py +0 -0
  103. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_nlp_qa.py +0 -0
  104. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_otsl_table_export.py +0 -0
  105. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_page.py +0 -0
  106. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_page_chunker.py +0 -0
  107. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_rec_schema.py +0 -0
  108. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_search_meta.py +0 -0
  109. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_serialization.py +0 -0
  110. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_utils.py +0 -0
  111. {docling_core-2.44.2 → docling_core-2.45.0}/test/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.44.2
3
+ Version: 2.45.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -1057,7 +1057,7 @@ class HTMLDocSerializer(DocSerializer):
1057
1057
  if self.params.html_head is not None:
1058
1058
  return self.params.html_head
1059
1059
 
1060
- head_parts = ["<head>", '<meta charset="UTF-8">']
1060
+ head_parts = ["<head>", '<meta charset="UTF-8"/>']
1061
1061
 
1062
1062
  # Add metadata if requested
1063
1063
  if params.add_document_metadata:
@@ -1067,7 +1067,7 @@ class HTMLDocSerializer(DocSerializer):
1067
1067
  head_parts.append("<title>Docling Document</title>")
1068
1068
 
1069
1069
  head_parts.append(
1070
- '<meta name="generator" content="Docling HTML Serializer">'
1070
+ '<meta name="generator" content="Docling HTML Serializer"/>'
1071
1071
  )
1072
1072
 
1073
1073
  # Add default styles or custom CSS
@@ -3,7 +3,6 @@
3
3
  import base64
4
4
  import copy
5
5
  import hashlib
6
- import itertools
7
6
  import json
8
7
  import logging
9
8
  import mimetypes
@@ -54,8 +53,8 @@ from docling_core.types.doc.labels import (
54
53
  GroupLabel,
55
54
  PictureClassificationLabel,
56
55
  )
57
- from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
58
- from docling_core.types.doc.utils import relative_path
56
+ from docling_core.types.doc.tokens import DocumentToken, TableToken
57
+ from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
59
58
 
60
59
  _logger = logging.getLogger(__name__)
61
60
 
@@ -4688,181 +4687,6 @@ class DoclingDocument(BaseModel):
4688
4687
  bbox = None
4689
4688
  return caption_item, bbox
4690
4689
 
4691
- def otsl_parse_texts(texts, tokens):
4692
- split_word = TableToken.OTSL_NL.value
4693
- # CLEAN tokens from extra tags, only structural OTSL allowed
4694
- clean_tokens = []
4695
- for t in tokens:
4696
- if t in [
4697
- TableToken.OTSL_ECEL.value,
4698
- TableToken.OTSL_FCEL.value,
4699
- TableToken.OTSL_LCEL.value,
4700
- TableToken.OTSL_UCEL.value,
4701
- TableToken.OTSL_XCEL.value,
4702
- TableToken.OTSL_NL.value,
4703
- TableToken.OTSL_CHED.value,
4704
- TableToken.OTSL_RHED.value,
4705
- TableToken.OTSL_SROW.value,
4706
- ]:
4707
- clean_tokens.append(t)
4708
- tokens = clean_tokens
4709
- split_row_tokens = [
4710
- list(y)
4711
- for x, y in itertools.groupby(tokens, lambda z: z == split_word)
4712
- if not x
4713
- ]
4714
-
4715
- table_cells = []
4716
- r_idx = 0
4717
- c_idx = 0
4718
-
4719
- def count_right(tokens, c_idx, r_idx, which_tokens):
4720
- span = 0
4721
- c_idx_iter = c_idx
4722
- while tokens[r_idx][c_idx_iter] in which_tokens:
4723
- c_idx_iter += 1
4724
- span += 1
4725
- if c_idx_iter >= len(tokens[r_idx]):
4726
- return span
4727
- return span
4728
-
4729
- def count_down(tokens, c_idx, r_idx, which_tokens):
4730
- span = 0
4731
- r_idx_iter = r_idx
4732
- while tokens[r_idx_iter][c_idx] in which_tokens:
4733
- r_idx_iter += 1
4734
- span += 1
4735
- if r_idx_iter >= len(tokens):
4736
- return span
4737
- return span
4738
-
4739
- for i, text in enumerate(texts):
4740
- cell_text = ""
4741
- if text in [
4742
- TableToken.OTSL_FCEL.value,
4743
- TableToken.OTSL_ECEL.value,
4744
- TableToken.OTSL_CHED.value,
4745
- TableToken.OTSL_RHED.value,
4746
- TableToken.OTSL_SROW.value,
4747
- ]:
4748
- row_span = 1
4749
- col_span = 1
4750
- right_offset = 1
4751
- if text != TableToken.OTSL_ECEL.value:
4752
- cell_text = texts[i + 1]
4753
- right_offset = 2
4754
-
4755
- # Check next element(s) for lcel / ucel / xcel,
4756
- # set properly row_span, col_span
4757
- next_right_cell = ""
4758
- if i + right_offset < len(texts):
4759
- next_right_cell = texts[i + right_offset]
4760
-
4761
- next_bottom_cell = ""
4762
- if r_idx + 1 < len(split_row_tokens):
4763
- if c_idx < len(split_row_tokens[r_idx + 1]):
4764
- next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
4765
-
4766
- if next_right_cell in [
4767
- TableToken.OTSL_LCEL.value,
4768
- TableToken.OTSL_XCEL.value,
4769
- ]:
4770
- # we have horisontal spanning cell or 2d spanning cell
4771
- col_span += count_right(
4772
- split_row_tokens,
4773
- c_idx + 1,
4774
- r_idx,
4775
- [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
4776
- )
4777
- if next_bottom_cell in [
4778
- TableToken.OTSL_UCEL.value,
4779
- TableToken.OTSL_XCEL.value,
4780
- ]:
4781
- # we have a vertical spanning cell or 2d spanning cell
4782
- row_span += count_down(
4783
- split_row_tokens,
4784
- c_idx,
4785
- r_idx + 1,
4786
- [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
4787
- )
4788
-
4789
- table_cells.append(
4790
- TableCell(
4791
- text=cell_text.strip(),
4792
- row_span=row_span,
4793
- col_span=col_span,
4794
- start_row_offset_idx=r_idx,
4795
- end_row_offset_idx=r_idx + row_span,
4796
- start_col_offset_idx=c_idx,
4797
- end_col_offset_idx=c_idx + col_span,
4798
- )
4799
- )
4800
- if text in [
4801
- TableToken.OTSL_FCEL.value,
4802
- TableToken.OTSL_ECEL.value,
4803
- TableToken.OTSL_CHED.value,
4804
- TableToken.OTSL_RHED.value,
4805
- TableToken.OTSL_SROW.value,
4806
- TableToken.OTSL_LCEL.value,
4807
- TableToken.OTSL_UCEL.value,
4808
- TableToken.OTSL_XCEL.value,
4809
- ]:
4810
- c_idx += 1
4811
- if text == TableToken.OTSL_NL.value:
4812
- r_idx += 1
4813
- c_idx = 0
4814
- return table_cells, split_row_tokens
4815
-
4816
- def otsl_extract_tokens_and_text(s: str):
4817
- # Pattern to match anything enclosed by < >
4818
- # (including the angle brackets themselves)
4819
- pattern = r"(<[^>]+>)"
4820
- # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
4821
- tokens = re.findall(pattern, s)
4822
- # Remove any tokens that start with "<loc_"
4823
- tokens = [
4824
- token
4825
- for token in tokens
4826
- if not (
4827
- token.startswith(rf"<{_LOC_PREFIX}")
4828
- or token
4829
- in [
4830
- rf"<{DocumentToken.OTSL.value}>",
4831
- rf"</{DocumentToken.OTSL.value}>",
4832
- ]
4833
- )
4834
- ]
4835
- # Split the string by those tokens to get the in-between text
4836
- text_parts = re.split(pattern, s)
4837
- text_parts = [
4838
- token
4839
- for token in text_parts
4840
- if not (
4841
- token.startswith(rf"<{_LOC_PREFIX}")
4842
- or token
4843
- in [
4844
- rf"<{DocumentToken.OTSL.value}>",
4845
- rf"</{DocumentToken.OTSL.value}>",
4846
- ]
4847
- )
4848
- ]
4849
- # Remove any empty or purely whitespace strings from text_parts
4850
- text_parts = [part for part in text_parts if part.strip()]
4851
-
4852
- return tokens, text_parts
4853
-
4854
- def parse_table_content(otsl_content: str) -> TableData:
4855
- tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
4856
- table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
4857
-
4858
- return TableData(
4859
- num_rows=len(split_row_tokens),
4860
- num_cols=(
4861
- max(len(row) for row in split_row_tokens) if split_row_tokens else 0
4862
- ),
4863
- table_cells=table_cells,
4864
- )
4865
-
4866
4690
  def extract_chart_type(text_chunk: str):
4867
4691
  label = None
4868
4692
  chart_labels = [
@@ -5094,7 +4918,7 @@ class DoclingDocument(BaseModel):
5094
4918
  doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)
5095
4919
 
5096
4920
  if tag_name == DocumentToken.OTSL.value:
5097
- table_data = parse_table_content(full_chunk)
4921
+ table_data = parse_otsl_table_content(full_chunk)
5098
4922
  caption, caption_bbox = extract_caption(full_chunk)
5099
4923
  if caption is not None and caption_bbox is not None:
5100
4924
  caption.prov.append(
@@ -5137,7 +4961,7 @@ class DoclingDocument(BaseModel):
5137
4961
  table_data = None
5138
4962
  chart_type = None
5139
4963
  if tag_name == DocumentToken.CHART.value:
5140
- table_data = parse_table_content(full_chunk)
4964
+ table_data = parse_otsl_table_content(full_chunk)
5141
4965
  chart_type = extract_chart_type(full_chunk)
5142
4966
  if image:
5143
4967
  if bbox:
@@ -5683,69 +5507,174 @@ class DoclingDocument(BaseModel):
5683
5507
  )
5684
5508
  return self
5685
5509
 
5510
+ class _DocIndex(BaseModel):
5511
+ """A document merge buffer."""
5512
+
5513
+ groups: list[GroupItem] = []
5514
+ texts: list[TextItem] = []
5515
+ pictures: list[PictureItem] = []
5516
+ tables: list[TableItem] = []
5517
+ key_value_items: list[KeyValueItem] = []
5518
+ form_items: list[FormItem] = []
5519
+
5520
+ pages: dict[int, PageItem] = {}
5521
+
5522
+ _body: Optional[GroupItem] = None
5523
+ _max_page: int = 0
5524
+ _names: list[str] = []
5525
+
5526
+ def get_item_list(self, key: str) -> list[NodeItem]:
5527
+ return getattr(self, key)
5528
+
5529
+ def index(self, doc: "DoclingDocument") -> None:
5530
+
5531
+ orig_ref_to_new_ref: dict[str, str] = {}
5532
+ page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
5533
+
5534
+ if self._body is None:
5535
+ self._body = GroupItem(**doc.body.model_dump(exclude={"children"}))
5536
+
5537
+ self._names.append(doc.name)
5538
+
5539
+ # collect items in traversal order
5540
+ for item, _ in doc.iterate_items(
5541
+ with_groups=True,
5542
+ traverse_pictures=True,
5543
+ included_content_layers={c for c in ContentLayer},
5544
+ ):
5545
+ key = item.self_ref.split("/")[1]
5546
+ is_body = key == "body"
5547
+ new_cref = (
5548
+ "#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}"
5549
+ )
5550
+ # register cref mapping:
5551
+ orig_ref_to_new_ref[item.self_ref] = new_cref
5552
+
5553
+ if not is_body:
5554
+ new_item = copy.deepcopy(item)
5555
+ new_item.children = []
5556
+
5557
+ # put item in the right list
5558
+ self.get_item_list(key).append(new_item)
5559
+
5560
+ # update item's self reference
5561
+ new_item.self_ref = new_cref
5562
+
5563
+ if isinstance(new_item, DocItem):
5564
+ # update page numbers
5565
+ # NOTE other prov sources (e.g. GraphCell) currently not covered
5566
+ for prov in new_item.prov:
5567
+ prov.page_no += page_delta
5568
+
5569
+ if item.parent:
5570
+ # set item's parent
5571
+ new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
5572
+ new_item.parent = RefItem(cref=new_parent_cref)
5573
+
5574
+ # add item to parent's children
5575
+ path_components = new_parent_cref.split("/")
5576
+ num_components = len(path_components)
5577
+ if num_components == 3:
5578
+ _, parent_key, parent_index_str = path_components
5579
+ parent_index = int(parent_index_str)
5580
+ parent_item = self.get_item_list(parent_key)[parent_index]
5581
+
5582
+ # update captions field (not possible in iterate_items order):
5583
+ if isinstance(parent_item, FloatingItem):
5584
+ for cap_it, cap in enumerate(parent_item.captions):
5585
+ if cap.cref == item.self_ref:
5586
+ parent_item.captions[cap_it] = RefItem(
5587
+ cref=new_cref
5588
+ )
5589
+ break
5590
+
5591
+ elif num_components == 2 and path_components[1] == "body":
5592
+ parent_item = self._body
5593
+ else:
5594
+ raise RuntimeError(
5595
+ f"Unsupported ref format: {new_parent_cref}"
5596
+ )
5597
+ parent_item.children.append(RefItem(cref=new_cref))
5598
+
5599
+ # update pages
5600
+ new_max_page = None
5601
+ for page_nr in doc.pages:
5602
+ new_page = copy.deepcopy(doc.pages[page_nr])
5603
+ new_page_nr = page_nr + page_delta
5604
+ new_page.page_no = new_page_nr
5605
+ self.pages[new_page_nr] = new_page
5606
+ if new_max_page is None or new_page_nr > new_max_page:
5607
+ new_max_page = new_page_nr
5608
+ if new_max_page is not None:
5609
+ self._max_page = new_max_page
5610
+
5611
+ def get_name(self) -> str:
5612
+ return " + ".join(self._names)
5613
+
5614
+ def _update_from_index(self, doc_index: "_DocIndex") -> None:
5615
+ if doc_index._body is not None:
5616
+ self.body = doc_index._body
5617
+ self.groups = doc_index.groups
5618
+ self.texts = doc_index.texts
5619
+ self.pictures = doc_index.pictures
5620
+ self.tables = doc_index.tables
5621
+ self.key_value_items = doc_index.key_value_items
5622
+ self.form_items = doc_index.form_items
5623
+ self.pages = doc_index.pages
5624
+ self.name = doc_index.get_name()
5625
+
5686
5626
  def _normalize_references(self) -> None:
5687
- """Normalize ref numbering by ordering node items as per iterate_items()."""
5688
- new_body = GroupItem(**self.body.model_dump(exclude={"children"}))
5689
-
5690
- item_lists: dict[str, list[NodeItem]] = {
5691
- "groups": [],
5692
- "texts": [],
5693
- "pictures": [],
5694
- "tables": [],
5695
- "key_value_items": [],
5696
- "form_items": [],
5697
- }
5698
- orig_ref_to_new_ref: dict[str, str] = {}
5627
+ doc_index = DoclingDocument._DocIndex()
5628
+ doc_index.index(doc=self)
5629
+ self._update_from_index(doc_index)
5630
+
5631
+ @classmethod
5632
+ def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
5633
+ """Concatenate multiple documents into a single document."""
5634
+ doc_index = DoclingDocument._DocIndex()
5635
+ for doc in docs:
5636
+ doc_index.index(doc=doc)
5637
+
5638
+ res_doc = DoclingDocument(name=" + ".join([doc.name for doc in docs]))
5639
+ res_doc._update_from_index(doc_index)
5640
+ return res_doc
5641
+
5642
+ def _validate_rules(self):
5643
+ def validate_list_group(doc: DoclingDocument, item: ListGroup):
5644
+ for ref in item.children:
5645
+ child = ref.resolve(doc)
5646
+ if not isinstance(child, ListItem):
5647
+ raise ValueError(
5648
+ f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
5649
+ )
5650
+
5651
+ def validate_list_item(doc: DoclingDocument, item: ListItem):
5652
+ if item.parent is None:
5653
+ raise ValueError(f"ListItem {item.self_ref} has no parent")
5654
+ if not isinstance(item.parent.resolve(doc), ListGroup):
5655
+ raise ValueError(
5656
+ f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
5657
+ )
5658
+
5659
+ def validate_group(doc: DoclingDocument, item: GroupItem):
5660
+ if (
5661
+ item.parent and not item.children
5662
+ ): # tolerate empty body, but not other groups
5663
+ raise ValueError(f"Group {item.self_ref} has no children")
5699
5664
 
5700
- # collect items in traversal order
5701
5665
  for item, _ in self.iterate_items(
5702
5666
  with_groups=True,
5703
5667
  traverse_pictures=True,
5704
5668
  included_content_layers={c for c in ContentLayer},
5705
5669
  ):
5706
- key = item.self_ref.split("/")[1]
5707
- is_body = key == "body"
5708
- new_cref = "#/body" if is_body else f"#/{key}/{len(item_lists[key])}"
5709
- # register cref mapping:
5710
- orig_ref_to_new_ref[item.self_ref] = new_cref
5711
-
5712
- if not is_body:
5713
- new_item = copy.deepcopy(item)
5714
- new_item.children = []
5715
-
5716
- # put item in the right list
5717
- item_lists[key].append(new_item)
5718
-
5719
- # update item's self reference
5720
- new_item.self_ref = new_cref
5721
-
5722
- if item.parent:
5723
- # set item's parent
5724
- new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
5725
- new_item.parent = RefItem(cref=new_parent_cref)
5726
-
5727
- # add item to parent's children
5728
- path_components = new_parent_cref.split("/")
5729
- num_components = len(path_components)
5730
- parent_node: NodeItem
5731
- if num_components == 3:
5732
- _, parent_key, parent_index_str = path_components
5733
- parent_index = int(parent_index_str)
5734
- parent_node = item_lists[parent_key][parent_index]
5735
- elif num_components == 2 and path_components[1] == "body":
5736
- parent_node = new_body
5737
- else:
5738
- raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
5739
- parent_node.children.append(RefItem(cref=new_cref))
5740
-
5741
- # update document
5742
- self.groups = item_lists["groups"] # type: ignore
5743
- self.texts = item_lists["texts"] # type: ignore
5744
- self.pictures = item_lists["pictures"] # type: ignore
5745
- self.tables = item_lists["tables"] # type: ignore
5746
- self.key_value_items = item_lists["key_value_items"] # type: ignore
5747
- self.form_items = item_lists["form_items"] # type: ignore
5748
- self.body = new_body
5670
+ if isinstance(item, ListGroup):
5671
+ validate_list_group(self, item)
5672
+
5673
+ elif isinstance(item, GroupItem):
5674
+ validate_group(self, item)
5675
+
5676
+ elif isinstance(item, ListItem):
5677
+ validate_list_item(self, item)
5749
5678
 
5750
5679
 
5751
5680
  # deprecated aliases (kept for backwards compatibility):