docling-core 2.47.0__tar.gz → 2.48.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (110)
  1. {docling_core-2.47.0 → docling_core-2.48.0}/PKG-INFO +1 -1
  2. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/chunker/hierarchical_chunker.py +1 -1
  3. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/common.py +1 -0
  4. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/doctags.py +25 -9
  5. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/html.py +89 -84
  6. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/markdown.py +23 -21
  7. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/doc/document.py +2 -1
  8. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core.egg-info/PKG-INFO +1 -1
  9. {docling_core-2.47.0 → docling_core-2.48.0}/pyproject.toml +1 -1
  10. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_docling_doc.py +21 -2
  11. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_serialization.py +98 -65
  12. {docling_core-2.47.0 → docling_core-2.48.0}/LICENSE +0 -0
  13. {docling_core-2.47.0 → docling_core-2.48.0}/README.md +0 -0
  14. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/__init__.py +0 -0
  15. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/cli/__init__.py +0 -0
  16. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/cli/view.py +0 -0
  17. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/experimental/__init__.py +0 -0
  18. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/py.typed +0 -0
  19. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  20. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  21. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  22. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  23. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  24. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  25. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  26. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  27. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/search/__init__.py +0 -0
  28. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  29. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/search/mapping.py +0 -0
  30. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/search/meta.py +0 -0
  31. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/search/package.py +0 -0
  32. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/__init__.py +0 -0
  33. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/chunker/__init__.py +0 -0
  34. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/chunker/base.py +0 -0
  35. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  36. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/chunker/page_chunker.py +0 -0
  37. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  38. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  39. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  40. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  41. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/__init__.py +0 -0
  42. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/base.py +0 -0
  43. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/html_styles.py +0 -0
  44. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  45. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/visualizer/base.py +0 -0
  46. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
  47. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  48. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  49. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
  50. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/__init__.py +0 -0
  51. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/base.py +0 -0
  52. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/doc/__init__.py +0 -0
  53. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/doc/base.py +0 -0
  54. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/doc/labels.py +0 -0
  55. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/doc/page.py +0 -0
  56. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/doc/tokens.py +0 -0
  57. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/doc/utils.py +0 -0
  58. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/gen/__init__.py +0 -0
  59. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/gen/generic.py +0 -0
  60. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/io/__init__.py +0 -0
  61. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  62. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/legacy_doc/base.py +0 -0
  63. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  64. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  65. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  66. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/legacy_doc/document.py +0 -0
  67. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  68. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/nlp/__init__.py +0 -0
  69. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/nlp/qa.py +0 -0
  70. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/nlp/qa_labels.py +0 -0
  71. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/rec/__init__.py +0 -0
  72. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/rec/attribute.py +0 -0
  73. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/rec/base.py +0 -0
  74. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/rec/predicate.py +0 -0
  75. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/rec/record.py +0 -0
  76. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/rec/statement.py +0 -0
  77. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/rec/subject.py +0 -0
  78. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/utils/__init__.py +0 -0
  79. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/utils/alias.py +0 -0
  80. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/utils/file.py +0 -0
  81. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/utils/generate_docs.py +0 -0
  82. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/utils/generate_jsonschema.py +0 -0
  83. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/utils/legacy.py +0 -0
  84. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/utils/validate.py +0 -0
  85. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core/utils/validators.py +0 -0
  86. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core.egg-info/SOURCES.txt +0 -0
  87. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core.egg-info/dependency_links.txt +0 -0
  88. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core.egg-info/entry_points.txt +0 -0
  89. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core.egg-info/requires.txt +0 -0
  90. {docling_core-2.47.0 → docling_core-2.48.0}/docling_core.egg-info/top_level.txt +0 -0
  91. {docling_core-2.47.0 → docling_core-2.48.0}/setup.cfg +0 -0
  92. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_base.py +0 -0
  93. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_collection.py +0 -0
  94. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_data_gen_flag.py +0 -0
  95. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_doc_base.py +0 -0
  96. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_doc_legacy_convert.py +0 -0
  97. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_doc_schema.py +0 -0
  98. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_doc_schema_extractor.py +0 -0
  99. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_doctags_load.py +0 -0
  100. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_hierarchical_chunker.py +0 -0
  101. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_hybrid_chunker.py +0 -0
  102. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_json_schema_to_search_mapper.py +0 -0
  103. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_nlp_qa.py +0 -0
  104. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_otsl_table_export.py +0 -0
  105. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_page.py +0 -0
  106. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_page_chunker.py +0 -0
  107. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_rec_schema.py +0 -0
  108. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_search_meta.py +0 -0
  109. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_utils.py +0 -0
  110. {docling_core-2.47.0 → docling_core-2.48.0}/test/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.47.0
3
+ Version: 2.48.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -145,7 +145,7 @@ class TripletTableSerializer(BaseTableSerializer):
145
145
  parts.append(cap_res)
146
146
 
147
147
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
148
- table_df = item.export_to_dataframe()
148
+ table_df = item.export_to_dataframe(doc)
149
149
  if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
150
150
 
151
151
  # copy header as first row and shift all rows by one
@@ -394,6 +394,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
394
394
  item=item,
395
395
  doc_serializer=self,
396
396
  doc=self.doc,
397
+ visited=my_visited,
397
398
  **my_kwargs,
398
399
  )
399
400
  return part
@@ -32,6 +32,7 @@ from docling_core.types.doc.document import (
32
32
  DoclingDocument,
33
33
  FloatingItem,
34
34
  FormItem,
35
+ GroupItem,
35
36
  InlineGroup,
36
37
  KeyValueItem,
37
38
  ListGroup,
@@ -42,6 +43,7 @@ from docling_core.types.doc.document import (
42
43
  PictureMoleculeData,
43
44
  PictureTabularChartData,
44
45
  ProvenanceItem,
46
+ SectionHeaderItem,
45
47
  TableItem,
46
48
  TextItem,
47
49
  )
@@ -94,11 +96,11 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
94
96
  item: TextItem,
95
97
  doc_serializer: BaseDocSerializer,
96
98
  doc: DoclingDocument,
99
+ visited: Optional[set[str]] = None,
97
100
  **kwargs: Any,
98
101
  ) -> SerializationResult:
99
102
  """Serializes the passed item."""
100
- from docling_core.types.doc.document import SectionHeaderItem
101
-
103
+ my_visited = visited if visited is not None else set()
102
104
  params = DocTagsParams(**kwargs)
103
105
  wrap_tag: Optional[str] = DocumentToken.create_token_name_from_doc_item_label(
104
106
  label=item.label,
@@ -116,12 +118,21 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
116
118
  parts.append(location)
117
119
 
118
120
  if params.add_content:
119
- text_part = item.text
120
- text_part = doc_serializer.post_process(
121
- text=text_part,
122
- formatting=item.formatting,
123
- hyperlink=item.hyperlink,
124
- )
121
+ if (
122
+ item.text == ""
123
+ and len(item.children) == 1
124
+ and isinstance(
125
+ (child_group := item.children[0].resolve(doc)), InlineGroup
126
+ )
127
+ ):
128
+ ser_res = doc_serializer.serialize(item=child_group, visited=my_visited)
129
+ text_part = ser_res.text
130
+ else:
131
+ text_part = doc_serializer.post_process(
132
+ text=item.text,
133
+ formatting=item.formatting,
134
+ hyperlink=item.hyperlink,
135
+ )
125
136
 
126
137
  if isinstance(item, CodeItem):
127
138
  language_token = DocumentToken.get_code_language_token(
@@ -506,7 +517,12 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
506
517
  **kwargs: Any,
507
518
  ) -> SerializationResult:
508
519
  """Serializes the passed item."""
509
- return create_ser_result()
520
+ if isinstance(item, GroupItem):
521
+ parts = doc_serializer.get_parts(item=item, **kwargs)
522
+ text_res = "\n".join([p.text for p in parts if p.text])
523
+ return create_ser_result(text=text_res, span_source=parts)
524
+ else:
525
+ return create_ser_result()
510
526
 
511
527
 
512
528
  class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
55
55
  FormItem,
56
56
  FormulaItem,
57
57
  GraphData,
58
+ GroupItem,
58
59
  ImageRef,
59
60
  InlineGroup,
60
61
  KeyValueItem,
@@ -139,21 +140,34 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
139
140
  res_parts: list[SerializationResult] = []
140
141
  post_processed = False
141
142
 
142
- # Prepare the HTML based on item type
143
- if isinstance(item, TitleItem):
144
- text_inner = self._prepare_content(item.text)
145
- text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
143
+ has_inline_repr = (
144
+ item.text == ""
145
+ and len(item.children) == 1
146
+ and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
147
+ )
148
+ if has_inline_repr:
149
+ text = doc_serializer.serialize(item=child_group, visited=my_visited).text
150
+ post_processed = True
151
+ else:
152
+ text = item.text
153
+ if not isinstance(item, (CodeItem, FormulaItem)):
154
+ text = html.escape(text, quote=False)
155
+ text = text.replace("\n", "<br>")
146
156
 
147
- elif isinstance(item, SectionHeaderItem):
148
- section_level = min(item.level + 1, 6)
149
- text_inner = self._prepare_content(item.text)
157
+ # Prepare the HTML based on item type
158
+ if isinstance(item, (TitleItem, SectionHeaderItem)):
159
+ section_level = (
160
+ min(item.level + 1, 6) if isinstance(item, SectionHeaderItem) else 1
161
+ )
150
162
  text = get_html_tag_with_text_direction(
151
- html_tag=f"h{section_level}", text=text_inner
163
+ html_tag=f"h{section_level}", text=text
152
164
  )
153
165
 
154
166
  elif isinstance(item, FormulaItem):
155
167
  text = self._process_formula(
156
168
  item=item,
169
+ text=text,
170
+ orig=item.orig,
157
171
  doc=doc,
158
172
  image_mode=params.image_mode,
159
173
  formula_to_mathml=params.formula_to_mathml,
@@ -161,19 +175,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
161
175
  )
162
176
 
163
177
  elif isinstance(item, CodeItem):
164
- text = self._process_code(item=item, is_inline_scope=is_inline_scope)
178
+ text = (
179
+ f"<code>{text}</code>"
180
+ if is_inline_scope
181
+ else f"<pre><code>{text}</code></pre>"
182
+ )
165
183
 
166
184
  elif isinstance(item, ListItem):
167
185
  # List items are handled by list serializer
168
186
  text_parts: list[str] = []
169
- if item_text := self._prepare_content(item.text):
170
- item_text = doc_serializer.post_process(
171
- text=item_text,
172
- formatting=item.formatting,
173
- hyperlink=item.hyperlink,
174
- )
175
- post_processed = True
176
- text_parts.append(item_text)
187
+ if text:
188
+ if has_inline_repr:
189
+ text = f"\n{text}\n"
190
+ else:
191
+ text = doc_serializer.post_process(
192
+ text=text,
193
+ formatting=item.formatting,
194
+ hyperlink=item.hyperlink,
195
+ )
196
+ post_processed = True
197
+ text_parts.append(text)
177
198
  nested_parts = [
178
199
  r.text
179
200
  for r in doc_serializer.get_parts(
@@ -184,29 +205,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
184
205
  )
185
206
  ]
186
207
  text_parts.extend(nested_parts)
187
- text_inner = "\n".join(text_parts)
208
+ text = "\n".join(text_parts)
188
209
  if nested_parts:
189
- text_inner = f"\n{text_inner}\n"
210
+ text = f"\n{text}\n"
190
211
  text = (
191
212
  get_html_tag_with_text_direction(
192
213
  html_tag="li",
193
- text=text_inner,
214
+ text=text,
194
215
  attrs=(
195
216
  {"style": f"list-style-type: '{item.marker} ';"}
196
217
  if params.show_original_list_item_marker and item.marker
197
218
  else {}
198
219
  ),
199
220
  )
200
- if text_inner
221
+ if text
201
222
  else ""
202
223
  )
203
224
 
204
- elif is_inline_scope:
205
- text = self._prepare_content(item.text)
206
- else:
225
+ elif not is_inline_scope:
207
226
  # Regular text item
208
- text_inner = self._prepare_content(item.text)
209
- text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
227
+ text = get_html_tag_with_text_direction(html_tag="p", text=text)
210
228
 
211
229
  # Apply formatting and hyperlinks
212
230
  if not post_processed:
@@ -227,66 +245,44 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
227
245
 
228
246
  return create_ser_result(text=text, span_source=res_parts)
229
247
 
230
- def _prepare_content(
231
- self, text: str, do_escape_html=True, do_replace_newline=True
232
- ) -> str:
233
- """Prepare text content for HTML inclusion."""
234
- if do_escape_html:
235
- text = html.escape(text, quote=False)
236
- if do_replace_newline:
237
- text = text.replace("\n", "<br>")
238
- return text
239
-
240
- def _process_code(
241
- self,
242
- item: CodeItem,
243
- is_inline_scope: bool,
244
- ) -> str:
245
- code_text = self._prepare_content(
246
- item.text, do_escape_html=False, do_replace_newline=False
247
- )
248
- if is_inline_scope:
249
- text = f"<code>{code_text}</code>"
250
- else:
251
- text = f"<pre><code>{code_text}</code></pre>"
252
-
253
- return text
254
-
255
248
  def _process_formula(
256
249
  self,
257
- item: FormulaItem,
250
+ *,
251
+ item: DocItem,
252
+ text: str,
253
+ orig: str,
258
254
  doc: DoclingDocument,
259
255
  image_mode: ImageRefMode,
260
256
  formula_to_mathml: bool,
261
257
  is_inline_scope: bool,
262
258
  ) -> str:
263
259
  """Process a formula item to HTML/MathML."""
264
- math_formula = self._prepare_content(
265
- item.text, do_escape_html=False, do_replace_newline=False
266
- )
267
-
268
260
  # If formula is empty, try to use an image fallback
269
- if item.text == "" and item.orig != "":
270
- img_fallback = self._get_formula_image_fallback(item, doc)
271
- if (
272
- image_mode == ImageRefMode.EMBEDDED
273
- and len(item.prov) > 0
274
- and img_fallback
275
- ):
276
- return img_fallback
261
+ if (
262
+ text == ""
263
+ and orig != ""
264
+ and len(item.prov) > 0
265
+ and image_mode == ImageRefMode.EMBEDDED
266
+ and (
267
+ img_fallback := self._get_formula_image_fallback(
268
+ item=item, orig=orig, doc=doc
269
+ )
270
+ )
271
+ ):
272
+ return img_fallback
277
273
 
278
274
  # Try to generate MathML
279
- if formula_to_mathml and math_formula:
275
+ elif formula_to_mathml and text:
280
276
  try:
281
277
  # Set display mode based on context
282
278
  display_mode = "inline" if is_inline_scope else "block"
283
279
  mathml_element = latex2mathml.converter.convert_to_element(
284
- math_formula, display=display_mode
280
+ text, display=display_mode
285
281
  )
286
282
  annotation = SubElement(
287
283
  mathml_element, "annotation", dict(encoding="TeX")
288
284
  )
289
- annotation.text = math_formula
285
+ annotation.text = text
290
286
  mathml = unescape(tostring(mathml_element, encoding="unicode"))
291
287
 
292
288
  # Don't wrap in div for inline formulas
@@ -296,40 +292,40 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
296
292
  return f"<div>{mathml}</div>"
297
293
 
298
294
  except Exception:
299
- img_fallback = self._get_formula_image_fallback(item, doc)
295
+ img_fallback = self._get_formula_image_fallback(
296
+ item=item, orig=orig, doc=doc
297
+ )
300
298
  if (
301
299
  image_mode == ImageRefMode.EMBEDDED
302
300
  and len(item.prov) > 0
303
301
  and img_fallback
304
302
  ):
305
303
  return img_fallback
306
- elif math_formula:
307
- return f"<pre>{math_formula}</pre>"
304
+ elif text:
305
+ return f"<pre>{text}</pre>"
308
306
  else:
309
307
  return "<pre>Formula not decoded</pre>"
310
308
 
311
309
  _logger.warning("Could not parse formula with MathML")
312
310
 
313
311
  # Fallback options if we got here
314
- if math_formula and is_inline_scope:
315
- return f"<code>{math_formula}</code>"
316
- elif math_formula and (not is_inline_scope):
317
- f"<pre>{math_formula}</pre>"
312
+ if text and is_inline_scope:
313
+ return f"<code>{text}</code>"
314
+ elif text and (not is_inline_scope):
315
+ f"<pre>{text}</pre>"
318
316
  elif is_inline_scope:
319
317
  return '<span class="formula-not-decoded">Formula not decoded</span>'
320
318
 
321
319
  return '<div class="formula-not-decoded">Formula not decoded</div>'
322
320
 
323
321
  def _get_formula_image_fallback(
324
- self, item: TextItem, doc: DoclingDocument
322
+ self, *, item: DocItem, orig: str, doc: DoclingDocument
325
323
  ) -> Optional[str]:
326
324
  """Try to get an image fallback for a formula."""
327
325
  item_image = item.get_image(doc=doc)
328
326
  if item_image is not None:
329
327
  img_ref = ImageRef.from_pil(item_image, dpi=72)
330
- return (
331
- "<figure>" f'<img src="{img_ref.uri}" alt="{item.orig}" />' "</figure>"
332
- )
328
+ return "<figure>" f'<img src="{img_ref.uri}" alt="{orig}" />' "</figure>"
333
329
  return None
334
330
 
335
331
 
@@ -792,21 +788,30 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
792
788
  """HTML-specific fallback serializer."""
793
789
 
794
790
  @override
795
- def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
791
+ def serialize(
792
+ self,
793
+ *,
794
+ item: NodeItem,
795
+ doc_serializer: "BaseDocSerializer",
796
+ doc: DoclingDocument,
797
+ **kwargs: Any,
798
+ ) -> SerializationResult:
796
799
  """Fallback serializer for items not handled by other serializers."""
797
- if isinstance(item, DocItem):
800
+ if isinstance(item, GroupItem):
801
+ parts = doc_serializer.get_parts(item=item, **kwargs)
802
+ text_res = "\n".join([p.text for p in parts if p.text])
803
+ return create_ser_result(text=text_res, span_source=parts)
804
+ else:
798
805
  return create_ser_result(
799
806
  text=f"<!-- Unhandled item type: {item.__class__.__name__} -->",
800
- span_source=item,
807
+ span_source=item if isinstance(item, DocItem) else [],
801
808
  )
802
- else:
803
- # For group items, we don't generate any markup
804
- return create_ser_result()
805
809
 
806
810
 
807
811
  class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
808
812
  """HTML-specific annotation serializer."""
809
813
 
814
+ @override
810
815
  def serialize(
811
816
  self,
812
817
  *,
@@ -45,6 +45,7 @@ from docling_core.types.doc.document import (
45
45
  Formatting,
46
46
  FormItem,
47
47
  FormulaItem,
48
+ GroupItem,
48
49
  ImageRef,
49
50
  InlineGroup,
50
51
  KeyValueItem,
@@ -124,26 +125,24 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
124
125
  my_visited = visited if visited is not None else set()
125
126
  params = MarkdownParams(**kwargs)
126
127
  res_parts: list[SerializationResult] = []
127
- text = item.text
128
128
  escape_html = True
129
129
  escape_underscores = True
130
- processing_pending = True
131
- if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
132
- # case where processing/formatting should be applied first (in inner scope)
130
+
131
+ has_inline_repr = (
132
+ item.text == ""
133
+ and len(item.children) == 1
134
+ and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
135
+ )
136
+ if has_inline_repr:
137
+ text = doc_serializer.serialize(item=child_group, visited=my_visited).text
133
138
  processing_pending = False
134
- if (
135
- text == ""
136
- and len(item.children) == 1
137
- and isinstance(
138
- (child_group := item.children[0].resolve(doc)), InlineGroup
139
- )
140
- ):
141
- # case of inline within heading / list item
142
- ser_res = doc_serializer.serialize(item=child_group)
143
- text = ser_res.text
144
- for span in ser_res.spans:
145
- my_visited.add(span.item.self_ref)
146
- else:
139
+ else:
140
+ text = item.text
141
+ processing_pending = True
142
+
143
+ if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
144
+ if not has_inline_repr:
145
+ # case where processing/formatting should be applied first (in inner scope)
147
146
  text = doc_serializer.post_process(
148
147
  text=text,
149
148
  escape_html=escape_html,
@@ -151,6 +150,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
151
150
  formatting=item.formatting,
152
151
  hyperlink=item.hyperlink,
153
152
  )
153
+ processing_pending = False
154
154
 
155
155
  if isinstance(item, ListItem):
156
156
  pieces: list[str] = []
@@ -600,13 +600,15 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
600
600
  **kwargs: Any,
601
601
  ) -> SerializationResult:
602
602
  """Serializes the passed item."""
603
- if isinstance(item, DocItem):
603
+ if isinstance(item, GroupItem):
604
+ parts = doc_serializer.get_parts(item=item, **kwargs)
605
+ text_res = "\n\n".join([p.text for p in parts if p.text])
606
+ return create_ser_result(text=text_res, span_source=parts)
607
+ else:
604
608
  return create_ser_result(
605
609
  text="<!-- missing-text -->",
606
- span_source=item,
610
+ span_source=item if isinstance(item, DocItem) else [],
607
611
  )
608
- else:
609
- return create_ser_result()
610
612
 
611
613
 
612
614
  class MarkdownDocSerializer(DocSerializer):
@@ -60,7 +60,7 @@ _logger = logging.getLogger(__name__)
60
60
 
61
61
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
62
62
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
63
- CURRENT_VERSION: Final = "1.6.0"
63
+ CURRENT_VERSION: Final = "1.7.0"
64
64
 
65
65
  DEFAULT_EXPORT_LABELS = {
66
66
  DocItemLabel.TITLE,
@@ -310,6 +310,7 @@ class TableCell(BaseModel):
310
310
  column_header: bool = False
311
311
  row_header: bool = False
312
312
  row_section: bool = False
313
+ fillable: bool = False
313
314
 
314
315
  @model_validator(mode="before")
315
316
  @classmethod
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.47.0
3
+ Version: 2.48.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling-core"
3
- version = "2.47.0" # DO NOT EDIT, updated automatically
3
+ version = "2.48.0" # DO NOT EDIT, updated automatically
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  license-files = ["LICENSE"]
@@ -734,7 +734,7 @@ def _test_export_methods(
734
734
  for table in doc.tables:
735
735
  table.export_to_markdown()
736
736
  table.export_to_html(doc)
737
- table.export_to_dataframe()
737
+ table.export_to_dataframe(doc)
738
738
  table.export_to_doctags(doc)
739
739
 
740
740
  # Test Images export ...
@@ -2102,7 +2102,7 @@ def _construct_rich_table_doc():
2102
2102
 
2103
2103
  table_item = doc.add_table(
2104
2104
  data=TableData(
2105
- num_rows=4,
2105
+ num_rows=5,
2106
2106
  num_cols=2,
2107
2107
  ),
2108
2108
  )
@@ -2121,6 +2121,17 @@ def _construct_rich_table_doc():
2121
2121
  rich_item_3 = doc.add_table(
2122
2122
  data=TableData(num_rows=2, num_cols=3), parent=table_item
2123
2123
  )
2124
+
2125
+ rich_item_4 = doc.add_group(parent=table_item, label=GroupLabel.UNSPECIFIED)
2126
+ doc.add_text(
2127
+ parent=rich_item_4,
2128
+ text="Some text in a generic group.",
2129
+ label=DocItemLabel.TEXT,
2130
+ )
2131
+ doc.add_text(
2132
+ parent=rich_item_4, text="More text in the group.", label=DocItemLabel.TEXT
2133
+ )
2134
+
2124
2135
  for i in range(rich_item_3.data.num_rows):
2125
2136
  for j in range(rich_item_3.data.num_cols):
2126
2137
  cell = TableCell(
@@ -2158,6 +2169,14 @@ def _construct_rich_table_doc():
2158
2169
  end_col_offset_idx=j + 1,
2159
2170
  ref=rich_item_3.get_ref(),
2160
2171
  )
2172
+ elif i == 4 and j == 0:
2173
+ cell = RichTableCell(
2174
+ start_row_offset_idx=i,
2175
+ end_row_offset_idx=i + 1,
2176
+ start_col_offset_idx=j,
2177
+ end_col_offset_idx=j + 1,
2178
+ ref=rich_item_4.get_ref(),
2179
+ )
2161
2180
  else:
2162
2181
  cell = TableCell(
2163
2182
  start_row_offset_idx=i,
@@ -85,6 +85,11 @@ def verify(exp_file: Path, actual: str):
85
85
  assert expected == actual
86
86
 
87
87
 
88
+ # ===============================
89
+ # Markdown tests
90
+ # ===============================
91
+
92
+
88
93
  def test_md_cross_page_list_page_break():
89
94
  src = Path("./test/data/doc/activities.json")
90
95
  doc = DoclingDocument.load_from_json(src)
@@ -99,7 +104,7 @@ def test_md_cross_page_list_page_break():
99
104
  ),
100
105
  )
101
106
  actual = ser.serialize().text
102
- verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
107
+ verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
103
108
 
104
109
 
105
110
  def test_md_cross_page_list_page_break_none():
@@ -170,20 +175,6 @@ def test_md_cross_page_list_page_break_p2():
170
175
  verify(exp_file=src.parent / f"{src.stem}_p2.gt.md", actual=actual)
171
176
 
172
177
 
173
- def test_html_charts():
174
- src = Path("./test/data/doc/barchart.json")
175
- doc = DoclingDocument.load_from_json(src)
176
-
177
- ser = HTMLDocSerializer(
178
- doc=doc,
179
- params=HTMLParams(
180
- image_mode=ImageRefMode.PLACEHOLDER,
181
- ),
182
- )
183
- actual = ser.serialize().text
184
- verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
185
-
186
-
187
178
  def test_md_charts():
188
179
  src = Path("./test/data/doc/barchart.json")
189
180
  doc = DoclingDocument.load_from_json(src)
@@ -195,7 +186,7 @@ def test_md_charts():
195
186
  ),
196
187
  )
197
188
  actual = ser.serialize().text
198
- verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
189
+ verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
199
190
 
200
191
 
201
192
  def test_md_inline_and_formatting():
@@ -209,51 +200,7 @@ def test_md_inline_and_formatting():
209
200
  ),
210
201
  )
211
202
  actual = ser.serialize().text
212
- verify(exp_file=src.parent / f"{src.stem}.md", actual=actual)
213
-
214
-
215
- def test_html_cross_page_list_page_break():
216
- src = Path("./test/data/doc/activities.json")
217
- doc = DoclingDocument.load_from_json(src)
218
-
219
- ser = HTMLDocSerializer(
220
- doc=doc,
221
- params=HTMLParams(
222
- image_mode=ImageRefMode.PLACEHOLDER,
223
- ),
224
- )
225
- actual = ser.serialize().text
226
- verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
227
-
228
-
229
- def test_html_cross_page_list_page_break_p1():
230
- src = Path("./test/data/doc/activities.json")
231
- doc = DoclingDocument.load_from_json(src)
232
-
233
- ser = HTMLDocSerializer(
234
- doc=doc,
235
- params=HTMLParams(
236
- image_mode=ImageRefMode.PLACEHOLDER,
237
- pages={1},
238
- ),
239
- )
240
- actual = ser.serialize().text
241
- verify(exp_file=src.parent / f"{src.stem}_p1.gt.html", actual=actual)
242
-
243
-
244
- def test_html_cross_page_list_page_break_p2():
245
- src = Path("./test/data/doc/activities.json")
246
- doc = DoclingDocument.load_from_json(src)
247
-
248
- ser = HTMLDocSerializer(
249
- doc=doc,
250
- params=HTMLParams(
251
- image_mode=ImageRefMode.PLACEHOLDER,
252
- pages={2},
253
- ),
254
- )
255
- actual = ser.serialize().text
256
- verify(exp_file=src.parent / f"{src.stem}_p2.gt.html", actual=actual)
203
+ verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
257
204
 
258
205
 
259
206
  def test_md_pb_placeholder_and_page_filter():
@@ -269,7 +216,7 @@ def test_md_pb_placeholder_and_page_filter():
269
216
  ),
270
217
  )
271
218
  actual = ser.serialize().text
272
- verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
219
+ verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
273
220
 
274
221
 
275
222
  def test_md_list_item_markers():
@@ -358,7 +305,7 @@ def test_md_nested_lists():
358
305
 
359
306
  ser = MarkdownDocSerializer(doc=doc)
360
307
  actual = ser.serialize().text
361
- verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
308
+ verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
362
309
 
363
310
 
364
311
  def test_md_rich_table():
@@ -370,6 +317,69 @@ def test_md_rich_table():
370
317
  verify(exp_file=exp_file, actual=actual)
371
318
 
372
319
 
320
+ # ===============================
321
+ # HTML tests
322
+ # ===============================
323
+
324
+
325
+ def test_html_charts():
326
+ src = Path("./test/data/doc/barchart.json")
327
+ doc = DoclingDocument.load_from_json(src)
328
+
329
+ ser = HTMLDocSerializer(
330
+ doc=doc,
331
+ params=HTMLParams(
332
+ image_mode=ImageRefMode.PLACEHOLDER,
333
+ ),
334
+ )
335
+ actual = ser.serialize().text
336
+ verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
337
+
338
+
339
+ def test_html_cross_page_list_page_break():
340
+ src = Path("./test/data/doc/activities.json")
341
+ doc = DoclingDocument.load_from_json(src)
342
+
343
+ ser = HTMLDocSerializer(
344
+ doc=doc,
345
+ params=HTMLParams(
346
+ image_mode=ImageRefMode.PLACEHOLDER,
347
+ ),
348
+ )
349
+ actual = ser.serialize().text
350
+ verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
351
+
352
+
353
+ def test_html_cross_page_list_page_break_p1():
354
+ src = Path("./test/data/doc/activities.json")
355
+ doc = DoclingDocument.load_from_json(src)
356
+
357
+ ser = HTMLDocSerializer(
358
+ doc=doc,
359
+ params=HTMLParams(
360
+ image_mode=ImageRefMode.PLACEHOLDER,
361
+ pages={1},
362
+ ),
363
+ )
364
+ actual = ser.serialize().text
365
+ verify(exp_file=src.parent / f"{src.stem}_p1.gt.html", actual=actual)
366
+
367
+
368
+ def test_html_cross_page_list_page_break_p2():
369
+ src = Path("./test/data/doc/activities.json")
370
+ doc = DoclingDocument.load_from_json(src)
371
+
372
+ ser = HTMLDocSerializer(
373
+ doc=doc,
374
+ params=HTMLParams(
375
+ image_mode=ImageRefMode.PLACEHOLDER,
376
+ pages={2},
377
+ ),
378
+ )
379
+ actual = ser.serialize().text
380
+ verify(exp_file=src.parent / f"{src.stem}_p2.gt.html", actual=actual)
381
+
382
+
373
383
  def test_html_split_page():
374
384
  src = Path("./test/data/doc/2408.09869v3_enriched.json")
375
385
  doc = DoclingDocument.load_from_json(src)
@@ -506,7 +516,7 @@ def test_html_nested_lists():
506
516
 
507
517
  ser = HTMLDocSerializer(doc=doc)
508
518
  actual = ser.serialize().text
509
- verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
519
+ verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
510
520
 
511
521
 
512
522
  def test_html_rich_table():
@@ -518,13 +528,27 @@ def test_html_rich_table():
518
528
  verify(exp_file=exp_file, actual=actual)
519
529
 
520
530
 
531
+ def test_html_inline_and_formatting():
532
+ src = Path("./test/data/doc/inline_and_formatting.yaml")
533
+ doc = DoclingDocument.load_from_yaml(src)
534
+
535
+ ser = HTMLDocSerializer(doc=doc)
536
+ actual = ser.serialize().text
537
+ verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
538
+
539
+
540
+ # ===============================
541
+ # DocTags tests
542
+ # ===============================
543
+
544
+
521
545
  def test_doctags_inline_loc_tags():
522
546
  src = Path("./test/data/doc/2408.09869v3_enriched.json")
523
547
  doc = DoclingDocument.load_from_json(src)
524
548
 
525
549
  ser = DocTagsDocSerializer(doc=doc)
526
550
  actual = ser.serialize().text
527
- verify(exp_file=src.parent / f"{src.stem}.out.dt", actual=actual)
551
+ verify(exp_file=src.with_suffix(".out.dt"), actual=actual)
528
552
 
529
553
 
530
554
  def test_doctags_rich_table():
@@ -535,3 +559,12 @@ def test_doctags_rich_table():
535
559
  ser = DocTagsDocSerializer(doc=doc)
536
560
  actual = ser.serialize().text
537
561
  verify(exp_file=exp_file, actual=actual)
562
+
563
+
564
+ def test_doctags_inline_and_formatting():
565
+ src = Path("./test/data/doc/inline_and_formatting.yaml")
566
+ doc = DoclingDocument.load_from_yaml(src)
567
+
568
+ ser = DocTagsDocSerializer(doc=doc)
569
+ actual = ser.serialize().text
570
+ verify(exp_file=src.with_suffix(".gt.dt"), actual=actual)
File without changes
File without changes
File without changes