docling-core 2.47.0 (tar.gz) → 2.48.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (110)
  1. {docling_core-2.47.0 → docling_core-2.48.1}/PKG-INFO +1 -1
  2. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/hierarchical_chunker.py +1 -1
  3. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/common.py +1 -0
  4. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/doctags.py +25 -9
  5. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/html.py +89 -84
  6. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/markdown.py +24 -22
  7. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/document.py +2 -1
  8. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/PKG-INFO +1 -1
  9. {docling_core-2.47.0 → docling_core-2.48.1}/pyproject.toml +1 -1
  10. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_docling_doc.py +21 -2
  11. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_serialization.py +128 -66
  12. {docling_core-2.47.0 → docling_core-2.48.1}/LICENSE +0 -0
  13. {docling_core-2.47.0 → docling_core-2.48.1}/README.md +0 -0
  14. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/__init__.py +0 -0
  15. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/cli/__init__.py +0 -0
  16. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/cli/view.py +0 -0
  17. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/experimental/__init__.py +0 -0
  18. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/py.typed +0 -0
  19. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  20. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  21. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  22. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  23. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  24. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  25. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  26. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  27. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/__init__.py +0 -0
  28. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  29. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/mapping.py +0 -0
  30. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/meta.py +0 -0
  31. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/search/package.py +0 -0
  32. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/__init__.py +0 -0
  33. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/__init__.py +0 -0
  34. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/base.py +0 -0
  35. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  36. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/page_chunker.py +0 -0
  37. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  38. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  39. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  40. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  41. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/__init__.py +0 -0
  42. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/base.py +0 -0
  43. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/serializer/html_styles.py +0 -0
  44. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/__init__.py +0 -0
  45. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/base.py +0 -0
  46. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
  47. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  48. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  49. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
  50. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/__init__.py +0 -0
  51. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/base.py +0 -0
  52. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/__init__.py +0 -0
  53. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/base.py +0 -0
  54. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/labels.py +0 -0
  55. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/page.py +0 -0
  56. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/tokens.py +0 -0
  57. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/doc/utils.py +0 -0
  58. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/gen/__init__.py +0 -0
  59. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/gen/generic.py +0 -0
  60. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/io/__init__.py +0 -0
  61. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/__init__.py +0 -0
  62. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/base.py +0 -0
  63. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  64. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  65. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  66. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/document.py +0 -0
  67. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/legacy_doc/tokens.py +0 -0
  68. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/nlp/__init__.py +0 -0
  69. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/nlp/qa.py +0 -0
  70. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/nlp/qa_labels.py +0 -0
  71. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/__init__.py +0 -0
  72. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/attribute.py +0 -0
  73. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/base.py +0 -0
  74. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/predicate.py +0 -0
  75. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/record.py +0 -0
  76. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/statement.py +0 -0
  77. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/types/rec/subject.py +0 -0
  78. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/__init__.py +0 -0
  79. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/alias.py +0 -0
  80. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/file.py +0 -0
  81. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/generate_docs.py +0 -0
  82. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/generate_jsonschema.py +0 -0
  83. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/legacy.py +0 -0
  84. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/validate.py +0 -0
  85. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core/utils/validators.py +0 -0
  86. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/SOURCES.txt +0 -0
  87. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/dependency_links.txt +0 -0
  88. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/entry_points.txt +0 -0
  89. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/requires.txt +0 -0
  90. {docling_core-2.47.0 → docling_core-2.48.1}/docling_core.egg-info/top_level.txt +0 -0
  91. {docling_core-2.47.0 → docling_core-2.48.1}/setup.cfg +0 -0
  92. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_base.py +0 -0
  93. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_collection.py +0 -0
  94. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_data_gen_flag.py +0 -0
  95. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_doc_base.py +0 -0
  96. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_doc_legacy_convert.py +0 -0
  97. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_doc_schema.py +0 -0
  98. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_doc_schema_extractor.py +0 -0
  99. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_doctags_load.py +0 -0
  100. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_hierarchical_chunker.py +0 -0
  101. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_hybrid_chunker.py +0 -0
  102. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_json_schema_to_search_mapper.py +0 -0
  103. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_nlp_qa.py +0 -0
  104. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_otsl_table_export.py +0 -0
  105. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_page.py +0 -0
  106. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_page_chunker.py +0 -0
  107. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_rec_schema.py +0 -0
  108. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_search_meta.py +0 -0
  109. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_utils.py +0 -0
  110. {docling_core-2.47.0 → docling_core-2.48.1}/test/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.47.0
3
+ Version: 2.48.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -145,7 +145,7 @@ class TripletTableSerializer(BaseTableSerializer):
145
145
  parts.append(cap_res)
146
146
 
147
147
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
148
- table_df = item.export_to_dataframe()
148
+ table_df = item.export_to_dataframe(doc)
149
149
  if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
150
150
 
151
151
  # copy header as first row and shift all rows by one
@@ -394,6 +394,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
394
394
  item=item,
395
395
  doc_serializer=self,
396
396
  doc=self.doc,
397
+ visited=my_visited,
397
398
  **my_kwargs,
398
399
  )
399
400
  return part
@@ -32,6 +32,7 @@ from docling_core.types.doc.document import (
32
32
  DoclingDocument,
33
33
  FloatingItem,
34
34
  FormItem,
35
+ GroupItem,
35
36
  InlineGroup,
36
37
  KeyValueItem,
37
38
  ListGroup,
@@ -42,6 +43,7 @@ from docling_core.types.doc.document import (
42
43
  PictureMoleculeData,
43
44
  PictureTabularChartData,
44
45
  ProvenanceItem,
46
+ SectionHeaderItem,
45
47
  TableItem,
46
48
  TextItem,
47
49
  )
@@ -94,11 +96,11 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
94
96
  item: TextItem,
95
97
  doc_serializer: BaseDocSerializer,
96
98
  doc: DoclingDocument,
99
+ visited: Optional[set[str]] = None,
97
100
  **kwargs: Any,
98
101
  ) -> SerializationResult:
99
102
  """Serializes the passed item."""
100
- from docling_core.types.doc.document import SectionHeaderItem
101
-
103
+ my_visited = visited if visited is not None else set()
102
104
  params = DocTagsParams(**kwargs)
103
105
  wrap_tag: Optional[str] = DocumentToken.create_token_name_from_doc_item_label(
104
106
  label=item.label,
@@ -116,12 +118,21 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
116
118
  parts.append(location)
117
119
 
118
120
  if params.add_content:
119
- text_part = item.text
120
- text_part = doc_serializer.post_process(
121
- text=text_part,
122
- formatting=item.formatting,
123
- hyperlink=item.hyperlink,
124
- )
121
+ if (
122
+ item.text == ""
123
+ and len(item.children) == 1
124
+ and isinstance(
125
+ (child_group := item.children[0].resolve(doc)), InlineGroup
126
+ )
127
+ ):
128
+ ser_res = doc_serializer.serialize(item=child_group, visited=my_visited)
129
+ text_part = ser_res.text
130
+ else:
131
+ text_part = doc_serializer.post_process(
132
+ text=item.text,
133
+ formatting=item.formatting,
134
+ hyperlink=item.hyperlink,
135
+ )
125
136
 
126
137
  if isinstance(item, CodeItem):
127
138
  language_token = DocumentToken.get_code_language_token(
@@ -506,7 +517,12 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
506
517
  **kwargs: Any,
507
518
  ) -> SerializationResult:
508
519
  """Serializes the passed item."""
509
- return create_ser_result()
520
+ if isinstance(item, GroupItem):
521
+ parts = doc_serializer.get_parts(item=item, **kwargs)
522
+ text_res = "\n".join([p.text for p in parts if p.text])
523
+ return create_ser_result(text=text_res, span_source=parts)
524
+ else:
525
+ return create_ser_result()
510
526
 
511
527
 
512
528
  class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
55
55
  FormItem,
56
56
  FormulaItem,
57
57
  GraphData,
58
+ GroupItem,
58
59
  ImageRef,
59
60
  InlineGroup,
60
61
  KeyValueItem,
@@ -139,21 +140,34 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
139
140
  res_parts: list[SerializationResult] = []
140
141
  post_processed = False
141
142
 
142
- # Prepare the HTML based on item type
143
- if isinstance(item, TitleItem):
144
- text_inner = self._prepare_content(item.text)
145
- text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
143
+ has_inline_repr = (
144
+ item.text == ""
145
+ and len(item.children) == 1
146
+ and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
147
+ )
148
+ if has_inline_repr:
149
+ text = doc_serializer.serialize(item=child_group, visited=my_visited).text
150
+ post_processed = True
151
+ else:
152
+ text = item.text
153
+ if not isinstance(item, (CodeItem, FormulaItem)):
154
+ text = html.escape(text, quote=False)
155
+ text = text.replace("\n", "<br>")
146
156
 
147
- elif isinstance(item, SectionHeaderItem):
148
- section_level = min(item.level + 1, 6)
149
- text_inner = self._prepare_content(item.text)
157
+ # Prepare the HTML based on item type
158
+ if isinstance(item, (TitleItem, SectionHeaderItem)):
159
+ section_level = (
160
+ min(item.level + 1, 6) if isinstance(item, SectionHeaderItem) else 1
161
+ )
150
162
  text = get_html_tag_with_text_direction(
151
- html_tag=f"h{section_level}", text=text_inner
163
+ html_tag=f"h{section_level}", text=text
152
164
  )
153
165
 
154
166
  elif isinstance(item, FormulaItem):
155
167
  text = self._process_formula(
156
168
  item=item,
169
+ text=text,
170
+ orig=item.orig,
157
171
  doc=doc,
158
172
  image_mode=params.image_mode,
159
173
  formula_to_mathml=params.formula_to_mathml,
@@ -161,19 +175,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
161
175
  )
162
176
 
163
177
  elif isinstance(item, CodeItem):
164
- text = self._process_code(item=item, is_inline_scope=is_inline_scope)
178
+ text = (
179
+ f"<code>{text}</code>"
180
+ if is_inline_scope
181
+ else f"<pre><code>{text}</code></pre>"
182
+ )
165
183
 
166
184
  elif isinstance(item, ListItem):
167
185
  # List items are handled by list serializer
168
186
  text_parts: list[str] = []
169
- if item_text := self._prepare_content(item.text):
170
- item_text = doc_serializer.post_process(
171
- text=item_text,
172
- formatting=item.formatting,
173
- hyperlink=item.hyperlink,
174
- )
175
- post_processed = True
176
- text_parts.append(item_text)
187
+ if text:
188
+ if has_inline_repr:
189
+ text = f"\n{text}\n"
190
+ else:
191
+ text = doc_serializer.post_process(
192
+ text=text,
193
+ formatting=item.formatting,
194
+ hyperlink=item.hyperlink,
195
+ )
196
+ post_processed = True
197
+ text_parts.append(text)
177
198
  nested_parts = [
178
199
  r.text
179
200
  for r in doc_serializer.get_parts(
@@ -184,29 +205,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
184
205
  )
185
206
  ]
186
207
  text_parts.extend(nested_parts)
187
- text_inner = "\n".join(text_parts)
208
+ text = "\n".join(text_parts)
188
209
  if nested_parts:
189
- text_inner = f"\n{text_inner}\n"
210
+ text = f"\n{text}\n"
190
211
  text = (
191
212
  get_html_tag_with_text_direction(
192
213
  html_tag="li",
193
- text=text_inner,
214
+ text=text,
194
215
  attrs=(
195
216
  {"style": f"list-style-type: '{item.marker} ';"}
196
217
  if params.show_original_list_item_marker and item.marker
197
218
  else {}
198
219
  ),
199
220
  )
200
- if text_inner
221
+ if text
201
222
  else ""
202
223
  )
203
224
 
204
- elif is_inline_scope:
205
- text = self._prepare_content(item.text)
206
- else:
225
+ elif not is_inline_scope:
207
226
  # Regular text item
208
- text_inner = self._prepare_content(item.text)
209
- text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
227
+ text = get_html_tag_with_text_direction(html_tag="p", text=text)
210
228
 
211
229
  # Apply formatting and hyperlinks
212
230
  if not post_processed:
@@ -227,66 +245,44 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
227
245
 
228
246
  return create_ser_result(text=text, span_source=res_parts)
229
247
 
230
- def _prepare_content(
231
- self, text: str, do_escape_html=True, do_replace_newline=True
232
- ) -> str:
233
- """Prepare text content for HTML inclusion."""
234
- if do_escape_html:
235
- text = html.escape(text, quote=False)
236
- if do_replace_newline:
237
- text = text.replace("\n", "<br>")
238
- return text
239
-
240
- def _process_code(
241
- self,
242
- item: CodeItem,
243
- is_inline_scope: bool,
244
- ) -> str:
245
- code_text = self._prepare_content(
246
- item.text, do_escape_html=False, do_replace_newline=False
247
- )
248
- if is_inline_scope:
249
- text = f"<code>{code_text}</code>"
250
- else:
251
- text = f"<pre><code>{code_text}</code></pre>"
252
-
253
- return text
254
-
255
248
  def _process_formula(
256
249
  self,
257
- item: FormulaItem,
250
+ *,
251
+ item: DocItem,
252
+ text: str,
253
+ orig: str,
258
254
  doc: DoclingDocument,
259
255
  image_mode: ImageRefMode,
260
256
  formula_to_mathml: bool,
261
257
  is_inline_scope: bool,
262
258
  ) -> str:
263
259
  """Process a formula item to HTML/MathML."""
264
- math_formula = self._prepare_content(
265
- item.text, do_escape_html=False, do_replace_newline=False
266
- )
267
-
268
260
  # If formula is empty, try to use an image fallback
269
- if item.text == "" and item.orig != "":
270
- img_fallback = self._get_formula_image_fallback(item, doc)
271
- if (
272
- image_mode == ImageRefMode.EMBEDDED
273
- and len(item.prov) > 0
274
- and img_fallback
275
- ):
276
- return img_fallback
261
+ if (
262
+ text == ""
263
+ and orig != ""
264
+ and len(item.prov) > 0
265
+ and image_mode == ImageRefMode.EMBEDDED
266
+ and (
267
+ img_fallback := self._get_formula_image_fallback(
268
+ item=item, orig=orig, doc=doc
269
+ )
270
+ )
271
+ ):
272
+ return img_fallback
277
273
 
278
274
  # Try to generate MathML
279
- if formula_to_mathml and math_formula:
275
+ elif formula_to_mathml and text:
280
276
  try:
281
277
  # Set display mode based on context
282
278
  display_mode = "inline" if is_inline_scope else "block"
283
279
  mathml_element = latex2mathml.converter.convert_to_element(
284
- math_formula, display=display_mode
280
+ text, display=display_mode
285
281
  )
286
282
  annotation = SubElement(
287
283
  mathml_element, "annotation", dict(encoding="TeX")
288
284
  )
289
- annotation.text = math_formula
285
+ annotation.text = text
290
286
  mathml = unescape(tostring(mathml_element, encoding="unicode"))
291
287
 
292
288
  # Don't wrap in div for inline formulas
@@ -296,40 +292,40 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
296
292
  return f"<div>{mathml}</div>"
297
293
 
298
294
  except Exception:
299
- img_fallback = self._get_formula_image_fallback(item, doc)
295
+ img_fallback = self._get_formula_image_fallback(
296
+ item=item, orig=orig, doc=doc
297
+ )
300
298
  if (
301
299
  image_mode == ImageRefMode.EMBEDDED
302
300
  and len(item.prov) > 0
303
301
  and img_fallback
304
302
  ):
305
303
  return img_fallback
306
- elif math_formula:
307
- return f"<pre>{math_formula}</pre>"
304
+ elif text:
305
+ return f"<pre>{text}</pre>"
308
306
  else:
309
307
  return "<pre>Formula not decoded</pre>"
310
308
 
311
309
  _logger.warning("Could not parse formula with MathML")
312
310
 
313
311
  # Fallback options if we got here
314
- if math_formula and is_inline_scope:
315
- return f"<code>{math_formula}</code>"
316
- elif math_formula and (not is_inline_scope):
317
- f"<pre>{math_formula}</pre>"
312
+ if text and is_inline_scope:
313
+ return f"<code>{text}</code>"
314
+ elif text and (not is_inline_scope):
315
+ f"<pre>{text}</pre>"
318
316
  elif is_inline_scope:
319
317
  return '<span class="formula-not-decoded">Formula not decoded</span>'
320
318
 
321
319
  return '<div class="formula-not-decoded">Formula not decoded</div>'
322
320
 
323
321
  def _get_formula_image_fallback(
324
- self, item: TextItem, doc: DoclingDocument
322
+ self, *, item: DocItem, orig: str, doc: DoclingDocument
325
323
  ) -> Optional[str]:
326
324
  """Try to get an image fallback for a formula."""
327
325
  item_image = item.get_image(doc=doc)
328
326
  if item_image is not None:
329
327
  img_ref = ImageRef.from_pil(item_image, dpi=72)
330
- return (
331
- "<figure>" f'<img src="{img_ref.uri}" alt="{item.orig}" />' "</figure>"
332
- )
328
+ return "<figure>" f'<img src="{img_ref.uri}" alt="{orig}" />' "</figure>"
333
329
  return None
334
330
 
335
331
 
@@ -792,21 +788,30 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
792
788
  """HTML-specific fallback serializer."""
793
789
 
794
790
  @override
795
- def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
791
+ def serialize(
792
+ self,
793
+ *,
794
+ item: NodeItem,
795
+ doc_serializer: "BaseDocSerializer",
796
+ doc: DoclingDocument,
797
+ **kwargs: Any,
798
+ ) -> SerializationResult:
796
799
  """Fallback serializer for items not handled by other serializers."""
797
- if isinstance(item, DocItem):
800
+ if isinstance(item, GroupItem):
801
+ parts = doc_serializer.get_parts(item=item, **kwargs)
802
+ text_res = "\n".join([p.text for p in parts if p.text])
803
+ return create_ser_result(text=text_res, span_source=parts)
804
+ else:
798
805
  return create_ser_result(
799
806
  text=f"<!-- Unhandled item type: {item.__class__.__name__} -->",
800
- span_source=item,
807
+ span_source=item if isinstance(item, DocItem) else [],
801
808
  )
802
- else:
803
- # For group items, we don't generate any markup
804
- return create_ser_result()
805
809
 
806
810
 
807
811
  class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
808
812
  """HTML-specific annotation serializer."""
809
813
 
814
+ @override
810
815
  def serialize(
811
816
  self,
812
817
  *,
@@ -45,6 +45,7 @@ from docling_core.types.doc.document import (
45
45
  Formatting,
46
46
  FormItem,
47
47
  FormulaItem,
48
+ GroupItem,
48
49
  ImageRef,
49
50
  InlineGroup,
50
51
  KeyValueItem,
@@ -124,26 +125,24 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
124
125
  my_visited = visited if visited is not None else set()
125
126
  params = MarkdownParams(**kwargs)
126
127
  res_parts: list[SerializationResult] = []
127
- text = item.text
128
128
  escape_html = True
129
129
  escape_underscores = True
130
- processing_pending = True
131
- if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
132
- # case where processing/formatting should be applied first (in inner scope)
130
+
131
+ has_inline_repr = (
132
+ item.text == ""
133
+ and len(item.children) == 1
134
+ and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
135
+ )
136
+ if has_inline_repr:
137
+ text = doc_serializer.serialize(item=child_group, visited=my_visited).text
133
138
  processing_pending = False
134
- if (
135
- text == ""
136
- and len(item.children) == 1
137
- and isinstance(
138
- (child_group := item.children[0].resolve(doc)), InlineGroup
139
- )
140
- ):
141
- # case of inline within heading / list item
142
- ser_res = doc_serializer.serialize(item=child_group)
143
- text = ser_res.text
144
- for span in ser_res.spans:
145
- my_visited.add(span.item.self_ref)
146
- else:
139
+ else:
140
+ text = item.text
141
+ processing_pending = True
142
+
143
+ if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
144
+ if not has_inline_repr:
145
+ # case where processing/formatting should be applied first (in inner scope)
147
146
  text = doc_serializer.post_process(
148
147
  text=text,
149
148
  escape_html=escape_html,
@@ -151,6 +150,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
151
150
  formatting=item.formatting,
152
151
  hyperlink=item.hyperlink,
153
152
  )
153
+ processing_pending = False
154
154
 
155
155
  if isinstance(item, ListItem):
156
156
  pieces: list[str] = []
@@ -332,7 +332,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
332
332
  ]
333
333
  for row in item.data.grid
334
334
  ]
335
- if len(rows) > 1 and len(rows[0]) > 0:
335
+ if len(rows) > 0:
336
336
  try:
337
337
  table_text = tabulate(rows[1:], headers=rows[0], tablefmt="github")
338
338
  except ValueError:
@@ -600,13 +600,15 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
600
600
  **kwargs: Any,
601
601
  ) -> SerializationResult:
602
602
  """Serializes the passed item."""
603
- if isinstance(item, DocItem):
603
+ if isinstance(item, GroupItem):
604
+ parts = doc_serializer.get_parts(item=item, **kwargs)
605
+ text_res = "\n\n".join([p.text for p in parts if p.text])
606
+ return create_ser_result(text=text_res, span_source=parts)
607
+ else:
604
608
  return create_ser_result(
605
609
  text="<!-- missing-text -->",
606
- span_source=item,
610
+ span_source=item if isinstance(item, DocItem) else [],
607
611
  )
608
- else:
609
- return create_ser_result()
610
612
 
611
613
 
612
614
  class MarkdownDocSerializer(DocSerializer):
@@ -60,7 +60,7 @@ _logger = logging.getLogger(__name__)
60
60
 
61
61
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
62
62
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
63
- CURRENT_VERSION: Final = "1.6.0"
63
+ CURRENT_VERSION: Final = "1.7.0"
64
64
 
65
65
  DEFAULT_EXPORT_LABELS = {
66
66
  DocItemLabel.TITLE,
@@ -310,6 +310,7 @@ class TableCell(BaseModel):
310
310
  column_header: bool = False
311
311
  row_header: bool = False
312
312
  row_section: bool = False
313
+ fillable: bool = False
313
314
 
314
315
  @model_validator(mode="before")
315
316
  @classmethod
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.47.0
3
+ Version: 2.48.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling-core"
3
- version = "2.47.0" # DO NOT EDIT, updated automatically
3
+ version = "2.48.1" # DO NOT EDIT, updated automatically
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  license-files = ["LICENSE"]
@@ -734,7 +734,7 @@ def _test_export_methods(
734
734
  for table in doc.tables:
735
735
  table.export_to_markdown()
736
736
  table.export_to_html(doc)
737
- table.export_to_dataframe()
737
+ table.export_to_dataframe(doc)
738
738
  table.export_to_doctags(doc)
739
739
 
740
740
  # Test Images export ...
@@ -2102,7 +2102,7 @@ def _construct_rich_table_doc():
2102
2102
 
2103
2103
  table_item = doc.add_table(
2104
2104
  data=TableData(
2105
- num_rows=4,
2105
+ num_rows=5,
2106
2106
  num_cols=2,
2107
2107
  ),
2108
2108
  )
@@ -2121,6 +2121,17 @@ def _construct_rich_table_doc():
2121
2121
  rich_item_3 = doc.add_table(
2122
2122
  data=TableData(num_rows=2, num_cols=3), parent=table_item
2123
2123
  )
2124
+
2125
+ rich_item_4 = doc.add_group(parent=table_item, label=GroupLabel.UNSPECIFIED)
2126
+ doc.add_text(
2127
+ parent=rich_item_4,
2128
+ text="Some text in a generic group.",
2129
+ label=DocItemLabel.TEXT,
2130
+ )
2131
+ doc.add_text(
2132
+ parent=rich_item_4, text="More text in the group.", label=DocItemLabel.TEXT
2133
+ )
2134
+
2124
2135
  for i in range(rich_item_3.data.num_rows):
2125
2136
  for j in range(rich_item_3.data.num_cols):
2126
2137
  cell = TableCell(
@@ -2158,6 +2169,14 @@ def _construct_rich_table_doc():
2158
2169
  end_col_offset_idx=j + 1,
2159
2170
  ref=rich_item_3.get_ref(),
2160
2171
  )
2172
+ elif i == 4 and j == 0:
2173
+ cell = RichTableCell(
2174
+ start_row_offset_idx=i,
2175
+ end_row_offset_idx=i + 1,
2176
+ start_col_offset_idx=j,
2177
+ end_col_offset_idx=j + 1,
2178
+ ref=rich_item_4.get_ref(),
2179
+ )
2161
2180
  else:
2162
2181
  cell = TableCell(
2163
2182
  start_row_offset_idx=i,
@@ -25,7 +25,13 @@ from docling_core.transforms.serializer.markdown import (
25
25
  )
26
26
  from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
27
27
  from docling_core.types.doc.base import ImageRefMode
28
- from docling_core.types.doc.document import DoclingDocument, MiscAnnotation, TableItem
28
+ from docling_core.types.doc.document import (
29
+ DoclingDocument,
30
+ MiscAnnotation,
31
+ TableCell,
32
+ TableData,
33
+ TableItem,
34
+ )
29
35
  from docling_core.types.doc.labels import DocItemLabel
30
36
 
31
37
  from .test_data_gen_flag import GEN_TEST_DATA
@@ -85,6 +91,11 @@ def verify(exp_file: Path, actual: str):
85
91
  assert expected == actual
86
92
 
87
93
 
94
+ # ===============================
95
+ # Markdown tests
96
+ # ===============================
97
+
98
+
88
99
  def test_md_cross_page_list_page_break():
89
100
  src = Path("./test/data/doc/activities.json")
90
101
  doc = DoclingDocument.load_from_json(src)
@@ -99,7 +110,7 @@ def test_md_cross_page_list_page_break():
99
110
  ),
100
111
  )
101
112
  actual = ser.serialize().text
102
- verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
113
+ verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
103
114
 
104
115
 
105
116
  def test_md_cross_page_list_page_break_none():
@@ -170,20 +181,6 @@ def test_md_cross_page_list_page_break_p2():
170
181
  verify(exp_file=src.parent / f"{src.stem}_p2.gt.md", actual=actual)
171
182
 
172
183
 
173
- def test_html_charts():
174
- src = Path("./test/data/doc/barchart.json")
175
- doc = DoclingDocument.load_from_json(src)
176
-
177
- ser = HTMLDocSerializer(
178
- doc=doc,
179
- params=HTMLParams(
180
- image_mode=ImageRefMode.PLACEHOLDER,
181
- ),
182
- )
183
- actual = ser.serialize().text
184
- verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
185
-
186
-
187
184
  def test_md_charts():
188
185
  src = Path("./test/data/doc/barchart.json")
189
186
  doc = DoclingDocument.load_from_json(src)
@@ -195,7 +192,7 @@ def test_md_charts():
195
192
  ),
196
193
  )
197
194
  actual = ser.serialize().text
198
- verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
195
+ verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
199
196
 
200
197
 
201
198
  def test_md_inline_and_formatting():
@@ -209,51 +206,7 @@ def test_md_inline_and_formatting():
209
206
  ),
210
207
  )
211
208
  actual = ser.serialize().text
212
- verify(exp_file=src.parent / f"{src.stem}.md", actual=actual)
213
-
214
-
215
- def test_html_cross_page_list_page_break():
216
- src = Path("./test/data/doc/activities.json")
217
- doc = DoclingDocument.load_from_json(src)
218
-
219
- ser = HTMLDocSerializer(
220
- doc=doc,
221
- params=HTMLParams(
222
- image_mode=ImageRefMode.PLACEHOLDER,
223
- ),
224
- )
225
- actual = ser.serialize().text
226
- verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
227
-
228
-
229
- def test_html_cross_page_list_page_break_p1():
230
- src = Path("./test/data/doc/activities.json")
231
- doc = DoclingDocument.load_from_json(src)
232
-
233
- ser = HTMLDocSerializer(
234
- doc=doc,
235
- params=HTMLParams(
236
- image_mode=ImageRefMode.PLACEHOLDER,
237
- pages={1},
238
- ),
239
- )
240
- actual = ser.serialize().text
241
- verify(exp_file=src.parent / f"{src.stem}_p1.gt.html", actual=actual)
242
-
243
-
244
- def test_html_cross_page_list_page_break_p2():
245
- src = Path("./test/data/doc/activities.json")
246
- doc = DoclingDocument.load_from_json(src)
247
-
248
- ser = HTMLDocSerializer(
249
- doc=doc,
250
- params=HTMLParams(
251
- image_mode=ImageRefMode.PLACEHOLDER,
252
- pages={2},
253
- ),
254
- )
255
- actual = ser.serialize().text
256
- verify(exp_file=src.parent / f"{src.stem}_p2.gt.html", actual=actual)
209
+ verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
257
210
 
258
211
 
259
212
  def test_md_pb_placeholder_and_page_filter():
@@ -269,7 +222,7 @@ def test_md_pb_placeholder_and_page_filter():
269
222
  ),
270
223
  )
271
224
  actual = ser.serialize().text
272
- verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
225
+ verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
273
226
 
274
227
 
275
228
  def test_md_list_item_markers():
@@ -358,7 +311,7 @@ def test_md_nested_lists():
358
311
 
359
312
  ser = MarkdownDocSerializer(doc=doc)
360
313
  actual = ser.serialize().text
361
- verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
314
+ verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
362
315
 
363
316
 
364
317
  def test_md_rich_table():
@@ -370,6 +323,92 @@ def test_md_rich_table():
370
323
  verify(exp_file=exp_file, actual=actual)
371
324
 
372
325
 
326
+ def test_md_single_row_table():
327
+ exp_file = Path("./test/data/doc/single_row_table.gt.md")
328
+ words = ["foo", "bar"]
329
+ doc = DoclingDocument(name="")
330
+ row_idx = 0
331
+ table = doc.add_table(data=TableData(num_rows=1, num_cols=len(words)))
332
+ for col_idx, word in enumerate(words):
333
+ doc.add_table_cell(
334
+ table_item=table,
335
+ cell=TableCell(
336
+ start_row_offset_idx=row_idx,
337
+ end_row_offset_idx=row_idx + 1,
338
+ start_col_offset_idx=col_idx,
339
+ end_col_offset_idx=col_idx + 1,
340
+ text=word,
341
+ ),
342
+ )
343
+
344
+ ser = MarkdownDocSerializer(doc=doc)
345
+ actual = ser.serialize().text
346
+ verify(exp_file=exp_file, actual=actual)
347
+
348
+
349
+ # ===============================
350
+ # HTML tests
351
+ # ===============================
352
+
353
+
354
+ def test_html_charts():
355
+ src = Path("./test/data/doc/barchart.json")
356
+ doc = DoclingDocument.load_from_json(src)
357
+
358
+ ser = HTMLDocSerializer(
359
+ doc=doc,
360
+ params=HTMLParams(
361
+ image_mode=ImageRefMode.PLACEHOLDER,
362
+ ),
363
+ )
364
+ actual = ser.serialize().text
365
+ verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
366
+
367
+
368
+ def test_html_cross_page_list_page_break():
369
+ src = Path("./test/data/doc/activities.json")
370
+ doc = DoclingDocument.load_from_json(src)
371
+
372
+ ser = HTMLDocSerializer(
373
+ doc=doc,
374
+ params=HTMLParams(
375
+ image_mode=ImageRefMode.PLACEHOLDER,
376
+ ),
377
+ )
378
+ actual = ser.serialize().text
379
+ verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
380
+
381
+
382
+ def test_html_cross_page_list_page_break_p1():
383
+ src = Path("./test/data/doc/activities.json")
384
+ doc = DoclingDocument.load_from_json(src)
385
+
386
+ ser = HTMLDocSerializer(
387
+ doc=doc,
388
+ params=HTMLParams(
389
+ image_mode=ImageRefMode.PLACEHOLDER,
390
+ pages={1},
391
+ ),
392
+ )
393
+ actual = ser.serialize().text
394
+ verify(exp_file=src.parent / f"{src.stem}_p1.gt.html", actual=actual)
395
+
396
+
397
+ def test_html_cross_page_list_page_break_p2():
398
+ src = Path("./test/data/doc/activities.json")
399
+ doc = DoclingDocument.load_from_json(src)
400
+
401
+ ser = HTMLDocSerializer(
402
+ doc=doc,
403
+ params=HTMLParams(
404
+ image_mode=ImageRefMode.PLACEHOLDER,
405
+ pages={2},
406
+ ),
407
+ )
408
+ actual = ser.serialize().text
409
+ verify(exp_file=src.parent / f"{src.stem}_p2.gt.html", actual=actual)
410
+
411
+
373
412
  def test_html_split_page():
374
413
  src = Path("./test/data/doc/2408.09869v3_enriched.json")
375
414
  doc = DoclingDocument.load_from_json(src)
@@ -506,7 +545,7 @@ def test_html_nested_lists():
506
545
 
507
546
  ser = HTMLDocSerializer(doc=doc)
508
547
  actual = ser.serialize().text
509
- verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
548
+ verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
510
549
 
511
550
 
512
551
  def test_html_rich_table():
@@ -518,13 +557,27 @@ def test_html_rich_table():
518
557
  verify(exp_file=exp_file, actual=actual)
519
558
 
520
559
 
560
+ def test_html_inline_and_formatting():
561
+ src = Path("./test/data/doc/inline_and_formatting.yaml")
562
+ doc = DoclingDocument.load_from_yaml(src)
563
+
564
+ ser = HTMLDocSerializer(doc=doc)
565
+ actual = ser.serialize().text
566
+ verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
567
+
568
+
569
+ # ===============================
570
+ # DocTags tests
571
+ # ===============================
572
+
573
+
521
574
  def test_doctags_inline_loc_tags():
522
575
  src = Path("./test/data/doc/2408.09869v3_enriched.json")
523
576
  doc = DoclingDocument.load_from_json(src)
524
577
 
525
578
  ser = DocTagsDocSerializer(doc=doc)
526
579
  actual = ser.serialize().text
527
- verify(exp_file=src.parent / f"{src.stem}.out.dt", actual=actual)
580
+ verify(exp_file=src.with_suffix(".out.dt"), actual=actual)
528
581
 
529
582
 
530
583
  def test_doctags_rich_table():
@@ -535,3 +588,12 @@ def test_doctags_rich_table():
535
588
  ser = DocTagsDocSerializer(doc=doc)
536
589
  actual = ser.serialize().text
537
590
  verify(exp_file=exp_file, actual=actual)
591
+
592
+
593
+ def test_doctags_inline_and_formatting():
594
+ src = Path("./test/data/doc/inline_and_formatting.yaml")
595
+ doc = DoclingDocument.load_from_yaml(src)
596
+
597
+ ser = DocTagsDocSerializer(doc=doc)
598
+ actual = ser.serialize().text
599
+ verify(exp_file=src.with_suffix(".gt.dt"), actual=actual)
File without changes
File without changes
File without changes