docling-core 2.44.1__tar.gz → 2.44.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (110)
  1. {docling_core-2.44.1 → docling_core-2.44.2}/PKG-INFO +1 -1
  2. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/serializer/html.py +32 -72
  3. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core.egg-info/PKG-INFO +1 -1
  4. {docling_core-2.44.1 → docling_core-2.44.2}/pyproject.toml +1 -1
  5. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_serialization.py +18 -0
  6. {docling_core-2.44.1 → docling_core-2.44.2}/LICENSE +0 -0
  7. {docling_core-2.44.1 → docling_core-2.44.2}/README.md +0 -0
  8. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/__init__.py +0 -0
  9. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/cli/__init__.py +0 -0
  10. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/cli/view.py +0 -0
  11. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/experimental/__init__.py +0 -0
  12. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/py.typed +0 -0
  13. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
  14. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
  15. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  16. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
  17. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  18. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  19. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  20. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  21. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/search/__init__.py +0 -0
  22. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  23. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/search/mapping.py +0 -0
  24. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/search/meta.py +0 -0
  25. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/search/package.py +0 -0
  26. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/__init__.py +0 -0
  27. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/chunker/__init__.py +0 -0
  28. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/chunker/base.py +0 -0
  29. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  30. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  31. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/chunker/page_chunker.py +0 -0
  32. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  33. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  34. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  35. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  36. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/serializer/__init__.py +0 -0
  37. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/serializer/base.py +0 -0
  38. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/serializer/common.py +0 -0
  39. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/serializer/doctags.py +0 -0
  40. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/serializer/html_styles.py +0 -0
  41. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/serializer/markdown.py +0 -0
  42. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/visualizer/__init__.py +0 -0
  43. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/visualizer/base.py +0 -0
  44. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
  45. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  46. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  47. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
  48. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/__init__.py +0 -0
  49. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/base.py +0 -0
  50. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/doc/__init__.py +0 -0
  51. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/doc/base.py +0 -0
  52. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/doc/document.py +0 -0
  53. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/doc/labels.py +0 -0
  54. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/doc/page.py +0 -0
  55. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/doc/tokens.py +0 -0
  56. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/doc/utils.py +0 -0
  57. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/gen/__init__.py +0 -0
  58. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/gen/generic.py +0 -0
  59. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/io/__init__.py +0 -0
  60. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/legacy_doc/__init__.py +0 -0
  61. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/legacy_doc/base.py +0 -0
  62. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  63. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  64. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  65. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/legacy_doc/document.py +0 -0
  66. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/legacy_doc/tokens.py +0 -0
  67. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/nlp/__init__.py +0 -0
  68. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/nlp/qa.py +0 -0
  69. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/nlp/qa_labels.py +0 -0
  70. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/rec/__init__.py +0 -0
  71. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/rec/attribute.py +0 -0
  72. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/rec/base.py +0 -0
  73. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/rec/predicate.py +0 -0
  74. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/rec/record.py +0 -0
  75. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/rec/statement.py +0 -0
  76. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/types/rec/subject.py +0 -0
  77. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/utils/__init__.py +0 -0
  78. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/utils/alias.py +0 -0
  79. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/utils/file.py +0 -0
  80. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/utils/generate_docs.py +0 -0
  81. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/utils/generate_jsonschema.py +0 -0
  82. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/utils/legacy.py +0 -0
  83. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/utils/validate.py +0 -0
  84. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core/utils/validators.py +0 -0
  85. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core.egg-info/SOURCES.txt +0 -0
  86. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core.egg-info/dependency_links.txt +0 -0
  87. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core.egg-info/entry_points.txt +0 -0
  88. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core.egg-info/requires.txt +0 -0
  89. {docling_core-2.44.1 → docling_core-2.44.2}/docling_core.egg-info/top_level.txt +0 -0
  90. {docling_core-2.44.1 → docling_core-2.44.2}/setup.cfg +0 -0
  91. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_base.py +0 -0
  92. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_collection.py +0 -0
  93. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_data_gen_flag.py +0 -0
  94. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_doc_base.py +0 -0
  95. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_doc_legacy_convert.py +0 -0
  96. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_doc_schema.py +0 -0
  97. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_doc_schema_extractor.py +0 -0
  98. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_docling_doc.py +0 -0
  99. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_doctags_load.py +0 -0
  100. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_hierarchical_chunker.py +0 -0
  101. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_hybrid_chunker.py +0 -0
  102. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_json_schema_to_search_mapper.py +0 -0
  103. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_nlp_qa.py +0 -0
  104. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_otsl_table_export.py +0 -0
  105. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_page.py +0 -0
  106. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_page_chunker.py +0 -0
  107. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_rec_schema.py +0 -0
  108. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_search_meta.py +0 -0
  109. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_utils.py +0 -0
  110. {docling_core-2.44.1 → docling_core-2.44.2}/test/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.44.1
3
+ Version: 2.44.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -130,11 +130,14 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
130
130
  doc_serializer: BaseDocSerializer,
131
131
  doc: DoclingDocument,
132
132
  is_inline_scope: bool = False,
133
+ visited: Optional[set[str]] = None,
133
134
  **kwargs: Any,
134
135
  ) -> SerializationResult:
135
136
  """Serializes the passed text item to HTML."""
136
137
  params = HTMLParams(**kwargs)
138
+ my_visited: set[str] = visited if visited is not None else set()
137
139
  res_parts: list[SerializationResult] = []
140
+ post_processed = False
138
141
 
139
142
  # Prepare the HTML based on item type
140
143
  if isinstance(item, TitleItem):
@@ -162,7 +165,28 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
162
165
 
163
166
  elif isinstance(item, ListItem):
164
167
  # List items are handled by list serializer
165
- text_inner = self._prepare_content(item.text)
168
+ text_parts: list[str] = []
169
+ if item_text := self._prepare_content(item.text):
170
+ item_text = doc_serializer.post_process(
171
+ text=item_text,
172
+ formatting=item.formatting,
173
+ hyperlink=item.hyperlink,
174
+ )
175
+ post_processed = True
176
+ text_parts.append(item_text)
177
+ nested_parts = [
178
+ r.text
179
+ for r in doc_serializer.get_parts(
180
+ item=item,
181
+ is_inline_scope=is_inline_scope,
182
+ visited=my_visited,
183
+ **kwargs,
184
+ )
185
+ ]
186
+ text_parts.extend(nested_parts)
187
+ text_inner = "\n".join(text_parts)
188
+ if nested_parts:
189
+ text_inner = f"\n{text_inner}\n"
166
190
  text = (
167
191
  get_html_tag_with_text_direction(
168
192
  html_tag="li",
@@ -185,11 +209,12 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
185
209
  text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
186
210
 
187
211
  # Apply formatting and hyperlinks
188
- text = doc_serializer.post_process(
189
- text=text,
190
- formatting=item.formatting,
191
- hyperlink=item.hyperlink,
192
- )
212
+ if not post_processed:
213
+ text = doc_serializer.post_process(
214
+ text=text,
215
+ formatting=item.formatting,
216
+ hyperlink=item.hyperlink,
217
+ )
193
218
 
194
219
  if text:
195
220
  text_res = create_ser_result(text=text, span_source=item)
@@ -703,7 +728,6 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
703
728
  ) -> SerializationResult:
704
729
  """Serializes a list to HTML."""
705
730
  my_visited: set[str] = visited if visited is not None else set()
706
- params = HTMLParams(**kwargs)
707
731
  # Get all child parts
708
732
  parts = doc_serializer.get_parts(
709
733
  item=item,
@@ -713,72 +737,8 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
713
737
  **kwargs,
714
738
  )
715
739
 
716
- # Append nested list to parent list item:
717
- i = 0
718
- while i < len(parts):
719
- prt = parts[i]
720
- if prt.text.startswith(("<ul>", "<ol>")):
721
- for j in range(i - 1, -1, -1):
722
- if parts[j].text.startswith(("<li>", "<li ")) and parts[
723
- j
724
- ].text.endswith("</li>"):
725
- before, _, _ = parts[j].text.rpartition("</li>")
726
- parts[j].text = f"{before}\n{prt.text}\n</li>"
727
- break
728
- if j > -1:
729
- parts.pop(i)
730
- else:
731
- i += 1
732
-
733
740
  # Add all child parts
734
- text_res = "\n".join(
735
- [
736
- (
737
- p.text
738
- if (
739
- (
740
- p.text.startswith(("<li>", "<li "))
741
- and p.text.endswith("</li>")
742
- )
743
- or (
744
- p.text.startswith(("<ol>", "<ol "))
745
- and p.text.endswith("</ol>")
746
- )
747
- or (
748
- p.text.startswith(("<ul>", "<ul "))
749
- and p.text.endswith("</ul>")
750
- )
751
- )
752
- else (
753
- get_html_tag_with_text_direction(
754
- html_tag="li",
755
- text=p.text,
756
- attrs=(
757
- {
758
- "style": f"list-style-type: '{grandparent_item.marker} ';"
759
- }
760
- if params.show_original_list_item_marker
761
- and grandparent_item.marker
762
- else {}
763
- ),
764
- )
765
- if p.spans
766
- and p.spans[0].item.parent
767
- and isinstance(
768
- (parent_item := p.spans[0].item.parent.resolve(doc)),
769
- InlineGroup,
770
- )
771
- and parent_item.parent
772
- and isinstance(
773
- (grandparent_item := parent_item.parent.resolve(doc)),
774
- ListItem,
775
- )
776
- else f"<li>{p.text}</li>"
777
- )
778
- )
779
- for p in parts
780
- ]
781
- )
741
+ text_res = "\n".join(p.text for p in parts if p.text)
782
742
  if text_res:
783
743
  tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
784
744
  text_res = f"<{tag}>\n{text_res}\n</{tag}>"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.44.1
3
+ Version: 2.44.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling-core"
3
- version = "2.44.1" # DO NOT EDIT, updated automatically
3
+ version = "2.44.2" # DO NOT EDIT, updated automatically
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  license-files = ["LICENSE"]
@@ -352,6 +352,15 @@ def test_md_mark_annotations_true():
352
352
  )
353
353
 
354
354
 
355
+ def test_md_nested_lists():
356
+ src = Path("./test/data/doc/polymers.json")
357
+ doc = DoclingDocument.load_from_json(src)
358
+
359
+ ser = MarkdownDocSerializer(doc=doc)
360
+ actual = ser.serialize().text
361
+ verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
362
+
363
+
355
364
  def test_html_split_page():
356
365
  src = Path("./test/data/doc/2408.09869v3_enriched.json")
357
366
  doc = DoclingDocument.load_from_json(src)
@@ -482,6 +491,15 @@ def test_html_list_item_markers():
482
491
  )
483
492
 
484
493
 
494
+ def test_html_nested_lists():
495
+ src = Path("./test/data/doc/polymers.json")
496
+ doc = DoclingDocument.load_from_json(src)
497
+
498
+ ser = HTMLDocSerializer(doc=doc)
499
+ actual = ser.serialize().text
500
+ verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
501
+
502
+
485
503
  def test_doctags_inline_loc_tags():
486
504
  src = Path("./test/data/doc/2408.09869v3_enriched.json")
487
505
  doc = DoclingDocument.load_from_json(src)
File without changes
File without changes
File without changes