docling-core 2.38.2__tar.gz → 2.40.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (108)
  1. {docling_core-2.38.2 → docling_core-2.40.0}/PKG-INFO +1 -1
  2. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/hierarchical_chunker.py +2 -3
  3. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/base.py +2 -3
  4. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/common.py +3 -4
  5. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/doctags.py +4 -5
  6. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/html.py +57 -10
  7. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/markdown.py +75 -21
  8. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/__init__.py +1 -0
  9. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/document.py +78 -65
  10. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/labels.py +1 -1
  11. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/page.py +3 -2
  12. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/utils.py +18 -7
  13. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/file.py +27 -0
  14. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/legacy.py +1 -2
  15. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/PKG-INFO +1 -1
  16. {docling_core-2.38.2 → docling_core-2.40.0}/pyproject.toml +1 -1
  17. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_doc_legacy_convert.py +7 -7
  18. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_docling_doc.py +51 -45
  19. docling_core-2.40.0/test/test_page.py +214 -0
  20. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_serialization.py +41 -0
  21. docling_core-2.38.2/test/test_page.py +0 -79
  22. {docling_core-2.38.2 → docling_core-2.40.0}/LICENSE +0 -0
  23. {docling_core-2.38.2 → docling_core-2.40.0}/README.md +0 -0
  24. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/__init__.py +0 -0
  25. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/cli/__init__.py +0 -0
  26. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/cli/view.py +0 -0
  27. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/experimental/__init__.py +0 -0
  28. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/py.typed +0 -0
  29. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  30. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  31. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  32. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  33. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  34. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  35. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  36. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  37. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/search/__init__.py +0 -0
  38. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  39. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/search/mapping.py +0 -0
  40. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/search/meta.py +0 -0
  41. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/search/package.py +0 -0
  42. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/__init__.py +0 -0
  43. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/__init__.py +0 -0
  44. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/base.py +0 -0
  45. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  46. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  47. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  48. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  49. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  50. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/__init__.py +0 -0
  51. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/html_styles.py +0 -0
  52. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  53. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/visualizer/base.py +0 -0
  54. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
  55. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  56. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
  57. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/__init__.py +0 -0
  58. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/base.py +0 -0
  59. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/base.py +0 -0
  60. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/tokens.py +0 -0
  61. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/gen/__init__.py +0 -0
  62. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/gen/generic.py +0 -0
  63. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/io/__init__.py +0 -0
  64. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  65. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/base.py +0 -0
  66. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  67. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  68. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  69. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/document.py +0 -0
  70. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  71. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/nlp/__init__.py +0 -0
  72. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/nlp/qa.py +0 -0
  73. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/nlp/qa_labels.py +0 -0
  74. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/__init__.py +0 -0
  75. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/attribute.py +0 -0
  76. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/base.py +0 -0
  77. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/predicate.py +0 -0
  78. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/record.py +0 -0
  79. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/statement.py +0 -0
  80. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/subject.py +0 -0
  81. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/__init__.py +0 -0
  82. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/alias.py +0 -0
  83. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/generate_docs.py +0 -0
  84. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/generate_jsonschema.py +0 -0
  85. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/validate.py +0 -0
  86. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/validators.py +0 -0
  87. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/SOURCES.txt +0 -0
  88. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/dependency_links.txt +0 -0
  89. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/entry_points.txt +0 -0
  90. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/requires.txt +0 -0
  91. {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/top_level.txt +0 -0
  92. {docling_core-2.38.2 → docling_core-2.40.0}/setup.cfg +0 -0
  93. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_base.py +0 -0
  94. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_collection.py +0 -0
  95. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_data_gen_flag.py +0 -0
  96. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_doc_base.py +0 -0
  97. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_doc_schema.py +0 -0
  98. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_doc_schema_extractor.py +0 -0
  99. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_doctags_load.py +0 -0
  100. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_hierarchical_chunker.py +0 -0
  101. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_hybrid_chunker.py +0 -0
  102. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_json_schema_to_search_mapper.py +0 -0
  103. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_nlp_qa.py +0 -0
  104. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_otsl_table_export.py +0 -0
  105. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_rec_schema.py +0 -0
  106. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_search_meta.py +0 -0
  107. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_utils.py +0 -0
  108. {docling_core-2.38.2 → docling_core-2.40.0}/test/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.38.2
3
+ Version: 2.40.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -35,11 +35,10 @@ from docling_core.types.doc.document import (
35
35
  DocumentOrigin,
36
36
  InlineGroup,
37
37
  LevelNumber,
38
- OrderedList,
38
+ ListGroup,
39
39
  SectionHeaderItem,
40
40
  TableItem,
41
41
  TitleItem,
42
- UnorderedList,
43
42
  )
44
43
 
45
44
  _VERSION: Final = "1.0.0"
@@ -240,7 +239,7 @@ class HierarchicalChunker(BaseChunker):
240
239
  heading_by_level.pop(k, None)
241
240
  continue
242
241
  elif (
243
- isinstance(item, (OrderedList, UnorderedList, InlineGroup, DocItem))
242
+ isinstance(item, (ListGroup, InlineGroup, DocItem))
244
243
  and item.self_ref not in visited
245
244
  ):
246
245
  ser_res = my_doc_ser.serialize(item=item, visited=visited)
@@ -17,12 +17,11 @@ from docling_core.types.doc.document import (
17
17
  FormItem,
18
18
  InlineGroup,
19
19
  KeyValueItem,
20
+ ListGroup,
20
21
  NodeItem,
21
- OrderedList,
22
22
  PictureItem,
23
23
  TableItem,
24
24
  TextItem,
25
- UnorderedList,
26
25
  )
27
26
 
28
27
 
@@ -128,7 +127,7 @@ class BaseListSerializer(ABC):
128
127
  def serialize(
129
128
  self,
130
129
  *,
131
- item: Union[UnorderedList, OrderedList],
130
+ item: ListGroup,
132
131
  doc_serializer: "BaseDocSerializer",
133
132
  doc: DoclingDocument,
134
133
  **kwargs: Any,
@@ -39,8 +39,8 @@ from docling_core.types.doc.document import (
39
39
  FormItem,
40
40
  InlineGroup,
41
41
  KeyValueItem,
42
+ ListGroup,
42
43
  NodeItem,
43
- OrderedList,
44
44
  PictureClassificationData,
45
45
  PictureDataType,
46
46
  PictureItem,
@@ -49,7 +49,6 @@ from docling_core.types.doc.document import (
49
49
  TableAnnotationType,
50
50
  TableItem,
51
51
  TextItem,
52
- UnorderedList,
53
52
  )
54
53
  from docling_core.types.doc.labels import DocItemLabel
55
54
 
@@ -89,7 +88,7 @@ def _iterate_items(
89
88
  ):
90
89
  if add_page_breaks:
91
90
  if (
92
- isinstance(item, (UnorderedList, OrderedList, InlineGroup))
91
+ isinstance(item, (ListGroup, InlineGroup))
93
92
  and item.self_ref not in my_visited
94
93
  ):
95
94
  # if group starts with new page, yield page break before group node
@@ -316,7 +315,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
316
315
  ########
317
316
  # groups
318
317
  ########
319
- if isinstance(item, (UnorderedList, OrderedList)):
318
+ if isinstance(item, ListGroup):
320
319
  part = self.list_serializer.serialize(
321
320
  item=item,
322
321
  doc_serializer=self,
@@ -1,7 +1,7 @@
1
1
  """Define classes for Doctags serialization."""
2
2
 
3
3
  from enum import Enum
4
- from typing import Any, Dict, List, Optional, Union
4
+ from typing import Any, Dict, List, Optional
5
5
 
6
6
  from pydantic import BaseModel
7
7
  from typing_extensions import override
@@ -34,9 +34,9 @@ from docling_core.types.doc.document import (
34
34
  FormItem,
35
35
  InlineGroup,
36
36
  KeyValueItem,
37
+ ListGroup,
37
38
  ListItem,
38
39
  NodeItem,
39
- OrderedList,
40
40
  PictureClassificationData,
41
41
  PictureItem,
42
42
  PictureMoleculeData,
@@ -44,7 +44,6 @@ from docling_core.types.doc.document import (
44
44
  ProvenanceItem,
45
45
  TableItem,
46
46
  TextItem,
47
- UnorderedList,
48
47
  )
49
48
  from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
50
49
  from docling_core.types.doc.tokens import DocumentToken
@@ -376,7 +375,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
376
375
  def serialize(
377
376
  self,
378
377
  *,
379
- item: Union[UnorderedList, OrderedList],
378
+ item: ListGroup,
380
379
  doc_serializer: "BaseDocSerializer",
381
380
  doc: DoclingDocument,
382
381
  list_level: int = 0,
@@ -406,7 +405,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
406
405
  text_res = f"{text_res}{delim}"
407
406
  wrap_tag = (
408
407
  DocumentToken.ORDERED_LIST.value
409
- if isinstance(item, OrderedList)
408
+ if item.first_item_is_enumerated(doc)
410
409
  else DocumentToken.UNORDERED_LIST.value
411
410
  )
412
411
  text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
@@ -58,9 +58,9 @@ from docling_core.types.doc.document import (
58
58
  ImageRef,
59
59
  InlineGroup,
60
60
  KeyValueItem,
61
+ ListGroup,
61
62
  ListItem,
62
63
  NodeItem,
63
- OrderedList,
64
64
  PictureClassificationData,
65
65
  PictureItem,
66
66
  PictureMoleculeData,
@@ -70,7 +70,6 @@ from docling_core.types.doc.document import (
70
70
  TableItem,
71
71
  TextItem,
72
72
  TitleItem,
73
- UnorderedList,
74
73
  )
75
74
  from docling_core.types.doc.labels import DocItemLabel
76
75
  from docling_core.types.doc.utils import (
@@ -117,6 +116,8 @@ class HTMLParams(CommonParams):
117
116
 
118
117
  include_annotations: bool = True
119
118
 
119
+ show_original_list_item_marker: bool = True
120
+
120
121
 
121
122
  class HTMLTextSerializer(BaseModel, BaseTextSerializer):
122
123
  """HTML-specific text item serializer."""
@@ -162,7 +163,19 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
162
163
  elif isinstance(item, ListItem):
163
164
  # List items are handled by list serializer
164
165
  text_inner = self._prepare_content(item.text)
165
- text = get_html_tag_with_text_direction(html_tag="li", text=text_inner)
166
+ text = (
167
+ get_html_tag_with_text_direction(
168
+ html_tag="li",
169
+ text=text_inner,
170
+ attrs=(
171
+ {"style": f"list-style-type: '{item.marker} ';"}
172
+ if params.show_original_list_item_marker and item.marker
173
+ else {}
174
+ ),
175
+ )
176
+ if text_inner
177
+ else ""
178
+ )
166
179
 
167
180
  elif is_inline_scope:
168
181
  text = self._prepare_content(item.text)
@@ -680,7 +693,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
680
693
  def serialize(
681
694
  self,
682
695
  *,
683
- item: Union[UnorderedList, OrderedList],
696
+ item: ListGroup,
684
697
  doc_serializer: "BaseDocSerializer",
685
698
  doc: DoclingDocument,
686
699
  list_level: int = 0,
@@ -690,7 +703,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
690
703
  ) -> SerializationResult:
691
704
  """Serializes a list to HTML."""
692
705
  my_visited: set[str] = visited if visited is not None else set()
693
-
706
+ params = HTMLParams(**kwargs)
694
707
  # Get all child parts
695
708
  parts = doc_serializer.get_parts(
696
709
  item=item,
@@ -706,17 +719,51 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
706
719
  (
707
720
  p.text
708
721
  if (
709
- (p.text.startswith("<li>") and p.text.endswith("</li>"))
710
- or (p.text.startswith("<ol>") and p.text.endswith("</ol>"))
711
- or (p.text.startswith("<ul>") and p.text.endswith("</ul>"))
722
+ (
723
+ p.text.startswith(("<li>", "<li "))
724
+ and p.text.endswith("</li>")
725
+ )
726
+ or (
727
+ p.text.startswith(("<ol>", "<ol "))
728
+ and p.text.endswith("</ol>")
729
+ )
730
+ or (
731
+ p.text.startswith(("<ul>", "<ul "))
732
+ and p.text.endswith("</ul>")
733
+ )
734
+ )
735
+ else (
736
+ get_html_tag_with_text_direction(
737
+ html_tag="li",
738
+ text=p.text,
739
+ attrs=(
740
+ {
741
+ "style": f"list-style-type: '{grandparent_item.marker} ';"
742
+ }
743
+ if params.show_original_list_item_marker
744
+ and grandparent_item.marker
745
+ else {}
746
+ ),
747
+ )
748
+ if p.spans
749
+ and p.spans[0].item.parent
750
+ and isinstance(
751
+ (parent_item := p.spans[0].item.parent.resolve(doc)),
752
+ InlineGroup,
753
+ )
754
+ and parent_item.parent
755
+ and isinstance(
756
+ (grandparent_item := parent_item.parent.resolve(doc)),
757
+ ListItem,
758
+ )
759
+ else f"<li>{p.text}</li>"
712
760
  )
713
- else f"<li>{p.text}</li>"
714
761
  )
715
762
  for p in parts
716
763
  ]
717
764
  )
718
765
  if text_res:
719
- tag = "ol" if isinstance(item, OrderedList) else "ul"
766
+ tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
720
767
  text_res = f"<{tag}>\n{text_res}\n</{tag}>"
721
768
 
722
769
  return create_ser_result(text=text_res, span_source=parts)
@@ -7,6 +7,7 @@
7
7
  import html
8
8
  import re
9
9
  import textwrap
10
+ from enum import Enum
10
11
  from pathlib import Path
11
12
  from typing import Any, Optional, Union
12
13
 
@@ -31,7 +32,6 @@ from docling_core.transforms.serializer.common import (
31
32
  CommonParams,
32
33
  DocSerializer,
33
34
  _get_annotation_text,
34
- _PageBreakSerResult,
35
35
  create_ser_result,
36
36
  )
37
37
  from docling_core.types.doc.base import ImageRefMode
@@ -48,8 +48,9 @@ from docling_core.types.doc.document import (
48
48
  ImageRef,
49
49
  InlineGroup,
50
50
  KeyValueItem,
51
+ ListGroup,
52
+ ListItem,
51
53
  NodeItem,
52
- OrderedList,
53
54
  PictureClassificationData,
54
55
  PictureItem,
55
56
  PictureMoleculeData,
@@ -58,7 +59,6 @@ from docling_core.types.doc.document import (
58
59
  TableItem,
59
60
  TextItem,
60
61
  TitleItem,
61
- UnorderedList,
62
62
  )
63
63
 
64
64
 
@@ -79,6 +79,14 @@ def _get_annotation_ser_result(
79
79
  )
80
80
 
81
81
 
82
+ class OrigListItemMarkerMode(str, Enum):
83
+ """Display mode for original list item marker."""
84
+
85
+ NEVER = "never"
86
+ ALWAYS = "always"
87
+ AUTO = "auto"
88
+
89
+
82
90
  class MarkdownParams(CommonParams):
83
91
  """Markdown-specific serialization parameters."""
84
92
 
@@ -93,6 +101,8 @@ class MarkdownParams(CommonParams):
93
101
  escape_html: bool = True
94
102
  include_annotations: bool = True
95
103
  mark_annotations: bool = False
104
+ orig_list_item_marker_mode: OrigListItemMarkerMode = OrigListItemMarkerMode.AUTO
105
+ ensure_valid_list_item_marker: bool = True
96
106
 
97
107
 
98
108
  class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
@@ -117,7 +127,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
117
127
  escape_html = True
118
128
  escape_underscores = True
119
129
  processing_pending = True
120
- if isinstance(item, (TitleItem, SectionHeaderItem)):
130
+ if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
121
131
  # case where processing/formatting should be applied first (in inner scope)
122
132
  processing_pending = False
123
133
  if (
@@ -127,7 +137,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
127
137
  (child_group := item.children[0].resolve(doc)), InlineGroup
128
138
  )
129
139
  ):
130
- # case of heading with inline
140
+ # case of inline within heading / list item
131
141
  ser_res = doc_serializer.serialize(item=child_group)
132
142
  text = ser_res.text
133
143
  for span in ser_res.spans:
@@ -140,8 +150,55 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
140
150
  formatting=item.formatting,
141
151
  hyperlink=item.hyperlink,
142
152
  )
143
- num_hashes = 1 if isinstance(item, TitleItem) else item.level + 1
144
- text_part = f"{num_hashes * '#'} {text}"
153
+
154
+ if isinstance(item, ListItem):
155
+ pieces: list[str] = []
156
+ case_auto = (
157
+ params.orig_list_item_marker_mode == OrigListItemMarkerMode.AUTO
158
+ and bool(re.search(r"[a-zA-Z0-9]", item.marker))
159
+ )
160
+ case_already_valid = (
161
+ params.ensure_valid_list_item_marker
162
+ and params.orig_list_item_marker_mode
163
+ != OrigListItemMarkerMode.NEVER
164
+ and (
165
+ item.marker in ["-", "*", "+"]
166
+ or re.fullmatch(r"\d+\.", item.marker)
167
+ )
168
+ )
169
+
170
+ # wrap with outer marker (if applicable)
171
+ if params.ensure_valid_list_item_marker and not case_already_valid:
172
+ assert item.parent and isinstance(
173
+ (list_group := item.parent.resolve(doc)), ListGroup
174
+ )
175
+ if list_group.first_item_is_enumerated(doc) and (
176
+ params.orig_list_item_marker_mode != OrigListItemMarkerMode.AUTO
177
+ or not item.marker
178
+ ):
179
+ pos = -1
180
+ for i, child in enumerate(list_group.children):
181
+ if child.resolve(doc) == item:
182
+ pos = i
183
+ break
184
+ md_marker = f"{pos + 1}."
185
+ else:
186
+ md_marker = "-"
187
+ pieces.append(md_marker)
188
+
189
+ # include original marker (if applicable)
190
+ if item.marker and (
191
+ params.orig_list_item_marker_mode == OrigListItemMarkerMode.ALWAYS
192
+ or case_auto
193
+ or case_already_valid
194
+ ):
195
+ pieces.append(item.marker)
196
+
197
+ pieces.append(text)
198
+ text_part = " ".join(pieces)
199
+ else:
200
+ num_hashes = 1 if isinstance(item, TitleItem) else item.level + 1
201
+ text_part = f"{num_hashes * '#'} {text}"
145
202
  elif isinstance(item, CodeItem):
146
203
  text_part = f"`{text}`" if is_inline_scope else f"```\n{text}\n```"
147
204
  escape_html = False
@@ -452,7 +509,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
452
509
  def serialize(
453
510
  self,
454
511
  *,
455
- item: Union[UnorderedList, OrderedList],
512
+ item: ListGroup,
456
513
  doc_serializer: "BaseDocSerializer",
457
514
  doc: DoclingDocument,
458
515
  list_level: int = 0,
@@ -473,27 +530,24 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
473
530
  sep = "\n"
474
531
  my_parts: list[SerializationResult] = []
475
532
  for p in parts:
476
- if p.text and p.text[0] == " " and my_parts:
477
- my_parts[-1].text = sep.join([my_parts[-1].text, p.text]) # update last
533
+ if (
534
+ my_parts
535
+ and p.text
536
+ and p.spans
537
+ and p.spans[0].item.parent
538
+ and isinstance(p.spans[0].item.parent.resolve(doc), InlineGroup)
539
+ ):
540
+ my_parts[-1].text = f"{my_parts[-1].text}{p.text}" # append to last
478
541
  my_parts[-1].spans.extend(p.spans)
479
542
  else:
480
543
  my_parts.append(p)
481
544
 
482
545
  indent_str = list_level * params.indent * " "
483
- is_ol = isinstance(item, OrderedList)
484
546
  text_res = sep.join(
485
547
  [
486
548
  # avoid additional marker on already evaled sublists
487
- (
488
- c.text
489
- if c.text and c.text[0] == " "
490
- else (
491
- f"{indent_str}"
492
- f"{'' if isinstance(c, _PageBreakSerResult) else (f'{i + 1}. ' if is_ol else '- ')}" # noqa: E501
493
- f"{c.text}"
494
- )
495
- )
496
- for i, c in enumerate(my_parts)
549
+ (c.text if c.text and c.text[0] == " " else f"{indent_str}{c.text}")
550
+ for c in my_parts
497
551
  ]
498
552
  )
499
553
  return create_ser_result(text=text_res, span_source=my_parts)
@@ -32,6 +32,7 @@ from .document import (
32
32
  ImageRef,
33
33
  InlineGroup,
34
34
  KeyValueItem,
35
+ ListGroup,
35
36
  ListItem,
36
37
  MiscAnnotation,
37
38
  NodeItem,