docling-core 2.38.2__tar.gz → 2.40.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.38.2 → docling_core-2.40.0}/PKG-INFO +1 -1
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/hierarchical_chunker.py +2 -3
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/base.py +2 -3
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/common.py +3 -4
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/doctags.py +4 -5
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/html.py +57 -10
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/markdown.py +75 -21
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/__init__.py +1 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/document.py +78 -65
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/labels.py +1 -1
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/page.py +3 -2
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/utils.py +18 -7
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/file.py +27 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/legacy.py +1 -2
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.38.2 → docling_core-2.40.0}/pyproject.toml +1 -1
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_doc_legacy_convert.py +7 -7
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_docling_doc.py +51 -45
- docling_core-2.40.0/test/test_page.py +214 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_serialization.py +41 -0
- docling_core-2.38.2/test/test_page.py +0 -79
- {docling_core-2.38.2 → docling_core-2.40.0}/LICENSE +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/README.md +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/py.typed +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/search/package.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/base.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core/utils/validators.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/SOURCES.txt +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/setup.cfg +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_base.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_collection.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_doc_base.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_doc_schema.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_doctags_load.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_nlp_qa.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_rec_schema.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_search_meta.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_utils.py +0 -0
- {docling_core-2.38.2 → docling_core-2.40.0}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.38.2
|
|
3
|
+
Version: 2.40.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
{docling_core-2.38.2 → docling_core-2.40.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
@@ -35,11 +35,10 @@ from docling_core.types.doc.document import (
|
|
|
35
35
|
DocumentOrigin,
|
|
36
36
|
InlineGroup,
|
|
37
37
|
LevelNumber,
|
|
38
|
-
OrderedList,
|
|
38
|
+
ListGroup,
|
|
39
39
|
SectionHeaderItem,
|
|
40
40
|
TableItem,
|
|
41
41
|
TitleItem,
|
|
42
|
-
UnorderedList,
|
|
43
42
|
)
|
|
44
43
|
|
|
45
44
|
_VERSION: Final = "1.0.0"
|
|
@@ -240,7 +239,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
240
239
|
heading_by_level.pop(k, None)
|
|
241
240
|
continue
|
|
242
241
|
elif (
|
|
243
|
-
isinstance(item, (
|
|
242
|
+
isinstance(item, (ListGroup, InlineGroup, DocItem))
|
|
244
243
|
and item.self_ref not in visited
|
|
245
244
|
):
|
|
246
245
|
ser_res = my_doc_ser.serialize(item=item, visited=visited)
|
|
@@ -17,12 +17,11 @@ from docling_core.types.doc.document import (
|
|
|
17
17
|
FormItem,
|
|
18
18
|
InlineGroup,
|
|
19
19
|
KeyValueItem,
|
|
20
|
+
ListGroup,
|
|
20
21
|
NodeItem,
|
|
21
|
-
OrderedList,
|
|
22
22
|
PictureItem,
|
|
23
23
|
TableItem,
|
|
24
24
|
TextItem,
|
|
25
|
-
UnorderedList,
|
|
26
25
|
)
|
|
27
26
|
|
|
28
27
|
|
|
@@ -128,7 +127,7 @@ class BaseListSerializer(ABC):
|
|
|
128
127
|
def serialize(
|
|
129
128
|
self,
|
|
130
129
|
*,
|
|
131
|
-
item:
|
|
130
|
+
item: ListGroup,
|
|
132
131
|
doc_serializer: "BaseDocSerializer",
|
|
133
132
|
doc: DoclingDocument,
|
|
134
133
|
**kwargs: Any,
|
|
@@ -39,8 +39,8 @@ from docling_core.types.doc.document import (
|
|
|
39
39
|
FormItem,
|
|
40
40
|
InlineGroup,
|
|
41
41
|
KeyValueItem,
|
|
42
|
+
ListGroup,
|
|
42
43
|
NodeItem,
|
|
43
|
-
OrderedList,
|
|
44
44
|
PictureClassificationData,
|
|
45
45
|
PictureDataType,
|
|
46
46
|
PictureItem,
|
|
@@ -49,7 +49,6 @@ from docling_core.types.doc.document import (
|
|
|
49
49
|
TableAnnotationType,
|
|
50
50
|
TableItem,
|
|
51
51
|
TextItem,
|
|
52
|
-
UnorderedList,
|
|
53
52
|
)
|
|
54
53
|
from docling_core.types.doc.labels import DocItemLabel
|
|
55
54
|
|
|
@@ -89,7 +88,7 @@ def _iterate_items(
|
|
|
89
88
|
):
|
|
90
89
|
if add_page_breaks:
|
|
91
90
|
if (
|
|
92
|
-
isinstance(item, (
|
|
91
|
+
isinstance(item, (ListGroup, InlineGroup))
|
|
93
92
|
and item.self_ref not in my_visited
|
|
94
93
|
):
|
|
95
94
|
# if group starts with new page, yield page break before group node
|
|
@@ -316,7 +315,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
316
315
|
########
|
|
317
316
|
# groups
|
|
318
317
|
########
|
|
319
|
-
if isinstance(item,
|
|
318
|
+
if isinstance(item, ListGroup):
|
|
320
319
|
part = self.list_serializer.serialize(
|
|
321
320
|
item=item,
|
|
322
321
|
doc_serializer=self,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Define classes for Doctags serialization."""
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Any, Dict, List, Optional, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
from typing_extensions import override
|
|
@@ -34,9 +34,9 @@ from docling_core.types.doc.document import (
|
|
|
34
34
|
FormItem,
|
|
35
35
|
InlineGroup,
|
|
36
36
|
KeyValueItem,
|
|
37
|
+
ListGroup,
|
|
37
38
|
ListItem,
|
|
38
39
|
NodeItem,
|
|
39
|
-
OrderedList,
|
|
40
40
|
PictureClassificationData,
|
|
41
41
|
PictureItem,
|
|
42
42
|
PictureMoleculeData,
|
|
@@ -44,7 +44,6 @@ from docling_core.types.doc.document import (
|
|
|
44
44
|
ProvenanceItem,
|
|
45
45
|
TableItem,
|
|
46
46
|
TextItem,
|
|
47
|
-
UnorderedList,
|
|
48
47
|
)
|
|
49
48
|
from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
|
|
50
49
|
from docling_core.types.doc.tokens import DocumentToken
|
|
@@ -376,7 +375,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
376
375
|
def serialize(
|
|
377
376
|
self,
|
|
378
377
|
*,
|
|
379
|
-
item:
|
|
378
|
+
item: ListGroup,
|
|
380
379
|
doc_serializer: "BaseDocSerializer",
|
|
381
380
|
doc: DoclingDocument,
|
|
382
381
|
list_level: int = 0,
|
|
@@ -406,7 +405,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
406
405
|
text_res = f"{text_res}{delim}"
|
|
407
406
|
wrap_tag = (
|
|
408
407
|
DocumentToken.ORDERED_LIST.value
|
|
409
|
-
if isinstance(item, OrderedList)
|
|
408
|
+
if item.first_item_is_enumerated(doc)
|
|
410
409
|
else DocumentToken.UNORDERED_LIST.value
|
|
411
410
|
)
|
|
412
411
|
text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
|
|
@@ -58,9 +58,9 @@ from docling_core.types.doc.document import (
|
|
|
58
58
|
ImageRef,
|
|
59
59
|
InlineGroup,
|
|
60
60
|
KeyValueItem,
|
|
61
|
+
ListGroup,
|
|
61
62
|
ListItem,
|
|
62
63
|
NodeItem,
|
|
63
|
-
OrderedList,
|
|
64
64
|
PictureClassificationData,
|
|
65
65
|
PictureItem,
|
|
66
66
|
PictureMoleculeData,
|
|
@@ -70,7 +70,6 @@ from docling_core.types.doc.document import (
|
|
|
70
70
|
TableItem,
|
|
71
71
|
TextItem,
|
|
72
72
|
TitleItem,
|
|
73
|
-
UnorderedList,
|
|
74
73
|
)
|
|
75
74
|
from docling_core.types.doc.labels import DocItemLabel
|
|
76
75
|
from docling_core.types.doc.utils import (
|
|
@@ -117,6 +116,8 @@ class HTMLParams(CommonParams):
|
|
|
117
116
|
|
|
118
117
|
include_annotations: bool = True
|
|
119
118
|
|
|
119
|
+
show_original_list_item_marker: bool = True
|
|
120
|
+
|
|
120
121
|
|
|
121
122
|
class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
122
123
|
"""HTML-specific text item serializer."""
|
|
@@ -162,7 +163,19 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
162
163
|
elif isinstance(item, ListItem):
|
|
163
164
|
# List items are handled by list serializer
|
|
164
165
|
text_inner = self._prepare_content(item.text)
|
|
165
|
-
text =
|
|
166
|
+
text = (
|
|
167
|
+
get_html_tag_with_text_direction(
|
|
168
|
+
html_tag="li",
|
|
169
|
+
text=text_inner,
|
|
170
|
+
attrs=(
|
|
171
|
+
{"style": f"list-style-type: '{item.marker} ';"}
|
|
172
|
+
if params.show_original_list_item_marker and item.marker
|
|
173
|
+
else {}
|
|
174
|
+
),
|
|
175
|
+
)
|
|
176
|
+
if text_inner
|
|
177
|
+
else ""
|
|
178
|
+
)
|
|
166
179
|
|
|
167
180
|
elif is_inline_scope:
|
|
168
181
|
text = self._prepare_content(item.text)
|
|
@@ -680,7 +693,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
680
693
|
def serialize(
|
|
681
694
|
self,
|
|
682
695
|
*,
|
|
683
|
-
item:
|
|
696
|
+
item: ListGroup,
|
|
684
697
|
doc_serializer: "BaseDocSerializer",
|
|
685
698
|
doc: DoclingDocument,
|
|
686
699
|
list_level: int = 0,
|
|
@@ -690,7 +703,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
690
703
|
) -> SerializationResult:
|
|
691
704
|
"""Serializes a list to HTML."""
|
|
692
705
|
my_visited: set[str] = visited if visited is not None else set()
|
|
693
|
-
|
|
706
|
+
params = HTMLParams(**kwargs)
|
|
694
707
|
# Get all child parts
|
|
695
708
|
parts = doc_serializer.get_parts(
|
|
696
709
|
item=item,
|
|
@@ -706,17 +719,51 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
706
719
|
(
|
|
707
720
|
p.text
|
|
708
721
|
if (
|
|
709
|
-
(
|
|
710
|
-
|
|
711
|
-
|
|
722
|
+
(
|
|
723
|
+
p.text.startswith(("<li>", "<li "))
|
|
724
|
+
and p.text.endswith("</li>")
|
|
725
|
+
)
|
|
726
|
+
or (
|
|
727
|
+
p.text.startswith(("<ol>", "<ol "))
|
|
728
|
+
and p.text.endswith("</ol>")
|
|
729
|
+
)
|
|
730
|
+
or (
|
|
731
|
+
p.text.startswith(("<ul>", "<ul "))
|
|
732
|
+
and p.text.endswith("</ul>")
|
|
733
|
+
)
|
|
734
|
+
)
|
|
735
|
+
else (
|
|
736
|
+
get_html_tag_with_text_direction(
|
|
737
|
+
html_tag="li",
|
|
738
|
+
text=p.text,
|
|
739
|
+
attrs=(
|
|
740
|
+
{
|
|
741
|
+
"style": f"list-style-type: '{grandparent_item.marker} ';"
|
|
742
|
+
}
|
|
743
|
+
if params.show_original_list_item_marker
|
|
744
|
+
and grandparent_item.marker
|
|
745
|
+
else {}
|
|
746
|
+
),
|
|
747
|
+
)
|
|
748
|
+
if p.spans
|
|
749
|
+
and p.spans[0].item.parent
|
|
750
|
+
and isinstance(
|
|
751
|
+
(parent_item := p.spans[0].item.parent.resolve(doc)),
|
|
752
|
+
InlineGroup,
|
|
753
|
+
)
|
|
754
|
+
and parent_item.parent
|
|
755
|
+
and isinstance(
|
|
756
|
+
(grandparent_item := parent_item.parent.resolve(doc)),
|
|
757
|
+
ListItem,
|
|
758
|
+
)
|
|
759
|
+
else f"<li>{p.text}</li>"
|
|
712
760
|
)
|
|
713
|
-
else f"<li>{p.text}</li>"
|
|
714
761
|
)
|
|
715
762
|
for p in parts
|
|
716
763
|
]
|
|
717
764
|
)
|
|
718
765
|
if text_res:
|
|
719
|
-
tag = "ol" if isinstance(item, OrderedList) else "ul"
|
|
766
|
+
tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
|
|
720
767
|
text_res = f"<{tag}>\n{text_res}\n</{tag}>"
|
|
721
768
|
|
|
722
769
|
return create_ser_result(text=text_res, span_source=parts)
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
import html
|
|
8
8
|
import re
|
|
9
9
|
import textwrap
|
|
10
|
+
from enum import Enum
|
|
10
11
|
from pathlib import Path
|
|
11
12
|
from typing import Any, Optional, Union
|
|
12
13
|
|
|
@@ -31,7 +32,6 @@ from docling_core.transforms.serializer.common import (
|
|
|
31
32
|
CommonParams,
|
|
32
33
|
DocSerializer,
|
|
33
34
|
_get_annotation_text,
|
|
34
|
-
_PageBreakSerResult,
|
|
35
35
|
create_ser_result,
|
|
36
36
|
)
|
|
37
37
|
from docling_core.types.doc.base import ImageRefMode
|
|
@@ -48,8 +48,9 @@ from docling_core.types.doc.document import (
|
|
|
48
48
|
ImageRef,
|
|
49
49
|
InlineGroup,
|
|
50
50
|
KeyValueItem,
|
|
51
|
+
ListGroup,
|
|
52
|
+
ListItem,
|
|
51
53
|
NodeItem,
|
|
52
|
-
OrderedList,
|
|
53
54
|
PictureClassificationData,
|
|
54
55
|
PictureItem,
|
|
55
56
|
PictureMoleculeData,
|
|
@@ -58,7 +59,6 @@ from docling_core.types.doc.document import (
|
|
|
58
59
|
TableItem,
|
|
59
60
|
TextItem,
|
|
60
61
|
TitleItem,
|
|
61
|
-
UnorderedList,
|
|
62
62
|
)
|
|
63
63
|
|
|
64
64
|
|
|
@@ -79,6 +79,14 @@ def _get_annotation_ser_result(
|
|
|
79
79
|
)
|
|
80
80
|
|
|
81
81
|
|
|
82
|
+
class OrigListItemMarkerMode(str, Enum):
|
|
83
|
+
"""Display mode for original list item marker."""
|
|
84
|
+
|
|
85
|
+
NEVER = "never"
|
|
86
|
+
ALWAYS = "always"
|
|
87
|
+
AUTO = "auto"
|
|
88
|
+
|
|
89
|
+
|
|
82
90
|
class MarkdownParams(CommonParams):
|
|
83
91
|
"""Markdown-specific serialization parameters."""
|
|
84
92
|
|
|
@@ -93,6 +101,8 @@ class MarkdownParams(CommonParams):
|
|
|
93
101
|
escape_html: bool = True
|
|
94
102
|
include_annotations: bool = True
|
|
95
103
|
mark_annotations: bool = False
|
|
104
|
+
orig_list_item_marker_mode: OrigListItemMarkerMode = OrigListItemMarkerMode.AUTO
|
|
105
|
+
ensure_valid_list_item_marker: bool = True
|
|
96
106
|
|
|
97
107
|
|
|
98
108
|
class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
@@ -117,7 +127,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
117
127
|
escape_html = True
|
|
118
128
|
escape_underscores = True
|
|
119
129
|
processing_pending = True
|
|
120
|
-
if isinstance(item, (TitleItem, SectionHeaderItem)):
|
|
130
|
+
if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
|
|
121
131
|
# case where processing/formatting should be applied first (in inner scope)
|
|
122
132
|
processing_pending = False
|
|
123
133
|
if (
|
|
@@ -127,7 +137,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
127
137
|
(child_group := item.children[0].resolve(doc)), InlineGroup
|
|
128
138
|
)
|
|
129
139
|
):
|
|
130
|
-
# case of heading
|
|
140
|
+
# case of inline within heading / list item
|
|
131
141
|
ser_res = doc_serializer.serialize(item=child_group)
|
|
132
142
|
text = ser_res.text
|
|
133
143
|
for span in ser_res.spans:
|
|
@@ -140,8 +150,55 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
140
150
|
formatting=item.formatting,
|
|
141
151
|
hyperlink=item.hyperlink,
|
|
142
152
|
)
|
|
143
|
-
|
|
144
|
-
|
|
153
|
+
|
|
154
|
+
if isinstance(item, ListItem):
|
|
155
|
+
pieces: list[str] = []
|
|
156
|
+
case_auto = (
|
|
157
|
+
params.orig_list_item_marker_mode == OrigListItemMarkerMode.AUTO
|
|
158
|
+
and bool(re.search(r"[a-zA-Z0-9]", item.marker))
|
|
159
|
+
)
|
|
160
|
+
case_already_valid = (
|
|
161
|
+
params.ensure_valid_list_item_marker
|
|
162
|
+
and params.orig_list_item_marker_mode
|
|
163
|
+
!= OrigListItemMarkerMode.NEVER
|
|
164
|
+
and (
|
|
165
|
+
item.marker in ["-", "*", "+"]
|
|
166
|
+
or re.fullmatch(r"\d+\.", item.marker)
|
|
167
|
+
)
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
# wrap with outer marker (if applicable)
|
|
171
|
+
if params.ensure_valid_list_item_marker and not case_already_valid:
|
|
172
|
+
assert item.parent and isinstance(
|
|
173
|
+
(list_group := item.parent.resolve(doc)), ListGroup
|
|
174
|
+
)
|
|
175
|
+
if list_group.first_item_is_enumerated(doc) and (
|
|
176
|
+
params.orig_list_item_marker_mode != OrigListItemMarkerMode.AUTO
|
|
177
|
+
or not item.marker
|
|
178
|
+
):
|
|
179
|
+
pos = -1
|
|
180
|
+
for i, child in enumerate(list_group.children):
|
|
181
|
+
if child.resolve(doc) == item:
|
|
182
|
+
pos = i
|
|
183
|
+
break
|
|
184
|
+
md_marker = f"{pos + 1}."
|
|
185
|
+
else:
|
|
186
|
+
md_marker = "-"
|
|
187
|
+
pieces.append(md_marker)
|
|
188
|
+
|
|
189
|
+
# include original marker (if applicable)
|
|
190
|
+
if item.marker and (
|
|
191
|
+
params.orig_list_item_marker_mode == OrigListItemMarkerMode.ALWAYS
|
|
192
|
+
or case_auto
|
|
193
|
+
or case_already_valid
|
|
194
|
+
):
|
|
195
|
+
pieces.append(item.marker)
|
|
196
|
+
|
|
197
|
+
pieces.append(text)
|
|
198
|
+
text_part = " ".join(pieces)
|
|
199
|
+
else:
|
|
200
|
+
num_hashes = 1 if isinstance(item, TitleItem) else item.level + 1
|
|
201
|
+
text_part = f"{num_hashes * '#'} {text}"
|
|
145
202
|
elif isinstance(item, CodeItem):
|
|
146
203
|
text_part = f"`{text}`" if is_inline_scope else f"```\n{text}\n```"
|
|
147
204
|
escape_html = False
|
|
@@ -452,7 +509,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
452
509
|
def serialize(
|
|
453
510
|
self,
|
|
454
511
|
*,
|
|
455
|
-
item:
|
|
512
|
+
item: ListGroup,
|
|
456
513
|
doc_serializer: "BaseDocSerializer",
|
|
457
514
|
doc: DoclingDocument,
|
|
458
515
|
list_level: int = 0,
|
|
@@ -473,27 +530,24 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
473
530
|
sep = "\n"
|
|
474
531
|
my_parts: list[SerializationResult] = []
|
|
475
532
|
for p in parts:
|
|
476
|
-
if
|
|
477
|
-
my_parts
|
|
533
|
+
if (
|
|
534
|
+
my_parts
|
|
535
|
+
and p.text
|
|
536
|
+
and p.spans
|
|
537
|
+
and p.spans[0].item.parent
|
|
538
|
+
and isinstance(p.spans[0].item.parent.resolve(doc), InlineGroup)
|
|
539
|
+
):
|
|
540
|
+
my_parts[-1].text = f"{my_parts[-1].text}{p.text}" # append to last
|
|
478
541
|
my_parts[-1].spans.extend(p.spans)
|
|
479
542
|
else:
|
|
480
543
|
my_parts.append(p)
|
|
481
544
|
|
|
482
545
|
indent_str = list_level * params.indent * " "
|
|
483
|
-
is_ol = isinstance(item, OrderedList)
|
|
484
546
|
text_res = sep.join(
|
|
485
547
|
[
|
|
486
548
|
# avoid additional marker on already evaled sublists
|
|
487
|
-
(
|
|
488
|
-
c.text
|
|
489
|
-
if c.text and c.text[0] == " "
|
|
490
|
-
else (
|
|
491
|
-
f"{indent_str}"
|
|
492
|
-
f"{'' if isinstance(c, _PageBreakSerResult) else (f'{i + 1}. ' if is_ol else '- ')}" # noqa: E501
|
|
493
|
-
f"{c.text}"
|
|
494
|
-
)
|
|
495
|
-
)
|
|
496
|
-
for i, c in enumerate(my_parts)
|
|
549
|
+
(c.text if c.text and c.text[0] == " " else f"{indent_str}{c.text}")
|
|
550
|
+
for c in my_parts
|
|
497
551
|
]
|
|
498
552
|
)
|
|
499
553
|
return create_ser_result(text=text_res, span_source=my_parts)
|