docling-core 2.38.0__tar.gz → 2.38.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.38.0 → docling_core-2.38.1}/PKG-INFO +1 -1
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/serializer/common.py +1 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/serializer/markdown.py +43 -18
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.38.0 → docling_core-2.38.1}/pyproject.toml +1 -1
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_serialization.py +14 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/LICENSE +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/README.md +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/cli/view.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/py.typed +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/search/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/search/mapping.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/search/meta.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/search/package.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/serializer/html.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/base.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/doc/document.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/utils/alias.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/utils/file.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/utils/validate.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core/utils/validators.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core.egg-info/SOURCES.txt +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/setup.cfg +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_base.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_collection.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_doc_base.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_doc_schema.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_docling_doc.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_doctags_load.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_nlp_qa.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_page.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_rec_schema.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_search_meta.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_utils.py +0 -0
- {docling_core-2.38.0 → docling_core-2.38.1}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.38.0
|
|
3
|
+
Version: 2.38.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -106,26 +106,49 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
106
106
|
doc_serializer: BaseDocSerializer,
|
|
107
107
|
doc: DoclingDocument,
|
|
108
108
|
is_inline_scope: bool = False,
|
|
109
|
+
visited: Optional[set[str]] = None, # refs of visited items
|
|
109
110
|
**kwargs: Any,
|
|
110
111
|
) -> SerializationResult:
|
|
111
112
|
"""Serializes the passed item."""
|
|
113
|
+
my_visited = visited if visited is not None else set()
|
|
112
114
|
params = MarkdownParams(**kwargs)
|
|
113
115
|
res_parts: list[SerializationResult] = []
|
|
116
|
+
text = item.text
|
|
114
117
|
escape_html = True
|
|
115
118
|
escape_underscores = True
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
119
|
+
processing_pending = True
|
|
120
|
+
if isinstance(item, (TitleItem, SectionHeaderItem)):
|
|
121
|
+
# case where processing/formatting should be applied first (in inner scope)
|
|
122
|
+
processing_pending = False
|
|
123
|
+
if (
|
|
124
|
+
text == ""
|
|
125
|
+
and len(item.children) == 1
|
|
126
|
+
and isinstance(
|
|
127
|
+
(child_group := item.children[0].resolve(doc)), InlineGroup
|
|
128
|
+
)
|
|
129
|
+
):
|
|
130
|
+
# case of heading with inline
|
|
131
|
+
ser_res = doc_serializer.serialize(item=child_group)
|
|
132
|
+
text = ser_res.text
|
|
133
|
+
for span in ser_res.spans:
|
|
134
|
+
my_visited.add(span.item.self_ref)
|
|
135
|
+
else:
|
|
136
|
+
text = doc_serializer.post_process(
|
|
137
|
+
text=text,
|
|
138
|
+
escape_html=escape_html,
|
|
139
|
+
escape_underscores=escape_underscores,
|
|
140
|
+
formatting=item.formatting,
|
|
141
|
+
hyperlink=item.hyperlink,
|
|
142
|
+
)
|
|
143
|
+
num_hashes = 1 if isinstance(item, TitleItem) else item.level + 1
|
|
144
|
+
text_part = f"{num_hashes * '#'} {text}"
|
|
120
145
|
elif isinstance(item, CodeItem):
|
|
121
|
-
text_part = (
|
|
122
|
-
f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
|
|
123
|
-
)
|
|
146
|
+
text_part = f"`{text}`" if is_inline_scope else f"```\n{text}\n```"
|
|
124
147
|
escape_html = False
|
|
125
148
|
escape_underscores = False
|
|
126
149
|
elif isinstance(item, FormulaItem):
|
|
127
|
-
if item.text:
|
|
128
|
-
text_part = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
|
|
150
|
+
if text:
|
|
151
|
+
text_part = f"${text}$" if is_inline_scope else f"$${text}$$"
|
|
129
152
|
elif item.orig:
|
|
130
153
|
text_part = "<!-- formula-not-decoded -->"
|
|
131
154
|
else:
|
|
@@ -133,9 +156,10 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
133
156
|
escape_html = False
|
|
134
157
|
escape_underscores = False
|
|
135
158
|
elif params.wrap_width:
|
|
136
|
-
|
|
159
|
+
# although wrapping is not guaranteed if post-processing makes changes
|
|
160
|
+
text_part = textwrap.fill(text, width=params.wrap_width)
|
|
137
161
|
else:
|
|
138
|
-
text_part = item.text
|
|
162
|
+
text_part = text
|
|
139
163
|
|
|
140
164
|
if text_part:
|
|
141
165
|
text_res = create_ser_result(text=text_part, span_source=item)
|
|
@@ -147,13 +171,14 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
147
171
|
res_parts.append(cap_res)
|
|
148
172
|
|
|
149
173
|
text = (" " if is_inline_scope else "\n\n").join([r.text for r in res_parts])
|
|
150
|
-
|
|
151
|
-
text=
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
174
|
+
if processing_pending:
|
|
175
|
+
text = doc_serializer.post_process(
|
|
176
|
+
text=text,
|
|
177
|
+
escape_html=escape_html,
|
|
178
|
+
escape_underscores=escape_underscores,
|
|
179
|
+
formatting=item.formatting,
|
|
180
|
+
hyperlink=item.hyperlink,
|
|
181
|
+
)
|
|
157
182
|
return create_ser_result(text=text, span_source=res_parts)
|
|
158
183
|
|
|
159
184
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.38.0
|
|
3
|
+
Version: 2.38.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.38.0" # DO NOT EDIT, updated automatically
|
|
3
|
+
version = "2.38.1" # DO NOT EDIT, updated automatically
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
license-files = ["LICENSE"]
|
|
@@ -196,6 +196,20 @@ def test_md_charts():
|
|
|
196
196
|
verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
|
|
197
197
|
|
|
198
198
|
|
|
199
|
+
def test_md_inline_and_formatting():
|
|
200
|
+
src = Path("./test/data/doc/inline_and_formatting.yaml")
|
|
201
|
+
doc = DoclingDocument.load_from_yaml(src)
|
|
202
|
+
|
|
203
|
+
ser = MarkdownDocSerializer(
|
|
204
|
+
doc=doc,
|
|
205
|
+
params=MarkdownParams(
|
|
206
|
+
image_mode=ImageRefMode.PLACEHOLDER,
|
|
207
|
+
),
|
|
208
|
+
)
|
|
209
|
+
actual = ser.serialize().text
|
|
210
|
+
verify(exp_file=src.parent / f"{src.stem}.md", actual=actual)
|
|
211
|
+
|
|
212
|
+
|
|
199
213
|
def test_html_cross_page_list_page_break():
|
|
200
214
|
src = Path("./test/data/doc/activities.json")
|
|
201
215
|
doc = DoclingDocument.load_from_json(src)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/tokenizer/__init__.py
RENAMED
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/tokenizer/base.py
RENAMED
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/tokenizer/huggingface.py
RENAMED
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/chunker/tokenizer/openai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/serializer/html_styles.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
|
File without changes
|
|
File without changes
|
{docling_core-2.38.0 → docling_core-2.38.1}/docling_core/transforms/visualizer/table_visualizer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|