docling-core 2.44.2__tar.gz → 2.46.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.44.2 → docling_core-2.46.0}/PKG-INFO +1 -1
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/common.py +1 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/doctags.py +2 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/html.py +18 -12
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/markdown.py +8 -1
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/__init__.py +2 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/document.py +285 -257
- docling_core-2.46.0/docling_core/types/doc/utils.py +282 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/legacy.py +1 -1
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.44.2 → docling_core-2.46.0}/pyproject.toml +1 -1
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_docling_doc.py +332 -1
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_serialization.py +29 -1
- docling_core-2.44.2/docling_core/types/doc/utils.py +0 -86
- {docling_core-2.44.2 → docling_core-2.46.0}/LICENSE +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/README.md +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/py.typed +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/search/package.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/page_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core/utils/validators.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/SOURCES.txt +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/setup.cfg +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_collection.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_doc_base.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_doc_schema.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_doctags_load.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_nlp_qa.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_page.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_page_chunker.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_rec_schema.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_search_meta.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_utils.py +0 -0
- {docling_core-2.44.2 → docling_core-2.46.0}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.44.2
|
|
3
|
+
Version: 2.46.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -157,6 +157,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
157
157
|
item: TableItem,
|
|
158
158
|
doc_serializer: BaseDocSerializer,
|
|
159
159
|
doc: DoclingDocument,
|
|
160
|
+
visited: Optional[set[str]] = None,
|
|
160
161
|
**kwargs: Any,
|
|
161
162
|
) -> SerializationResult:
|
|
162
163
|
"""Serializes the passed item."""
|
|
@@ -179,6 +180,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
179
180
|
add_cell_text=params.add_table_cell_text,
|
|
180
181
|
xsize=params.xsize,
|
|
181
182
|
ysize=params.ysize,
|
|
183
|
+
visited=visited,
|
|
182
184
|
)
|
|
183
185
|
res_parts.append(create_ser_result(text=otsl_text, span_source=item))
|
|
184
186
|
|
|
@@ -65,8 +65,8 @@ from docling_core.types.doc.document import (
|
|
|
65
65
|
PictureItem,
|
|
66
66
|
PictureMoleculeData,
|
|
67
67
|
PictureTabularChartData,
|
|
68
|
+
RichTableCell,
|
|
68
69
|
SectionHeaderItem,
|
|
69
|
-
TableCell,
|
|
70
70
|
TableItem,
|
|
71
71
|
TextItem,
|
|
72
72
|
TitleItem,
|
|
@@ -346,9 +346,6 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
346
346
|
**kwargs: Any,
|
|
347
347
|
) -> SerializationResult:
|
|
348
348
|
"""Serializes the passed table item to HTML."""
|
|
349
|
-
nrows = item.data.num_rows
|
|
350
|
-
ncols = item.data.num_cols
|
|
351
|
-
|
|
352
349
|
res_parts: list[SerializationResult] = []
|
|
353
350
|
cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
|
|
354
351
|
if cap_res.text:
|
|
@@ -356,11 +353,11 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
356
353
|
|
|
357
354
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
358
355
|
body = ""
|
|
356
|
+
span_source: Union[DocItem, list[SerializationResult]] = []
|
|
359
357
|
|
|
360
|
-
for i in range(nrows):
|
|
358
|
+
for i, row in enumerate(item.data.grid):
|
|
361
359
|
body += "<tr>"
|
|
362
|
-
for j in range(ncols):
|
|
363
|
-
cell: TableCell = item.data.grid[i][j]
|
|
360
|
+
for j, cell in enumerate(row):
|
|
364
361
|
|
|
365
362
|
rowspan, rowstart = (
|
|
366
363
|
cell.row_span,
|
|
@@ -376,7 +373,16 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
376
373
|
if colstart != j:
|
|
377
374
|
continue
|
|
378
375
|
|
|
379
|
-
|
|
376
|
+
if isinstance(cell, RichTableCell):
|
|
377
|
+
ser_res = doc_serializer.serialize(
|
|
378
|
+
item=cell.ref.resolve(doc=doc), **kwargs
|
|
379
|
+
)
|
|
380
|
+
content = ser_res.text
|
|
381
|
+
span_source = [ser_res]
|
|
382
|
+
else:
|
|
383
|
+
content = html.escape(cell.text.strip())
|
|
384
|
+
span_source = item
|
|
385
|
+
|
|
380
386
|
celltag = "td"
|
|
381
387
|
if cell.column_header or cell.row_header or cell.row_section:
|
|
382
388
|
celltag = "th"
|
|
@@ -389,14 +395,14 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
389
395
|
|
|
390
396
|
text_dir = get_text_direction(content)
|
|
391
397
|
if text_dir == "rtl":
|
|
392
|
-
opening_tag += f' dir="{
|
|
398
|
+
opening_tag += f' dir="{text_dir}"'
|
|
393
399
|
|
|
394
400
|
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
395
401
|
body += "</tr>"
|
|
396
402
|
|
|
397
403
|
if body:
|
|
398
404
|
body = f"<tbody>{body}</tbody>"
|
|
399
|
-
res_parts.append(create_ser_result(text=body, span_source=item))
|
|
405
|
+
res_parts.append(create_ser_result(text=body, span_source=span_source))
|
|
400
406
|
|
|
401
407
|
text_res = "".join([r.text for r in res_parts])
|
|
402
408
|
text_res = f"<table>{text_res}</table>" if text_res else ""
|
|
@@ -1057,7 +1063,7 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
1057
1063
|
if self.params.html_head is not None:
|
|
1058
1064
|
return self.params.html_head
|
|
1059
1065
|
|
|
1060
|
-
head_parts = ["<head>", '<meta charset="UTF-8">']
|
|
1066
|
+
head_parts = ["<head>", '<meta charset="UTF-8"/>']
|
|
1061
1067
|
|
|
1062
1068
|
# Add metadata if requested
|
|
1063
1069
|
if params.add_document_metadata:
|
|
@@ -1067,7 +1073,7 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
1067
1073
|
head_parts.append("<title>Docling Document</title>")
|
|
1068
1074
|
|
|
1069
1075
|
head_parts.append(
|
|
1070
|
-
'<meta name="generator" content="Docling HTML Serializer">'
|
|
1076
|
+
'<meta name="generator" content="Docling HTML Serializer"/>'
|
|
1071
1077
|
)
|
|
1072
1078
|
|
|
1073
1079
|
# Add default styles or custom CSS
|
|
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
|
|
|
55
55
|
PictureItem,
|
|
56
56
|
PictureMoleculeData,
|
|
57
57
|
PictureTabularChartData,
|
|
58
|
+
RichTableCell,
|
|
58
59
|
SectionHeaderItem,
|
|
59
60
|
TableItem,
|
|
60
61
|
TextItem,
|
|
@@ -320,7 +321,13 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
320
321
|
[
|
|
321
322
|
# make sure that md tables are not broken
|
|
322
323
|
# due to newline chars in the text
|
|
323
|
-
|
|
324
|
+
(
|
|
325
|
+
doc_serializer.serialize(
|
|
326
|
+
item=col.ref.resolve(doc=doc), **kwargs
|
|
327
|
+
).text
|
|
328
|
+
if isinstance(col, RichTableCell)
|
|
329
|
+
else col.text
|
|
330
|
+
).replace("\n", " ")
|
|
324
331
|
for col in row
|
|
325
332
|
]
|
|
326
333
|
for row in item.data.grid
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
|
|
9
9
|
from .document import (
|
|
10
|
+
AnyTableCell,
|
|
10
11
|
BaseAnnotation,
|
|
11
12
|
ChartBar,
|
|
12
13
|
ChartLine,
|
|
@@ -52,6 +53,7 @@ from .document import (
|
|
|
52
53
|
PictureTabularChartData,
|
|
53
54
|
ProvenanceItem,
|
|
54
55
|
RefItem,
|
|
56
|
+
RichTableCell,
|
|
55
57
|
Script,
|
|
56
58
|
SectionHeaderItem,
|
|
57
59
|
TableCell,
|