docling-core 2.26.4__tar.gz → 2.27.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.26.4 → docling_core-2.27.0}/PKG-INFO +1 -1
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/serializer/html.py +27 -1
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/doc/document.py +2 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/doc/labels.py +2 -1
- {docling_core-2.26.4 → docling_core-2.27.0}/pyproject.toml +3 -1
- {docling_core-2.26.4 → docling_core-2.27.0}/LICENSE +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/README.md +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/serializer/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/serializer/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/serializer/common.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/serializer/doctags.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/serializer/html_styles.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/serializer/markdown.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/py.typed +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/search/package.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.26.4 → docling_core-2.27.0}/docling_core/utils/validators.py +0 -0
|
@@ -57,6 +57,7 @@ from docling_core.types.doc.document import (
|
|
|
57
57
|
NodeItem,
|
|
58
58
|
OrderedList,
|
|
59
59
|
PictureItem,
|
|
60
|
+
PictureTabularChartData,
|
|
60
61
|
SectionHeaderItem,
|
|
61
62
|
TableCell,
|
|
62
63
|
TableItem,
|
|
@@ -104,6 +105,9 @@ class HTMLParams(CommonParams):
|
|
|
104
105
|
# Allow for different output styles
|
|
105
106
|
output_style: HTMLOutputStyle = HTMLOutputStyle.SINGLE_COLUMN
|
|
106
107
|
|
|
108
|
+
# Enable charts to be printed into HTML as tables
|
|
109
|
+
enable_chart_tables: bool = True
|
|
110
|
+
|
|
107
111
|
|
|
108
112
|
class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
109
113
|
"""HTML-specific text item serializer."""
|
|
@@ -402,9 +406,28 @@ class HTMLPictureSerializer(BasePictureSerializer):
|
|
|
402
406
|
and item.image.uri.scheme == "data"
|
|
403
407
|
):
|
|
404
408
|
img_text = f'<img src="{quote(str(item.image.uri))}">'
|
|
409
|
+
|
|
405
410
|
if img_text:
|
|
406
411
|
res_parts.append(create_ser_result(text=img_text, span_source=item))
|
|
407
412
|
|
|
413
|
+
if params.enable_chart_tables:
|
|
414
|
+
# Check if picture has attached PictureTabularChartData
|
|
415
|
+
tabular_chart_annotations = [
|
|
416
|
+
ann
|
|
417
|
+
for ann in item.annotations
|
|
418
|
+
if isinstance(ann, PictureTabularChartData)
|
|
419
|
+
]
|
|
420
|
+
if len(tabular_chart_annotations) > 0:
|
|
421
|
+
temp_doc = DoclingDocument(name="temp")
|
|
422
|
+
temp_table = temp_doc.add_table(
|
|
423
|
+
data=tabular_chart_annotations[0].chart_data
|
|
424
|
+
)
|
|
425
|
+
html_table_content = temp_table.export_to_html(temp_doc)
|
|
426
|
+
if len(html_table_content) > 0:
|
|
427
|
+
res_parts.append(
|
|
428
|
+
create_ser_result(text=html_table_content, span_source=item)
|
|
429
|
+
)
|
|
430
|
+
|
|
408
431
|
text_res = "".join([r.text for r in res_parts])
|
|
409
432
|
if text_res:
|
|
410
433
|
text_res = f"<figure>{text_res}</figure>"
|
|
@@ -779,6 +802,8 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
779
802
|
]
|
|
780
803
|
|
|
781
804
|
if self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
|
|
805
|
+
applicable_pages = self._get_applicable_pages()
|
|
806
|
+
|
|
782
807
|
html_content = "\n".join([p.text for p in parts if p.text])
|
|
783
808
|
next_page: Optional[int] = None
|
|
784
809
|
prev_full_match_end = 0
|
|
@@ -791,11 +816,12 @@ class HTMLDocSerializer(DocSerializer):
|
|
|
791
816
|
# capture last page
|
|
792
817
|
if next_page is not None:
|
|
793
818
|
pages[next_page] = html_content[prev_full_match_end:]
|
|
819
|
+
elif applicable_pages is not None and len(applicable_pages) == 1:
|
|
820
|
+
pages[applicable_pages[0]] = html_content
|
|
794
821
|
|
|
795
822
|
html_parts.append("<table>")
|
|
796
823
|
html_parts.append("<tbody>")
|
|
797
824
|
|
|
798
|
-
applicable_pages = self._get_applicable_pages()
|
|
799
825
|
for page_no, page in pages.items():
|
|
800
826
|
|
|
801
827
|
if isinstance(page_no, int):
|
|
@@ -3143,6 +3143,7 @@ class DoclingDocument(BaseModel):
|
|
|
3143
3143
|
from_element: int = 0,
|
|
3144
3144
|
to_element: int = sys.maxsize,
|
|
3145
3145
|
labels: Optional[set[DocItemLabel]] = None,
|
|
3146
|
+
enable_chart_tables: bool = True,
|
|
3146
3147
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
3147
3148
|
formula_to_mathml: bool = True,
|
|
3148
3149
|
page_no: Optional[int] = None,
|
|
@@ -3176,6 +3177,7 @@ class DoclingDocument(BaseModel):
|
|
|
3176
3177
|
start_idx=from_element,
|
|
3177
3178
|
stop_idx=to_element,
|
|
3178
3179
|
image_mode=image_mode,
|
|
3180
|
+
enable_chart_tables=enable_chart_tables,
|
|
3179
3181
|
formula_to_mathml=formula_to_mathml,
|
|
3180
3182
|
html_head=html_head,
|
|
3181
3183
|
html_lang=html_lang,
|
|
@@ -45,6 +45,7 @@ class DocItemLabel(str, Enum):
|
|
|
45
45
|
DocItemLabel.PAGE_FOOTER: (204, 255, 204),
|
|
46
46
|
DocItemLabel.PAGE_HEADER: (204, 255, 204),
|
|
47
47
|
DocItemLabel.PICTURE: (255, 204, 164),
|
|
48
|
+
DocItemLabel.CHART: (255, 204, 164),
|
|
48
49
|
DocItemLabel.SECTION_HEADER: (255, 153, 153),
|
|
49
50
|
DocItemLabel.TABLE: (255, 204, 204),
|
|
50
51
|
DocItemLabel.TEXT: (255, 255, 153),
|
|
@@ -58,7 +59,7 @@ class DocItemLabel(str, Enum):
|
|
|
58
59
|
DocItemLabel.PARAGRAPH: (255, 255, 153),
|
|
59
60
|
DocItemLabel.REFERENCE: (176, 224, 230),
|
|
60
61
|
}
|
|
61
|
-
return color_map
|
|
62
|
+
return color_map.get(label, (0, 0, 0))
|
|
62
63
|
|
|
63
64
|
|
|
64
65
|
class GroupLabel(str, Enum):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.26.4"
|
|
3
|
+
version = "2.27.0"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -80,6 +80,8 @@ types-setuptools = "^70.3.0"
|
|
|
80
80
|
python-semantic-release = "^7.32.2"
|
|
81
81
|
pandas-stubs = "^2.1.4.231227"
|
|
82
82
|
ipykernel = "^6.29.5"
|
|
83
|
+
coverage = "^7.6.2"
|
|
84
|
+
pytest-cov = "^6.0.0"
|
|
83
85
|
|
|
84
86
|
[tool.poetry.group.constraints]
|
|
85
87
|
optional = true
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/serializer/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/serializer/html_styles.py
RENAMED
|
File without changes
|
{docling_core-2.26.4 → docling_core-2.27.0}/docling_core/experimental/serializer/markdown.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.26.4 → docling_core-2.27.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.26.4 → docling_core-2.27.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.26.4 → docling_core-2.27.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.26.4 → docling_core-2.27.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|