docling-core 2.17.0__tar.gz → 2.17.1__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.17.0 → docling_core-2.17.1}/PKG-INFO +1 -1
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/doc/document.py +34 -17
- {docling_core-2.17.0 → docling_core-2.17.1}/pyproject.toml +1 -1
- {docling_core-2.17.0 → docling_core-2.17.1}/LICENSE +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/README.md +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/cli/view.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/py.typed +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/search/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/search/mapping.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/search/meta.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/search/package.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/base.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/utils/alias.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/utils/file.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/utils/validate.py +0 -0
- {docling_core-2.17.0 → docling_core-2.17.1}/docling_core/utils/validators.py +0 -0
|
@@ -5,6 +5,7 @@ import copy
|
|
|
5
5
|
import hashlib
|
|
6
6
|
import html
|
|
7
7
|
import json
|
|
8
|
+
import logging
|
|
8
9
|
import mimetypes
|
|
9
10
|
import os
|
|
10
11
|
import re
|
|
@@ -20,6 +21,7 @@ from xml.etree.cElementTree import SubElement, tostring
|
|
|
20
21
|
from xml.sax.saxutils import unescape
|
|
21
22
|
|
|
22
23
|
import latex2mathml.converter
|
|
24
|
+
import latex2mathml.exceptions
|
|
23
25
|
import pandas as pd
|
|
24
26
|
import yaml
|
|
25
27
|
from PIL import Image as PILImage
|
|
@@ -44,6 +46,8 @@ from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, Group
|
|
|
44
46
|
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
45
47
|
from docling_core.types.doc.utils import relative_path
|
|
46
48
|
|
|
49
|
+
_logger = logging.getLogger(__name__)
|
|
50
|
+
|
|
47
51
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
48
52
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
49
53
|
CURRENT_VERSION: Final = "1.0.0"
|
|
@@ -2487,34 +2491,47 @@ class DoclingDocument(BaseModel):
|
|
|
2487
2491
|
)
|
|
2488
2492
|
text = ""
|
|
2489
2493
|
|
|
2490
|
-
|
|
2491
|
-
if (
|
|
2492
|
-
item.text == ""
|
|
2493
|
-
and item.orig != ""
|
|
2494
|
-
and image_mode == ImageRefMode.EMBEDDED
|
|
2495
|
-
and len(item.prov) > 0
|
|
2496
|
-
):
|
|
2494
|
+
def _image_fallback(item: TextItem):
|
|
2497
2495
|
item_image = item.get_image(doc=self)
|
|
2498
2496
|
if item_image is not None:
|
|
2499
2497
|
img_ref = ImageRef.from_pil(item_image, dpi=72)
|
|
2500
|
-
|
|
2498
|
+
return (
|
|
2501
2499
|
"<figure>"
|
|
2502
2500
|
f'<img src="{img_ref.uri}" alt="{item.orig}" />'
|
|
2503
2501
|
"</figure>"
|
|
2504
2502
|
)
|
|
2505
2503
|
|
|
2504
|
+
# If the formula is not processed correctly, use its image
|
|
2505
|
+
if (
|
|
2506
|
+
item.text == ""
|
|
2507
|
+
and item.orig != ""
|
|
2508
|
+
and image_mode == ImageRefMode.EMBEDDED
|
|
2509
|
+
and len(item.prov) > 0
|
|
2510
|
+
):
|
|
2511
|
+
text = _image_fallback(item)
|
|
2512
|
+
|
|
2506
2513
|
# Building a math equation in MathML format
|
|
2507
2514
|
# ref https://www.w3.org/TR/wai-aria-1.1/#math
|
|
2508
2515
|
elif formula_to_mathml:
|
|
2509
|
-
|
|
2510
|
-
|
|
2511
|
-
|
|
2512
|
-
|
|
2513
|
-
|
|
2514
|
-
|
|
2515
|
-
|
|
2516
|
-
|
|
2517
|
-
|
|
2516
|
+
try:
|
|
2517
|
+
mathml_element = latex2mathml.converter.convert_to_element(
|
|
2518
|
+
math_formula, display="block"
|
|
2519
|
+
)
|
|
2520
|
+
annotation = SubElement(
|
|
2521
|
+
mathml_element, "annotation", dict(encoding="TeX")
|
|
2522
|
+
)
|
|
2523
|
+
annotation.text = math_formula
|
|
2524
|
+
mathml = unescape(tostring(mathml_element, encoding="unicode"))
|
|
2525
|
+
text = f"<div>{mathml}</div>"
|
|
2526
|
+
except Exception as err:
|
|
2527
|
+
_logger.warning(
|
|
2528
|
+
"Malformed formula cannot be rendered. "
|
|
2529
|
+
f"Error {err.__class__.__name__}, formula={math_formula}"
|
|
2530
|
+
)
|
|
2531
|
+
if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
|
|
2532
|
+
text = _image_fallback(item)
|
|
2533
|
+
else:
|
|
2534
|
+
text = f"<pre>{math_formula}</pre>"
|
|
2518
2535
|
|
|
2519
2536
|
elif math_formula != "":
|
|
2520
2537
|
text = f"<pre>{math_formula}</pre>"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.0 → docling_core-2.17.1}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.0 → docling_core-2.17.1}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.0 → docling_core-2.17.1}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.17.0 → docling_core-2.17.1}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|