docling-core 2.11.0__tar.gz → 2.12.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.11.0 → docling_core-2.12.1}/PKG-INFO +1 -1
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/cli/view.py +1 -1
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/document.py +9 -6
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/labels.py +1 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/document.py +13 -8
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/validate.py +1 -1
- {docling_core-2.11.0 → docling_core-2.12.1}/pyproject.toml +1 -1
- {docling_core-2.11.0 → docling_core-2.12.1}/LICENSE +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/README.md +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/py.typed +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/mapping.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/meta.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/package.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/base.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/alias.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/file.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.11.0 → docling_core-2.12.1}/docling_core/utils/validators.py +0 -0
|
@@ -57,7 +57,7 @@ def view(
|
|
|
57
57
|
doc = DoclingDocument.load_from_json(filename=path)
|
|
58
58
|
target_path = Path(tempfile.mkdtemp()) / "out.html"
|
|
59
59
|
html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
|
|
60
|
-
with open(target_path, "w") as f:
|
|
60
|
+
with open(target_path, "w", encoding="utf-8") as f:
|
|
61
61
|
f.write(html_output)
|
|
62
62
|
webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
|
|
63
63
|
|
|
@@ -1884,7 +1884,7 @@ class DoclingDocument(BaseModel):
|
|
|
1884
1884
|
)
|
|
1885
1885
|
|
|
1886
1886
|
out = new_doc.export_to_dict()
|
|
1887
|
-
with open(filename, "w") as fw:
|
|
1887
|
+
with open(filename, "w", encoding="utf-8") as fw:
|
|
1888
1888
|
json.dump(out, fw, indent=indent)
|
|
1889
1889
|
|
|
1890
1890
|
@classmethod
|
|
@@ -1898,7 +1898,7 @@ class DoclingDocument(BaseModel):
|
|
|
1898
1898
|
:rtype: DoclingDocument
|
|
1899
1899
|
|
|
1900
1900
|
"""
|
|
1901
|
-
with open(filename, "r") as f:
|
|
1901
|
+
with open(filename, "r", encoding="utf-8") as f:
|
|
1902
1902
|
return cls.model_validate_json(f.read())
|
|
1903
1903
|
|
|
1904
1904
|
def save_as_yaml(
|
|
@@ -1919,7 +1919,7 @@ class DoclingDocument(BaseModel):
|
|
|
1919
1919
|
)
|
|
1920
1920
|
|
|
1921
1921
|
out = new_doc.export_to_dict()
|
|
1922
|
-
with open(filename, "w") as fw:
|
|
1922
|
+
with open(filename, "w", encoding="utf-8") as fw:
|
|
1923
1923
|
yaml.dump(out, fw, default_flow_style=default_flow_style)
|
|
1924
1924
|
|
|
1925
1925
|
def export_to_dict(
|
|
@@ -1971,7 +1971,7 @@ class DoclingDocument(BaseModel):
|
|
|
1971
1971
|
page_no=page_no,
|
|
1972
1972
|
)
|
|
1973
1973
|
|
|
1974
|
-
with open(filename, "w") as fw:
|
|
1974
|
+
with open(filename, "w", encoding="utf-8") as fw:
|
|
1975
1975
|
fw.write(md_out)
|
|
1976
1976
|
|
|
1977
1977
|
def export_to_markdown( # noqa: C901
|
|
@@ -2038,6 +2038,9 @@ class DoclingDocument(BaseModel):
|
|
|
2038
2038
|
if ix < from_element or to_element <= ix:
|
|
2039
2039
|
continue # skip as many items as you want
|
|
2040
2040
|
|
|
2041
|
+
if (isinstance(item, DocItem)) and (item.label not in labels):
|
|
2042
|
+
continue # skip any label that is not whitelisted
|
|
2043
|
+
|
|
2041
2044
|
# Handle newlines between different types of content
|
|
2042
2045
|
if (
|
|
2043
2046
|
len(mdtexts) > 0
|
|
@@ -2224,7 +2227,7 @@ class DoclingDocument(BaseModel):
|
|
|
2224
2227
|
html_head=html_head,
|
|
2225
2228
|
)
|
|
2226
2229
|
|
|
2227
|
-
with open(filename, "w") as fw:
|
|
2230
|
+
with open(filename, "w", encoding="utf-8") as fw:
|
|
2228
2231
|
fw.write(html_out)
|
|
2229
2232
|
|
|
2230
2233
|
def _get_output_paths(
|
|
@@ -2462,7 +2465,7 @@ class DoclingDocument(BaseModel):
|
|
|
2462
2465
|
with_groups=with_groups,
|
|
2463
2466
|
)
|
|
2464
2467
|
|
|
2465
|
-
with open(filename, "w") as fw:
|
|
2468
|
+
with open(filename, "w", encoding="utf-8") as fw:
|
|
2466
2469
|
fw.write(out)
|
|
2467
2470
|
|
|
2468
2471
|
def export_to_document_tokens(
|
|
@@ -550,17 +550,18 @@ class ExportedCCSDocument(
|
|
|
550
550
|
|
|
551
551
|
elif (
|
|
552
552
|
isinstance(item, Table)
|
|
553
|
-
and item.data
|
|
553
|
+
and (item.data or item.text)
|
|
554
554
|
and item_type in main_text_labels
|
|
555
555
|
):
|
|
556
556
|
|
|
557
557
|
md_table = ""
|
|
558
558
|
table = []
|
|
559
|
-
for row in item.data:
|
|
560
|
-
tmp = []
|
|
561
|
-
for col in row:
|
|
562
|
-
tmp.append(col.text)
|
|
563
|
-
table.append(tmp)
|
|
559
|
+
if item.data is not None:
|
|
560
|
+
for row in item.data:
|
|
561
|
+
tmp = []
|
|
562
|
+
for col in row:
|
|
563
|
+
tmp.append(col.text)
|
|
564
|
+
table.append(tmp)
|
|
564
565
|
|
|
565
566
|
if len(table) > 1 and len(table[0]) > 0:
|
|
566
567
|
try:
|
|
@@ -579,7 +580,9 @@ class ExportedCCSDocument(
|
|
|
579
580
|
if item.text:
|
|
580
581
|
markdown_text = item.text
|
|
581
582
|
if not strict_text:
|
|
582
|
-
markdown_text += "\n\n" + md_table
|
|
583
|
+
markdown_text += (
|
|
584
|
+
"\n\n" if len(markdown_text) > 0 else ""
|
|
585
|
+
) + md_table
|
|
583
586
|
|
|
584
587
|
elif isinstance(item, Figure) and item_type in main_text_labels:
|
|
585
588
|
|
|
@@ -587,7 +590,9 @@ class ExportedCCSDocument(
|
|
|
587
590
|
if item.text:
|
|
588
591
|
markdown_text = item.text
|
|
589
592
|
if not strict_text:
|
|
590
|
-
markdown_text += "\n" + image_placeholder
|
|
593
|
+
markdown_text += (
|
|
594
|
+
"\n" if len(markdown_text) > 0 else ""
|
|
595
|
+
) + image_placeholder
|
|
591
596
|
|
|
592
597
|
if markdown_text:
|
|
593
598
|
md_texts.append(markdown_text)
|
|
@@ -38,7 +38,7 @@ def run():
|
|
|
38
38
|
"""Run the validation of a file containing a Document."""
|
|
39
39
|
file_format, input_file = parse_arguments()
|
|
40
40
|
|
|
41
|
-
with open(input_file, "r") as fd:
|
|
41
|
+
with open(input_file, "r", encoding="utf-8") as fd:
|
|
42
42
|
file_ = json.load(fd)
|
|
43
43
|
|
|
44
44
|
result = (False, "Empty result")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.11.0 → docling_core-2.12.1}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.11.0 → docling_core-2.12.1}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.11.0 → docling_core-2.12.1}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|