docling-core 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/__init__.py +2 -8
- docling_core/transforms/chunker/base.py +27 -40
- docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
- docling_core/types/__init__.py +3 -18
- docling_core/types/doc/__init__.py +25 -0
- docling_core/types/doc/base.py +136 -451
- docling_core/types/doc/document.py +1289 -559
- docling_core/types/{experimental → doc}/labels.py +4 -1
- docling_core/types/legacy_doc/__init__.py +6 -0
- docling_core/types/legacy_doc/base.py +485 -0
- docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
- docling_core/types/legacy_doc/document.py +715 -0
- docling_core/types/rec/subject.py +1 -1
- docling_core/utils/generate_docs.py +82 -0
- docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
- docling_core/utils/validators.py +3 -3
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/METADATA +17 -17
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/RECORD +24 -31
- docling_core-2.0.1.dist-info/entry_points.txt +5 -0
- docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core/transforms/id_generator/base.py +0 -30
- docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core/types/experimental/__init__.py +0 -30
- docling_core/types/experimental/base.py +0 -167
- docling_core/types/experimental/document.py +0 -1192
- docling_core/utils/ds_generate_docs.py +0 -144
- docling_core-1.7.2.dist-info/entry_points.txt +0 -5
- /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/LICENSE +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/WHEEL +0 -0
|
@@ -1,144 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""Generate documentation of Docling types in HTML and Markdown.
|
|
7
|
-
|
|
8
|
-
Example:
|
|
9
|
-
python docling_core/utils/ds_generate_docs.py /tmp/docling_core_files
|
|
10
|
-
"""
|
|
11
|
-
import argparse
|
|
12
|
-
import glob
|
|
13
|
-
import json
|
|
14
|
-
import os
|
|
15
|
-
from argparse import BooleanOptionalAction
|
|
16
|
-
from pathlib import Path
|
|
17
|
-
from shutil import rmtree
|
|
18
|
-
from typing import Final
|
|
19
|
-
|
|
20
|
-
from json_schema_for_humans.generate import generate_from_filename
|
|
21
|
-
from json_schema_for_humans.generation_configuration import GenerationConfiguration
|
|
22
|
-
|
|
23
|
-
from docling_core.utils.ds_generate_jsonschema import generate_json_schema
|
|
24
|
-
|
|
25
|
-
# Model names handed to generate_json_schema(); one "<name>.json" schema file
# is written per entry by generate_collection_jsonschema().
MODELS: Final = ["Document", "Record", "Generic"]
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def _prepare_directory(folder: str, clean: bool = False) -> None:
    """Create a directory or empty its content if it already exists.

    Args:
        folder: The name of the directory.
        clean: Whether any existing content in the directory should be removed.
    """
    if not os.path.isdir(folder):
        os.makedirs(folder, exist_ok=True)
        return
    if not clean:
        return

    # Snapshot the direct children before deleting anything, so the directory
    # tree is never modified while it is still being traversed.
    for entry in list(Path(folder).iterdir()):
        if entry.is_dir() and not entry.is_symlink():
            rmtree(entry)
        else:
            # Regular files and symbolic links. rmtree() refuses to operate on
            # a symlink, so links are unlinked and their targets left intact.
            entry.unlink()
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def generate_collection_jsonschema(folder: str):
    """Export the JSON schema of each Docling collection to a folder.

    For every name listed in ``MODELS`` a file ``<name>.json`` is written into
    the given directory.

    Args:
        folder: The name of the directory.
    """
    for model_name in MODELS:
        schema = generate_json_schema(model_name)
        target = os.path.join(folder, f"{model_name}.json")
        with open(target, mode="w", encoding="utf8") as stream:
            json.dump(schema, stream, ensure_ascii=False, indent=2)
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def generate_collection_html(folder: str):
    """Render an HTML page for every JSON schema file found in a folder.

    Each ``*.json`` file in the directory is passed to
    ``generate_from_filename`` and the resulting page is written next to it,
    with the ``.json`` suffix replaced by ``.html``.

    Args:
        folder: The name of the directory.
    """
    html_config = GenerationConfiguration(
        template_name="js_offline",
        expand_buttons=True,
        link_to_reused_ref=False,
        with_footer=False,
    )

    schema_pattern = os.path.join(folder, "*.json")
    for schema_file in glob.glob(schema_pattern):
        html_file = f"{schema_file.removesuffix('.json')}.html"
        generate_from_filename(schema_file, html_file, config=html_config)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def generate_collection_markdown(folder: str):
    """Render a Markdown page for every JSON schema file found in a folder.

    Each ``*.json`` file in the directory is passed to
    ``generate_from_filename`` and the resulting page is written next to it,
    with the ``.json`` suffix replaced by ``.md``.

    Args:
        folder: The name of the directory.
    """
    markdown_config = GenerationConfiguration(
        template_name="md_nested",
        expand_buttons=True,
        link_to_reused_ref=False,
        with_footer=False,
        show_toc=False,
    )

    schema_pattern = os.path.join(folder, "*.json")
    for schema_file in glob.glob(schema_pattern):
        markdown_file = f"{schema_file.removesuffix('.json')}.md"
        generate_from_filename(schema_file, markdown_file, config=markdown_config)
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
def main() -> None:
    """Generate the JSON Schema of Docling collections and export documentation.

    Command-line arguments:
        directory: Output directory; it is created when it does not exist.
        --clean / --no-clean: Whether existing content of the directory is
            removed before generating files (default: content is kept).
        --template: Documentation format, ``html`` or ``markdown`` (default).
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "directory",
        help=(
            "Directory to generate files. If it exists, its content is only"
            " removed when --clean is given."
        ),
    )
    argparser.add_argument(
        "--clean",
        help="Whether any existing content in directory should be removed.",
        action=BooleanOptionalAction,
        default=False,
    )
    argparser.add_argument(
        "--template",
        default="markdown",
        choices=["html", "markdown"],
        help="Documentation template.",
    )
    args = argparser.parse_args()

    _prepare_directory(args.directory, args.clean)

    # The schemas are always exported first: both documentation generators
    # read the *.json files from the same directory.
    generate_collection_jsonschema(args.directory)

    if args.template == "html":
        generate_collection_html(args.directory)
    elif args.template == "markdown":
        generate_collection_markdown(args.directory)


if __name__ == "__main__":
    main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|