docling-core 1.7.2__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (36)
  1. docling_core/transforms/chunker/__init__.py +2 -8
  2. docling_core/transforms/chunker/base.py +27 -40
  3. docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
  4. docling_core/types/__init__.py +12 -8
  5. docling_core/types/doc/__init__.py +25 -0
  6. docling_core/types/doc/base.py +136 -451
  7. docling_core/types/doc/document.py +1288 -559
  8. docling_core/types/{experimental → doc}/labels.py +4 -1
  9. docling_core/types/legacy_doc/__init__.py +6 -0
  10. docling_core/types/legacy_doc/base.py +485 -0
  11. docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
  12. docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
  13. docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
  14. docling_core/types/legacy_doc/document.py +715 -0
  15. docling_core/types/rec/subject.py +1 -1
  16. docling_core/utils/generate_docs.py +82 -0
  17. docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
  18. docling_core/utils/validators.py +3 -3
  19. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/METADATA +10 -10
  20. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
  21. docling_core-2.0.0.dist-info/entry_points.txt +5 -0
  22. docling_core/transforms/id_generator/__init__.py +0 -12
  23. docling_core/transforms/id_generator/base.py +0 -30
  24. docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  25. docling_core/transforms/id_generator/uuid_generator.py +0 -34
  26. docling_core/transforms/metadata_extractor/__init__.py +0 -13
  27. docling_core/transforms/metadata_extractor/base.py +0 -59
  28. docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  29. docling_core/types/experimental/__init__.py +0 -30
  30. docling_core/types/experimental/base.py +0 -167
  31. docling_core/types/experimental/document.py +0 -1192
  32. docling_core/utils/ds_generate_docs.py +0 -144
  33. docling_core-1.7.2.dist-info/entry_points.txt +0 -5
  34. docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
  35. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
  36. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
@@ -1,144 +0,0 @@
1
- #
2
- # Copyright IBM Corp. 2024 - 2024
3
- # SPDX-License-Identifier: MIT
4
- #
5
-
6
- """Generate documentation of Docling types in HTML and Markdown.
7
-
8
- Example:
9
- python docling_core/utils/ds_generate_docs.py /tmp/docling_core_files
10
- """
11
- import argparse
12
- import glob
13
- import json
14
- import os
15
- from argparse import BooleanOptionalAction
16
- from pathlib import Path
17
- from shutil import rmtree
18
- from typing import Final
19
-
20
- from json_schema_for_humans.generate import generate_from_filename
21
- from json_schema_for_humans.generation_configuration import GenerationConfiguration
22
-
23
- from docling_core.utils.ds_generate_jsonschema import generate_json_schema
24
-
25
- MODELS: Final = ["Document", "Record", "Generic"]
26
-
27
-
28
- def _prepare_directory(folder: str, clean: bool = False) -> None:
29
- """Create a directory or empty its content if it already exists.
30
-
31
- Args:
32
- folder: The name of the directory.
33
- clean: Whether any existing content in the directory should be removed.
34
- """
35
- if os.path.isdir(folder):
36
- if clean:
37
- for path in Path(folder).glob("**/*"):
38
- if path.is_file():
39
- path.unlink()
40
- elif path.is_dir():
41
- rmtree(path)
42
- else:
43
- os.makedirs(folder, exist_ok=True)
44
-
45
-
46
- def generate_collection_jsonschema(folder: str):
47
- """Generate the JSON schema of Docling collections and export them to a folder.
48
-
49
- Args:
50
- folder: The name of the directory.
51
- """
52
- for item in MODELS:
53
- json_schema = generate_json_schema(item)
54
- with open(
55
- os.path.join(folder, f"{item}.json"), mode="w", encoding="utf8"
56
- ) as json_file:
57
- json.dump(json_schema, json_file, ensure_ascii=False, indent=2)
58
-
59
-
60
- def generate_collection_html(folder: str):
61
- """Generate HTML pages documenting the data model of Docling collections.
62
-
63
- The JSON schemas files need to be in a folder and the generated HTML pages will be
64
- written in the same folder.
65
-
66
- Args:
67
- folder: The name of the directory.
68
- """
69
- config = GenerationConfiguration(
70
- template_name="js_offline",
71
- expand_buttons=True,
72
- link_to_reused_ref=False,
73
- with_footer=False,
74
- )
75
-
76
- for doc_json in glob.glob(os.path.join(folder, "*.json")):
77
- doc_html = doc_json.removesuffix(".json") + ".html"
78
- generate_from_filename(doc_json, doc_html, config=config)
79
-
80
-
81
- def generate_collection_markdown(folder: str):
82
- """Generate Markdown pages documenting the data model of Docling collections.
83
-
84
- The JSON schemas files need to be in a folder and the generated markdown pages will
85
- be written in the same folder.
86
-
87
- Args:
88
- folder: The name of the directory.
89
- """
90
- config = GenerationConfiguration(
91
- template_name="md_nested",
92
- expand_buttons=True,
93
- link_to_reused_ref=False,
94
- with_footer=False,
95
- show_toc=False,
96
- )
97
-
98
- for doc_json in glob.glob(os.path.join(folder, "*.json")):
99
- doc_html = doc_json.removesuffix(".json") + ".md"
100
- generate_from_filename(doc_json, doc_html, config=config)
101
-
102
-
103
- def main() -> None:
104
- """Generate the JSON Schema of Docling collections and export documentation."""
105
- argparser = argparse.ArgumentParser()
106
- argparser.add_argument(
107
- "directory",
108
- help=(
109
- "Directory to generate files. If it exists, any existing content will be"
110
- " removed."
111
- ),
112
- )
113
- argparser.add_argument(
114
- "--clean",
115
- help="Whether any existing content in directory should be removed.",
116
- action=BooleanOptionalAction,
117
- dest="clean",
118
- default=False,
119
- required=False,
120
- )
121
- argparser.add_argument(
122
- "--template",
123
- action="store",
124
- default="markdown",
125
- choices=["html", "markdown"],
126
- type=str,
127
- required=False,
128
- dest="template",
129
- help="Documentation template.",
130
- )
131
- args = argparser.parse_args()
132
-
133
- _prepare_directory(args.directory, args.clean)
134
-
135
- generate_collection_jsonschema(args.directory)
136
-
137
- if args.template == "html":
138
- generate_collection_html(args.directory)
139
- elif args.template == "markdown":
140
- generate_collection_markdown(args.directory)
141
-
142
-
143
- if __name__ == "__main__":
144
- main()
@@ -1,5 +0,0 @@
1
- [console_scripts]
2
- ds_generate_docs=docling_core.utils.ds_generate_docs:main
3
- ds_generate_jsonschema=docling_core.utils.ds_generate_jsonschema:main
4
- validate=docling_core.utils.validate:main
5
-
File without changes