docling-core 1.7.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/__init__.py +2 -8
- docling_core/transforms/chunker/base.py +27 -40
- docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
- docling_core/types/__init__.py +12 -8
- docling_core/types/doc/__init__.py +25 -0
- docling_core/types/doc/base.py +136 -451
- docling_core/types/doc/document.py +1288 -559
- docling_core/types/{experimental → doc}/labels.py +4 -1
- docling_core/types/legacy_doc/__init__.py +6 -0
- docling_core/types/legacy_doc/base.py +485 -0
- docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
- docling_core/types/legacy_doc/document.py +715 -0
- docling_core/types/rec/subject.py +1 -1
- docling_core/utils/generate_docs.py +82 -0
- docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
- docling_core/utils/validators.py +3 -3
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/METADATA +10 -10
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
- docling_core-2.0.0.dist-info/entry_points.txt +5 -0
- docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core/transforms/id_generator/base.py +0 -30
- docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core/types/experimental/__init__.py +0 -30
- docling_core/types/experimental/base.py +0 -167
- docling_core/types/experimental/document.py +0 -1192
- docling_core/utils/ds_generate_docs.py +0 -144
- docling_core-1.7.2.dist-info/entry_points.txt +0 -5
- /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Generate documentation of Docling types as JSON schema.
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
python docling_core/utils/generate_docs.py /tmp/docling_core_files
|
|
10
|
+
"""
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
from argparse import BooleanOptionalAction
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from shutil import rmtree
|
|
17
|
+
from typing import Final
|
|
18
|
+
|
|
19
|
+
from docling_core.utils.generate_jsonschema import generate_json_schema
|
|
20
|
+
|
|
21
|
+
MODELS: Final = ["Document", "Record", "Generic"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _prepare_directory(folder: str, clean: bool = False) -> None:
|
|
25
|
+
"""Create a directory or empty its content if it already exists.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
folder: The name of the directory.
|
|
29
|
+
clean: Whether any existing content in the directory should be removed.
|
|
30
|
+
"""
|
|
31
|
+
if os.path.isdir(folder):
|
|
32
|
+
if clean:
|
|
33
|
+
for path in Path(folder).glob("**/*"):
|
|
34
|
+
if path.is_file():
|
|
35
|
+
path.unlink()
|
|
36
|
+
elif path.is_dir():
|
|
37
|
+
rmtree(path)
|
|
38
|
+
else:
|
|
39
|
+
os.makedirs(folder, exist_ok=True)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def generate_collection_jsonschema(folder: str):
|
|
43
|
+
"""Generate the JSON schema of Docling collections and export them to a folder.
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
folder: The name of the directory.
|
|
47
|
+
"""
|
|
48
|
+
for item in MODELS:
|
|
49
|
+
json_schema = generate_json_schema(item)
|
|
50
|
+
with open(
|
|
51
|
+
os.path.join(folder, f"{item}.json"), mode="w", encoding="utf8"
|
|
52
|
+
) as json_file:
|
|
53
|
+
json.dump(json_schema, json_file, ensure_ascii=False, indent=2)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def main() -> None:
|
|
57
|
+
"""Generate the JSON Schema of Docling collections and export documentation."""
|
|
58
|
+
argparser = argparse.ArgumentParser()
|
|
59
|
+
argparser.add_argument(
|
|
60
|
+
"directory",
|
|
61
|
+
help=(
|
|
62
|
+
"Directory to generate files. If it exists, any existing content will be"
|
|
63
|
+
" removed."
|
|
64
|
+
),
|
|
65
|
+
)
|
|
66
|
+
argparser.add_argument(
|
|
67
|
+
"--clean",
|
|
68
|
+
help="Whether any existing content in directory should be removed.",
|
|
69
|
+
action=BooleanOptionalAction,
|
|
70
|
+
dest="clean",
|
|
71
|
+
default=False,
|
|
72
|
+
required=False,
|
|
73
|
+
)
|
|
74
|
+
args = argparser.parse_args()
|
|
75
|
+
|
|
76
|
+
_prepare_directory(args.directory, args.clean)
|
|
77
|
+
|
|
78
|
+
generate_collection_jsonschema(args.directory)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
|
|
82
|
+
main()
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"""Generate the JSON Schema of pydantic models and export them to files.
|
|
7
7
|
|
|
8
8
|
Example:
|
|
9
|
-
python docling_core/utils/
|
|
9
|
+
python docling_core/utils/generate_jsonschema.py doc.document.TableCell
|
|
10
10
|
|
|
11
11
|
"""
|
|
12
12
|
import argparse
|
|
@@ -27,10 +27,10 @@ def _import_class(class_reference: str) -> Any:
|
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
def generate_json_schema(class_reference: str) -> Union[dict, None]:
|
|
30
|
-
"""Generate a jsonable dict of a model's schema from
|
|
30
|
+
"""Generate a jsonable dict of a model's schema from a data type.
|
|
31
31
|
|
|
32
32
|
Args:
|
|
33
|
-
class_reference: The reference to a class in '
|
|
33
|
+
class_reference: The reference to a class in 'docling_core.types'.
|
|
34
34
|
|
|
35
35
|
Returns:
|
|
36
36
|
A jsonable dict of the model's schema.
|
|
@@ -48,7 +48,7 @@ def main() -> None:
|
|
|
48
48
|
"""Print the JSON Schema of a model."""
|
|
49
49
|
argparser = argparse.ArgumentParser()
|
|
50
50
|
argparser.add_argument(
|
|
51
|
-
"class_ref", help="Class reference, e.g., doc.
|
|
51
|
+
"class_ref", help="Class reference, e.g., doc.document.TableCell"
|
|
52
52
|
)
|
|
53
53
|
args = argparser.parse_args()
|
|
54
54
|
|
docling_core/utils/validators.py
CHANGED
|
@@ -38,7 +38,7 @@ def validate_raw_schema(file_: dict) -> tuple[bool, str]:
|
|
|
38
38
|
|
|
39
39
|
schema_txt = (
|
|
40
40
|
resources.files("docling_core")
|
|
41
|
-
.joinpath("resources/schemas/
|
|
41
|
+
.joinpath("resources/schemas/legacy_doc/RAW.json")
|
|
42
42
|
.read_text("utf-8")
|
|
43
43
|
)
|
|
44
44
|
schema = json.loads(schema_txt)
|
|
@@ -52,7 +52,7 @@ def validate_ann_schema(file_: dict) -> tuple[bool, str]:
|
|
|
52
52
|
|
|
53
53
|
schema_txt = (
|
|
54
54
|
resources.files("docling_core")
|
|
55
|
-
.joinpath("resources/schemas/
|
|
55
|
+
.joinpath("resources/schemas/legacy_doc/ANN.json")
|
|
56
56
|
.read_text("utf-8")
|
|
57
57
|
)
|
|
58
58
|
schema = json.loads(schema_txt)
|
|
@@ -66,7 +66,7 @@ def validate_ocr_schema(file_: dict) -> tuple[bool, str]:
|
|
|
66
66
|
|
|
67
67
|
schema_txt = (
|
|
68
68
|
resources.files("docling_core")
|
|
69
|
-
.joinpath("resources/schemas/
|
|
69
|
+
.joinpath("resources/schemas/legacy_doc/OCR-output.json")
|
|
70
70
|
.read_text("utf-8")
|
|
71
71
|
)
|
|
72
72
|
schema = json.loads(schema_txt)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2.0.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -25,10 +25,10 @@ Classifier: Topic :: Database
|
|
|
25
25
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
27
|
Classifier: Typing :: Typed
|
|
28
|
-
Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
|
|
29
28
|
Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
30
29
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
31
30
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
31
|
+
Requires-Dist: pillow (>=10.3.0,<11.0.0)
|
|
32
32
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0)
|
|
33
33
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
34
34
|
Project-URL: Repository, https://github.com/DS4SD/docling-core
|
|
@@ -37,7 +37,7 @@ Description-Content-Type: text/markdown
|
|
|
37
37
|
# Docling Core
|
|
38
38
|
|
|
39
39
|
[](https://pypi.org/project/docling-core/)
|
|
40
|
-

|
|
40
|
+

|
|
41
41
|
[](https://python-poetry.org/)
|
|
42
42
|
[](https://github.com/psf/black)
|
|
43
43
|
[](https://pycqa.github.io/isort/)
|
|
@@ -57,7 +57,7 @@ pip install docling-core
|
|
|
57
57
|
|
|
58
58
|
### Development setup
|
|
59
59
|
|
|
60
|
-
To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
|
60
|
+
To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
|
|
61
61
|
```bash
|
|
62
62
|
poetry install
|
|
63
63
|
```
|
|
@@ -81,14 +81,14 @@ poetry run pytest test
|
|
|
81
81
|
Document.model_validate_json(data_str)
|
|
82
82
|
```
|
|
83
83
|
|
|
84
|
-
- You can generate the JSON schema of a model with the script `
|
|
84
|
+
- You can generate the JSON schema of a model with the script `generate_jsonschema`.
|
|
85
85
|
|
|
86
86
|
```py
|
|
87
87
|
# for the `Document` type
|
|
88
|
-
|
|
88
|
+
generate_jsonschema Document
|
|
89
89
|
|
|
90
90
|
# for the use `Record` type
|
|
91
|
-
|
|
91
|
+
generate_jsonschema Record
|
|
92
92
|
```
|
|
93
93
|
|
|
94
94
|
## Documentation
|
|
@@ -97,12 +97,12 @@ Docling supports 3 main data types:
|
|
|
97
97
|
|
|
98
98
|
- **Document** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
|
|
99
99
|
The Document type also models the metadata that may be attached to the converted document.
|
|
100
|
-
Check [Document](docs/Document.
|
|
100
|
+
Check [Document](docs/Document.json) for the full JSON schema.
|
|
101
101
|
- **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
|
|
102
102
|
Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
|
|
103
|
-
Check [Record](docs/Record.
|
|
103
|
+
Check [Record](docs/Record.json) for the full JSON schema.
|
|
104
104
|
- **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
|
|
105
|
-
Check [Generic](docs/Generic.
|
|
105
|
+
Check [Generic](docs/Generic.json) for the full JSON schema.
|
|
106
106
|
|
|
107
107
|
The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
|
|
108
108
|
|
|
@@ -14,31 +14,24 @@ docling_core/search/mapping.py,sha256=6rqG7LgYSeWmooKNEcRa5gFDLp1ZdzPqDGlwTA5gpO
|
|
|
14
14
|
docling_core/search/meta.py,sha256=wSurrsqdP1N3gQKx027fVdzVmc33a7Y6rPl-FClQvtA,3318
|
|
15
15
|
docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75A,1841
|
|
16
16
|
docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
|
|
17
|
-
docling_core/transforms/chunker/__init__.py,sha256=
|
|
18
|
-
docling_core/transforms/chunker/base.py,sha256=
|
|
19
|
-
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=
|
|
20
|
-
docling_core/
|
|
21
|
-
docling_core/transforms/id_generator/base.py,sha256=SufPsaZUfMpuITq7pMv5YtlLmtGTDgA4LWmjmhQuSM0,704
|
|
22
|
-
docling_core/transforms/id_generator/doc_hash_id_generator.py,sha256=SUw4FBhMZtbWCfc7oMucSwYvJTXqIPMn3yCXPRxtPCI,656
|
|
23
|
-
docling_core/transforms/id_generator/uuid_generator.py,sha256=t8Bky_1JQB9myX-PJGWvW_c4-NvtHPHab6b1NdS-bpU,929
|
|
24
|
-
docling_core/transforms/metadata_extractor/__init__.py,sha256=q_eAUcbaToEuYUPco4uiBO8vgTGSmZUC-r0mS7KbWh8,335
|
|
25
|
-
docling_core/transforms/metadata_extractor/base.py,sha256=7h_S6-buCVtvAvKQKLISjDqFV8D3brewiQ-geqlUriI,1467
|
|
26
|
-
docling_core/transforms/metadata_extractor/simple_metadata_extractor.py,sha256=ZRjDdXgFe8jPBNC_0ruJjQanabpkxceVsCJVVWVWlIg,1629
|
|
27
|
-
docling_core/types/__init__.py,sha256=6mrAEKRW85uHJwNQBufwjPcMWCjm3oocA6MaO4_NLgg,805
|
|
17
|
+
docling_core/transforms/chunker/__init__.py,sha256=cSY_2L6EpR0lkPSDgt_ikjVoQpgIAhofvBfvfR3w_1Y,270
|
|
18
|
+
docling_core/transforms/chunker/base.py,sha256=uPNj6NHUl394Uh6wf01vmro4i3Ez4WUlV5ljfp85EM4,1565
|
|
19
|
+
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=tKJnaKhdAAOkwRu4WOoHISo7qgBx_4T3YNS4nPB_iqc,6390
|
|
20
|
+
docling_core/types/__init__.py,sha256=x_IgCOzOr5TiOVrKNPjWIJjLkDqkyMwNSpDMkGPg2fg,872
|
|
28
21
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
29
|
-
docling_core/types/doc/__init__.py,sha256=
|
|
30
|
-
docling_core/types/doc/base.py,sha256=
|
|
31
|
-
docling_core/types/doc/
|
|
32
|
-
docling_core/types/doc/
|
|
33
|
-
docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
|
|
34
|
-
docling_core/types/doc/document.py,sha256=AKp1kOo0tncf9FX3q7qRWQ2Jz_hZE44smZpyrtsRzY4,24104
|
|
35
|
-
docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
|
|
36
|
-
docling_core/types/experimental/__init__.py,sha256=mpqa2soTcHHEKqkcSeYBbAHepg0OgVZNReKvPmGz2r4,587
|
|
37
|
-
docling_core/types/experimental/base.py,sha256=k04zvzNI7qo4HfKxLPCePKxCnerzXd582gvrVjF25SI,4225
|
|
38
|
-
docling_core/types/experimental/document.py,sha256=X3z4sjRmWytRbEmSCnKat4D9sYxSV7Olm1YNmG3c5Kg,37874
|
|
39
|
-
docling_core/types/experimental/labels.py,sha256=tpmvpmJuQyYMLhxAvJSVuFhDRh_zQNiP1WrQmNXKQzo,1224
|
|
22
|
+
docling_core/types/doc/__init__.py,sha256=_6QvDWO_AV0iHx72PpDb6XLZTlA7KYQhfL80xGiCq70,625
|
|
23
|
+
docling_core/types/doc/base.py,sha256=tNEXzxe2ihduCezYTUy_jNKMs0RJ6hBS79epYwyc2QY,4326
|
|
24
|
+
docling_core/types/doc/document.py,sha256=hJxpmQOxfTQDL78lNp1LT-ap3uRcxFHKRXj7swscCGo,45403
|
|
25
|
+
docling_core/types/doc/labels.py,sha256=mzmSd072A-qW3IThswHxwIHV8IoyTCbHHlNOrisinRA,1335
|
|
40
26
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
41
27
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
28
|
+
docling_core/types/legacy_doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
|
|
29
|
+
docling_core/types/legacy_doc/base.py,sha256=l8NKCuORUQ1ebjdGWpj6b30oQEvtErLsIHKQHbbJiPg,14683
|
|
30
|
+
docling_core/types/legacy_doc/doc_ann.py,sha256=CIQHW8yzu70bsMR9gtu7dqe4oz603Tq2eDDt9sh-tYo,1203
|
|
31
|
+
docling_core/types/legacy_doc/doc_ocr.py,sha256=FfFqHAyMSbFt5cKeE7QLcxS0qUweBilBJoN9CH2TsQs,1394
|
|
32
|
+
docling_core/types/legacy_doc/doc_raw.py,sha256=LrvQ9DhNjBRy98p_F9PUyHZeTGAxMKWqJzY4WJ7v-xs,3895
|
|
33
|
+
docling_core/types/legacy_doc/document.py,sha256=A_cTYOjx6pNIICpOUm09YsfwPrIGDEZTKdetb2fx4PM,24273
|
|
34
|
+
docling_core/types/legacy_doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
|
|
42
35
|
docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
|
|
43
36
|
docling_core/types/nlp/qa.py,sha256=TyZjubqkEoREv0YzmuLKlq4WW_TnJNj7BoBY1_r2a1E,2731
|
|
44
37
|
docling_core/types/nlp/qa_labels.py,sha256=YLW2SYM9M1riktCUYctsg83Msb988NV2I754w4ibWzA,5880
|
|
@@ -48,16 +41,16 @@ docling_core/types/rec/base.py,sha256=jhTfInNGyB9NUw7o33PElrFGL80TqhU8MLcLZNZYj3
|
|
|
48
41
|
docling_core/types/rec/predicate.py,sha256=4iDwXl9c4jzHTDIlRNE88yvDzKA9_od0xjPUUUP5IjI,3959
|
|
49
42
|
docling_core/types/rec/record.py,sha256=r1QgPepwH3YjmMHlwwmeK00ZHEJnAsvyOMeXFY_D9_Q,2750
|
|
50
43
|
docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiKRdCZ5o,1701
|
|
51
|
-
docling_core/types/rec/subject.py,sha256=
|
|
44
|
+
docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
|
|
52
45
|
docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
|
|
53
46
|
docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
|
|
54
|
-
docling_core/utils/ds_generate_docs.py,sha256=0xGBagdC_PGjyeHXYZo90VnVrSTMZgHb0SYhFa6X7bQ,4248
|
|
55
|
-
docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvGqg54qBIvUMHTHdA,1647
|
|
56
47
|
docling_core/utils/file.py,sha256=VQgzjyvmJnAIHB6ex7ikcmbDAR4GA1ALreuO7Ubrp50,1895
|
|
48
|
+
docling_core/utils/generate_docs.py,sha256=td16gmPMAKk_Q-uAWIdYRGwWN9QRr_8_JQJM1gAF5Bw,2283
|
|
49
|
+
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
57
50
|
docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
|
|
58
|
-
docling_core/utils/validators.py,sha256=
|
|
59
|
-
docling_core-
|
|
60
|
-
docling_core-
|
|
61
|
-
docling_core-
|
|
62
|
-
docling_core-
|
|
63
|
-
docling_core-
|
|
51
|
+
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
52
|
+
docling_core-2.0.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
53
|
+
docling_core-2.0.0.dist-info/METADATA,sha256=LUzVQtKKrKcVKRlKtI_oWGaKMYt08h0poFGlnxF2zg0,5389
|
|
54
|
+
docling_core-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
55
|
+
docling_core-2.0.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
|
|
56
|
+
docling_core-2.0.0.dist-info/RECORD,,
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""Define the ID generator types."""
|
|
7
|
-
|
|
8
|
-
from docling_core.transforms.id_generator.base import BaseIDGenerator # noqa
|
|
9
|
-
from docling_core.transforms.id_generator.doc_hash_id_generator import ( # noqa
|
|
10
|
-
DocHashIDGenerator,
|
|
11
|
-
)
|
|
12
|
-
from docling_core.transforms.id_generator.uuid_generator import UUIDGenerator # noqa
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""Base document ID generator module."""
|
|
7
|
-
|
|
8
|
-
from abc import ABC, abstractmethod
|
|
9
|
-
from typing import Any
|
|
10
|
-
|
|
11
|
-
from docling_core.types import Document as DLDocument
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class BaseIDGenerator(ABC):
|
|
15
|
-
"""Document ID generator base class."""
|
|
16
|
-
|
|
17
|
-
@abstractmethod
|
|
18
|
-
def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
|
|
19
|
-
"""Generate an ID for the given document.
|
|
20
|
-
|
|
21
|
-
Args:
|
|
22
|
-
doc (DLDocument): document to generate ID for
|
|
23
|
-
|
|
24
|
-
Raises:
|
|
25
|
-
NotImplementedError: in this abstract implementation
|
|
26
|
-
|
|
27
|
-
Returns:
|
|
28
|
-
str: the generated ID
|
|
29
|
-
"""
|
|
30
|
-
raise NotImplementedError()
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""Doc-hash-based ID generator module."""
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
from typing import Any
|
|
10
|
-
|
|
11
|
-
from docling_core.transforms.id_generator import BaseIDGenerator
|
|
12
|
-
from docling_core.types import Document as DLDocument
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class DocHashIDGenerator(BaseIDGenerator):
|
|
16
|
-
"""Doc-hash-based ID generator class."""
|
|
17
|
-
|
|
18
|
-
def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
|
|
19
|
-
"""Generate an ID for the given document.
|
|
20
|
-
|
|
21
|
-
Args:
|
|
22
|
-
doc (DLDocument): document to generate ID for
|
|
23
|
-
|
|
24
|
-
Returns:
|
|
25
|
-
str: the generated ID
|
|
26
|
-
"""
|
|
27
|
-
return doc.file_info.document_hash
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""UUID-based ID generator module."""
|
|
7
|
-
|
|
8
|
-
from random import Random
|
|
9
|
-
from typing import Annotated, Any, Optional
|
|
10
|
-
from uuid import UUID
|
|
11
|
-
|
|
12
|
-
from pydantic import BaseModel, Field
|
|
13
|
-
|
|
14
|
-
from docling_core.transforms.id_generator import BaseIDGenerator
|
|
15
|
-
from docling_core.types import Document as DLDocument
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class UUIDGenerator(BaseModel, BaseIDGenerator):
|
|
19
|
-
"""UUID-based ID generator class."""
|
|
20
|
-
|
|
21
|
-
seed: Optional[int] = None
|
|
22
|
-
uuid_version: Annotated[int, Field(strict=True, ge=1, le=5)] = 4
|
|
23
|
-
|
|
24
|
-
def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
|
|
25
|
-
"""Generate an ID for the given document.
|
|
26
|
-
|
|
27
|
-
Args:
|
|
28
|
-
doc (DLDocument): document to generate ID for
|
|
29
|
-
|
|
30
|
-
Returns:
|
|
31
|
-
str: the generated ID
|
|
32
|
-
"""
|
|
33
|
-
rd = Random(x=self.seed)
|
|
34
|
-
return str(UUID(int=rd.getrandbits(128), version=self.uuid_version))
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""Define the metadata extractor types."""
|
|
7
|
-
|
|
8
|
-
from docling_core.transforms.metadata_extractor.base import ( # noqa
|
|
9
|
-
BaseMetadataExtractor,
|
|
10
|
-
)
|
|
11
|
-
from docling_core.transforms.metadata_extractor.simple_metadata_extractor import ( # noqa
|
|
12
|
-
SimpleMetadataExtractor,
|
|
13
|
-
)
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""Base metadata extractor module."""
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
from abc import ABC, abstractmethod
|
|
10
|
-
from typing import Any
|
|
11
|
-
|
|
12
|
-
from pydantic import BaseModel
|
|
13
|
-
|
|
14
|
-
from docling_core.types import Document as DLDocument
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class BaseMetadataExtractor(BaseModel, ABC):
|
|
18
|
-
"""Metadata extractor base class."""
|
|
19
|
-
|
|
20
|
-
@abstractmethod
|
|
21
|
-
def get_metadata(
|
|
22
|
-
self, doc: DLDocument, *args: Any, **kwargs: Any
|
|
23
|
-
) -> dict[str, Any]:
|
|
24
|
-
"""Extract metadata for the given document.
|
|
25
|
-
|
|
26
|
-
Args:
|
|
27
|
-
doc (DLDocument): document to extract metadata for
|
|
28
|
-
|
|
29
|
-
Raises:
|
|
30
|
-
NotImplementedError: in this abstract implementation
|
|
31
|
-
|
|
32
|
-
Returns:
|
|
33
|
-
dict[str, Any]: the extracted metadata
|
|
34
|
-
"""
|
|
35
|
-
raise NotImplementedError()
|
|
36
|
-
|
|
37
|
-
@abstractmethod
|
|
38
|
-
def get_excluded_embed_metadata_keys(self) -> list[str]:
|
|
39
|
-
"""Get metadata keys to exclude from embedding.
|
|
40
|
-
|
|
41
|
-
Raises:
|
|
42
|
-
NotImplementedError: in this abstract implementation
|
|
43
|
-
|
|
44
|
-
Returns:
|
|
45
|
-
list[str]: the metadata to exclude
|
|
46
|
-
"""
|
|
47
|
-
raise NotImplementedError()
|
|
48
|
-
|
|
49
|
-
@abstractmethod
|
|
50
|
-
def get_excluded_llm_metadata_keys(self) -> list[str]:
|
|
51
|
-
"""Get metadata keys to exclude from LLM generation.
|
|
52
|
-
|
|
53
|
-
Raises:
|
|
54
|
-
NotImplementedError: in this abstract implementation
|
|
55
|
-
|
|
56
|
-
Returns:
|
|
57
|
-
list[str]: the metadata to exclude
|
|
58
|
-
"""
|
|
59
|
-
raise NotImplementedError()
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""Simple metadata extractor module."""
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
from typing import Any, Final
|
|
10
|
-
|
|
11
|
-
from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
|
|
12
|
-
from docling_core.types import Document as DLDocument
|
|
13
|
-
|
|
14
|
-
_DL_DOC_HASH: Final[str] = "dl_doc_hash"
|
|
15
|
-
_ORIGIN: Final[str] = "origin"
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class SimpleMetadataExtractor(BaseMetadataExtractor):
|
|
19
|
-
"""Simple metadata extractor class."""
|
|
20
|
-
|
|
21
|
-
include_origin: bool = False
|
|
22
|
-
|
|
23
|
-
def get_metadata(
|
|
24
|
-
self, doc: DLDocument, origin: str, *args: Any, **kwargs: Any
|
|
25
|
-
) -> dict[str, Any]:
|
|
26
|
-
"""Extract metadata for the given document.
|
|
27
|
-
|
|
28
|
-
Args:
|
|
29
|
-
doc (DLDocument): document to extract metadata for
|
|
30
|
-
origin (str): the document origin
|
|
31
|
-
|
|
32
|
-
Returns:
|
|
33
|
-
dict[str, Any]: the extracted metadata
|
|
34
|
-
"""
|
|
35
|
-
meta: dict[str, Any] = {
|
|
36
|
-
_DL_DOC_HASH: doc.file_info.document_hash,
|
|
37
|
-
}
|
|
38
|
-
if self.include_origin:
|
|
39
|
-
meta[_ORIGIN] = origin
|
|
40
|
-
return meta
|
|
41
|
-
|
|
42
|
-
def get_excluded_embed_metadata_keys(self) -> list[str]:
|
|
43
|
-
"""Get metadata keys to exclude from embedding.
|
|
44
|
-
|
|
45
|
-
Returns:
|
|
46
|
-
list[str]: the metadata to exclude
|
|
47
|
-
"""
|
|
48
|
-
excl_keys: list[str] = [_DL_DOC_HASH]
|
|
49
|
-
if self.include_origin:
|
|
50
|
-
excl_keys.append(_ORIGIN)
|
|
51
|
-
return excl_keys
|
|
52
|
-
|
|
53
|
-
def get_excluded_llm_metadata_keys(self) -> list[str]:
|
|
54
|
-
"""Get metadata keys to exclude from LLM generation.
|
|
55
|
-
|
|
56
|
-
Returns:
|
|
57
|
-
list[str]: the metadata to exclude
|
|
58
|
-
"""
|
|
59
|
-
return self.get_excluded_embed_metadata_keys()
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""Package for models defined by the Document type."""
|
|
7
|
-
|
|
8
|
-
from .base import BoundingBox, CoordOrigin, Size
|
|
9
|
-
from .document import (
|
|
10
|
-
BasePictureData,
|
|
11
|
-
BaseTableData,
|
|
12
|
-
DescriptionItem,
|
|
13
|
-
DocItem,
|
|
14
|
-
DoclingDocument,
|
|
15
|
-
DocumentOrigin,
|
|
16
|
-
FloatingItem,
|
|
17
|
-
GroupItem,
|
|
18
|
-
ImageRef,
|
|
19
|
-
KeyValueItem,
|
|
20
|
-
NodeItem,
|
|
21
|
-
PageItem,
|
|
22
|
-
PictureItem,
|
|
23
|
-
ProvenanceItem,
|
|
24
|
-
RefItem,
|
|
25
|
-
SectionHeaderItem,
|
|
26
|
-
TableCell,
|
|
27
|
-
TableItem,
|
|
28
|
-
TextItem,
|
|
29
|
-
)
|
|
30
|
-
from .labels import DocItemLabel, GroupLabel, TableCellLabel
|