docling-core 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of docling-core might be problematic.

Files changed (36)
  1. docling_core/transforms/chunker/__init__.py +2 -8
  2. docling_core/transforms/chunker/base.py +27 -40
  3. docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
  4. docling_core/types/__init__.py +3 -18
  5. docling_core/types/doc/__init__.py +25 -0
  6. docling_core/types/doc/base.py +136 -451
  7. docling_core/types/doc/document.py +1289 -559
  8. docling_core/types/{experimental → doc}/labels.py +4 -1
  9. docling_core/types/legacy_doc/__init__.py +6 -0
  10. docling_core/types/legacy_doc/base.py +485 -0
  11. docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
  12. docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
  13. docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
  14. docling_core/types/legacy_doc/document.py +715 -0
  15. docling_core/types/rec/subject.py +1 -1
  16. docling_core/utils/generate_docs.py +82 -0
  17. docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
  18. docling_core/utils/validators.py +3 -3
  19. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/METADATA +17 -17
  20. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/RECORD +24 -31
  21. docling_core-2.0.1.dist-info/entry_points.txt +5 -0
  22. docling_core/transforms/id_generator/__init__.py +0 -12
  23. docling_core/transforms/id_generator/base.py +0 -30
  24. docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  25. docling_core/transforms/id_generator/uuid_generator.py +0 -34
  26. docling_core/transforms/metadata_extractor/__init__.py +0 -13
  27. docling_core/transforms/metadata_extractor/base.py +0 -59
  28. docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  29. docling_core/types/experimental/__init__.py +0 -30
  30. docling_core/types/experimental/base.py +0 -167
  31. docling_core/types/experimental/document.py +0 -1192
  32. docling_core/utils/ds_generate_docs.py +0 -144
  33. docling_core-1.7.2.dist-info/entry_points.txt +0 -5
  34. /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
  35. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/LICENSE +0 -0
  36. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/WHEEL +0 -0
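The moves above amount to an import-path migration: the legacy v1 models under docling_core.types.doc now live in docling_core.types.legacy_doc, while the former experimental package becomes the new docling_core.types.doc. A minimal sketch of how downstream imports change, based only on the renames shown in the hunks below (exact re-exports should be verified against the 2.0.1 wheel):

```py
# docling-core 1.7.2 (old import paths)
# from docling_core.types import Document
# from docling_core.types.doc.base import S3Reference

# docling-core 2.0.1 (new import paths, per the diffs below)
from docling_core.types import DoclingDocument  # new top-level document model
from docling_core.types.legacy_doc.base import S3Reference  # legacy v1 model, relocated
```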
docling_core/types/rec/subject.py
@@ -15,7 +15,7 @@ from docling_core.types.base import (
      SubjectNameTypeT,
      SubjectTypeT,
  )
- from docling_core.types.doc.base import S3Reference
+ from docling_core.types.legacy_doc.base import S3Reference
  from docling_core.utils.alias import AliasModel
  
  
docling_core/utils/generate_docs.py (new)
@@ -0,0 +1,82 @@
+ #
+ # Copyright IBM Corp. 2024 - 2024
+ # SPDX-License-Identifier: MIT
+ #
+
+ """Generate documentation of Docling types as JSON schema.
+
+ Example:
+     python docling_core/utils/generate_docs.py /tmp/docling_core_files
+ """
+ import argparse
+ import json
+ import os
+ from argparse import BooleanOptionalAction
+ from pathlib import Path
+ from shutil import rmtree
+ from typing import Final
+
+ from docling_core.utils.generate_jsonschema import generate_json_schema
+
+ MODELS: Final = ["DoclingDocument", "Record", "Generic"]
+
+
+ def _prepare_directory(folder: str, clean: bool = False) -> None:
+     """Create a directory or empty its content if it already exists.
+
+     Args:
+         folder: The name of the directory.
+         clean: Whether any existing content in the directory should be removed.
+     """
+     if os.path.isdir(folder):
+         if clean:
+             for path in Path(folder).glob("**/*"):
+                 if path.is_file():
+                     path.unlink()
+                 elif path.is_dir():
+                     rmtree(path)
+     else:
+         os.makedirs(folder, exist_ok=True)
+
+
+ def generate_collection_jsonschema(folder: str):
+     """Generate the JSON schema of Docling collections and export them to a folder.
+
+     Args:
+         folder: The name of the directory.
+     """
+     for item in MODELS:
+         json_schema = generate_json_schema(item)
+         with open(
+             os.path.join(folder, f"{item}.json"), mode="w", encoding="utf8"
+         ) as json_file:
+             json.dump(json_schema, json_file, ensure_ascii=False, indent=2)
+
+
+ def main() -> None:
+     """Generate the JSON Schema of Docling collections and export documentation."""
+     argparser = argparse.ArgumentParser()
+     argparser.add_argument(
+         "directory",
+         help=(
+             "Directory to generate files. If it exists, any existing content will be"
+             " removed."
+         ),
+     )
+     argparser.add_argument(
+         "--clean",
+         help="Whether any existing content in directory should be removed.",
+         action=BooleanOptionalAction,
+         dest="clean",
+         default=False,
+         required=False,
+     )
+     args = argparser.parse_args()
+
+     _prepare_directory(args.directory, args.clean)
+
+     generate_collection_jsonschema(args.directory)
+
+
+ if __name__ == "__main__":
+     main()
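The new module is exposed as the generate_docs console script (see entry_points.txt below), but its helpers can also be called directly. A small sketch, reusing the illustrative output path from the module docstring above:

```py
# Export the DoclingDocument/Record/Generic JSON schemas to a folder,
# mirroring what generate_docs.py's main() does.
from docling_core.utils.generate_docs import (
    _prepare_directory,
    generate_collection_jsonschema,
)

out_dir = "/tmp/docling_core_files"  # illustrative path from the module docstring
_prepare_directory(out_dir, clean=True)
generate_collection_jsonschema(out_dir)
```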
docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py}
@@ -6,7 +6,7 @@
  """Generate the JSON Schema of pydantic models and export them to files.
  
  Example:
-     python docling_core/utils/ds_generate_jsonschema.py doc.base.TableCell
+     python docling_core/utils/generate_jsonschema.py doc.document.TableCell
  
  """
  import argparse
@@ -27,10 +27,10 @@ def _import_class(class_reference: str) -> Any:
  
  
  def generate_json_schema(class_reference: str) -> Union[dict, None]:
-     """Generate a jsonable dict of a model's schema from DS data types.
+     """Generate a jsonable dict of a model's schema from a data type.
  
      Args:
-         class_reference: The reference to a class in 'src.data_types'.
+         class_reference: The reference to a class in 'docling_core.types'.
  
      Returns:
          A jsonable dict of the model's schema.
@@ -48,7 +48,7 @@ def main() -> None:
      """Print the JSON Schema of a model."""
      argparser = argparse.ArgumentParser()
      argparser.add_argument(
-         "class_ref", help="Class reference, e.g., doc.base.TableCell"
+         "class_ref", help="Class reference, e.g., doc.document.TableCell"
      )
      args = argparser.parse_args()
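The renamed helper can likewise be used as a library function; a short sketch using the class reference from the updated docstring:

```py
# Generate the JSON schema for a single model and print it.
import json

from docling_core.utils.generate_jsonschema import generate_json_schema

schema = generate_json_schema("doc.document.TableCell")  # example reference from the docstring
print(json.dumps(schema, indent=2))
```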
 
docling_core/utils/validators.py
@@ -38,7 +38,7 @@ def validate_raw_schema(file_: dict) -> tuple[bool, str]:
  
      schema_txt = (
          resources.files("docling_core")
-         .joinpath("resources/schemas/doc/RAW.json")
+         .joinpath("resources/schemas/legacy_doc/RAW.json")
          .read_text("utf-8")
      )
      schema = json.loads(schema_txt)
@@ -52,7 +52,7 @@ def validate_ann_schema(file_: dict) -> tuple[bool, str]:
  
      schema_txt = (
          resources.files("docling_core")
-         .joinpath("resources/schemas/doc/ANN.json")
+         .joinpath("resources/schemas/legacy_doc/ANN.json")
          .read_text("utf-8")
      )
      schema = json.loads(schema_txt)
@@ -66,7 +66,7 @@ def validate_ocr_schema(file_: dict) -> tuple[bool, str]:
  
      schema_txt = (
          resources.files("docling_core")
-         .joinpath("resources/schemas/doc/OCR-output.json")
+         .joinpath("resources/schemas/legacy_doc/OCR-output.json")
          .read_text("utf-8")
      )
      schema = json.loads(schema_txt)
{docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: docling-core
- Version: 1.7.2
+ Version: 2.0.1
  Summary: A python library to define and validate data types in Docling.
  Home-page: https://ds4sd.github.io/
  License: MIT
@@ -25,10 +25,10 @@ Classifier: Topic :: Database
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Typing :: Typed
- Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
  Requires-Dist: jsonref (>=1.1.0,<2.0.0)
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
+ Requires-Dist: pillow (>=10.3.0,<11.0.0)
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
  Project-URL: Repository, https://github.com/DS4SD/docling-core
@@ -37,7 +37,7 @@ Description-Content-Type: text/markdown
  # Docling Core
  
  [![PyPI version](https://img.shields.io/pypi/v/docling-core)](https://pypi.org/project/docling-core/)
- ![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue)
+ ![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%20%203.11%20%7C%203.12%20%7C%203.13-blue)
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
@@ -57,7 +57,7 @@ pip install docling-core
  
  ### Development setup
  
- To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
+ To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
  ```bash
  poetry install
  ```
@@ -72,37 +72,37 @@ poetry run pytest test
  - You can validate your JSON objects using the pydantic class definition.
  
  ```py
- from docling_core.types import Document
+ from docling_core.types import DoclingDocument
  
  data_dict = {...} # here the object you want to validate, as a dictionary
- Document.model_validate(data_dict)
+ DoclingDocument.model_validate(data_dict)
  
  data_str = {...} # here the object as a JSON string
- Document.model_validate_json(data_str)
+ DoclingDocument.model_validate_json(data_str)
  ```
  
- - You can generate the JSON schema of a model with the script `ds_generate_jsonschema`.
+ - You can generate the JSON schema of a model with the script `generate_jsonschema`.
  
  ```py
- # for the `Document` type
- ds_generate_jsonschema Document
+ # for the `DoclingDocument` type
+ generate_jsonschema DoclingDocument
  
  # for the use `Record` type
- ds_generate_jsonschema Record
+ generate_jsonschema Record
  ```
  
  ## Documentation
  
- Docling supports 3 main data types:
+ Docling Core contains 3 top-level data types:
  
- - **Document** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
-   The Document type also models the metadata that may be attached to the converted document.
-   Check [Document](docs/Document.md) for the full JSON schema.
+ - **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
+   The DoclingDocument type also models the metadata that may be attached to the converted document.
+   Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
    Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
-   Check [Record](docs/Record.md) for the full JSON schema.
+   Check [Record](docs/Record.json) for the full JSON schema.
  - **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
-   Check [Generic](docs/Generic.md) for the full JSON schema.
+   Check [Generic](docs/Generic.json) for the full JSON schema.
  
  The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
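Put together, a 2.x validation round-trip based on the updated README snippet might look like this (the input file name is illustrative and assumed to contain a Docling-converted document):

```py
# Validate a previously exported document against the 2.x DoclingDocument schema.
import json

from docling_core.types import DoclingDocument

with open("converted_document.json", encoding="utf-8") as f:  # illustrative path
    data_dict = json.load(f)

doc = DoclingDocument.model_validate(data_dict)  # raises a pydantic ValidationError on mismatch
```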
 
{docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/RECORD
@@ -14,31 +14,24 @@ docling_core/search/mapping.py,sha256=6rqG7LgYSeWmooKNEcRa5gFDLp1ZdzPqDGlwTA5gpO
  docling_core/search/meta.py,sha256=wSurrsqdP1N3gQKx027fVdzVmc33a7Y6rPl-FClQvtA,3318
  docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75A,1841
  docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
- docling_core/transforms/chunker/__init__.py,sha256=xZ5ELOB8tbCoJY1dKUvOrFqxYyoHmmCNUSHxrrRi8a4,317
- docling_core/transforms/chunker/base.py,sha256=5EW89CZf4SMB6Eh4yNjzYoNjn8S7oHH8NEpMck3Lcio,2078
- docling_core/transforms/chunker/hierarchical_chunker.py,sha256=DexBEPMR5rqnwrCMi-g98AtLDG7PKyZuf7u3NuXo-tA,12682
- docling_core/transforms/id_generator/__init__.py,sha256=7UoSyAcLsvw-RRrNjYXRVS4rIOUXjwqVpaQA-SSeINU,379
- docling_core/transforms/id_generator/base.py,sha256=SufPsaZUfMpuITq7pMv5YtlLmtGTDgA4LWmjmhQuSM0,704
- docling_core/transforms/id_generator/doc_hash_id_generator.py,sha256=SUw4FBhMZtbWCfc7oMucSwYvJTXqIPMn3yCXPRxtPCI,656
- docling_core/transforms/id_generator/uuid_generator.py,sha256=t8Bky_1JQB9myX-PJGWvW_c4-NvtHPHab6b1NdS-bpU,929
- docling_core/transforms/metadata_extractor/__init__.py,sha256=q_eAUcbaToEuYUPco4uiBO8vgTGSmZUC-r0mS7KbWh8,335
- docling_core/transforms/metadata_extractor/base.py,sha256=7h_S6-buCVtvAvKQKLISjDqFV8D3brewiQ-geqlUriI,1467
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py,sha256=ZRjDdXgFe8jPBNC_0ruJjQanabpkxceVsCJVVWVWlIg,1629
- docling_core/types/__init__.py,sha256=6mrAEKRW85uHJwNQBufwjPcMWCjm3oocA6MaO4_NLgg,805
+ docling_core/transforms/chunker/__init__.py,sha256=cSY_2L6EpR0lkPSDgt_ikjVoQpgIAhofvBfvfR3w_1Y,270
+ docling_core/transforms/chunker/base.py,sha256=uPNj6NHUl394Uh6wf01vmro4i3Ez4WUlV5ljfp85EM4,1565
+ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=tKJnaKhdAAOkwRu4WOoHISo7qgBx_4T3YNS4nPB_iqc,6390
+ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
- docling_core/types/doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
- docling_core/types/doc/base.py,sha256=ujko-oQKoXw6wjBn0Il2Khu3PyljHqYnUNh3mPDVJF8,14676
- docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
- docling_core/types/doc/doc_ocr.py,sha256=6PC0C-OczF-MyfgRxEI1xs3PWgNOzi7i2yEQbTqZz0I,1387
- docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
- docling_core/types/doc/document.py,sha256=AKp1kOo0tncf9FX3q7qRWQ2Jz_hZE44smZpyrtsRzY4,24104
- docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
- docling_core/types/experimental/__init__.py,sha256=mpqa2soTcHHEKqkcSeYBbAHepg0OgVZNReKvPmGz2r4,587
- docling_core/types/experimental/base.py,sha256=k04zvzNI7qo4HfKxLPCePKxCnerzXd582gvrVjF25SI,4225
- docling_core/types/experimental/document.py,sha256=X3z4sjRmWytRbEmSCnKat4D9sYxSV7Olm1YNmG3c5Kg,37874
- docling_core/types/experimental/labels.py,sha256=tpmvpmJuQyYMLhxAvJSVuFhDRh_zQNiP1WrQmNXKQzo,1224
+ docling_core/types/doc/__init__.py,sha256=_6QvDWO_AV0iHx72PpDb6XLZTlA7KYQhfL80xGiCq70,625
+ docling_core/types/doc/base.py,sha256=tNEXzxe2ihduCezYTUy_jNKMs0RJ6hBS79epYwyc2QY,4326
+ docling_core/types/doc/document.py,sha256=EeavMTImP8IlqeK8s7spwXX-_aawEGlHvDxbpWDAkOY,45428
+ docling_core/types/doc/labels.py,sha256=mzmSd072A-qW3IThswHxwIHV8IoyTCbHHlNOrisinRA,1335
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
+ docling_core/types/legacy_doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
+ docling_core/types/legacy_doc/base.py,sha256=l8NKCuORUQ1ebjdGWpj6b30oQEvtErLsIHKQHbbJiPg,14683
+ docling_core/types/legacy_doc/doc_ann.py,sha256=CIQHW8yzu70bsMR9gtu7dqe4oz603Tq2eDDt9sh-tYo,1203
+ docling_core/types/legacy_doc/doc_ocr.py,sha256=FfFqHAyMSbFt5cKeE7QLcxS0qUweBilBJoN9CH2TsQs,1394
+ docling_core/types/legacy_doc/doc_raw.py,sha256=LrvQ9DhNjBRy98p_F9PUyHZeTGAxMKWqJzY4WJ7v-xs,3895
+ docling_core/types/legacy_doc/document.py,sha256=A_cTYOjx6pNIICpOUm09YsfwPrIGDEZTKdetb2fx4PM,24273
+ docling_core/types/legacy_doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
  docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
  docling_core/types/nlp/qa.py,sha256=TyZjubqkEoREv0YzmuLKlq4WW_TnJNj7BoBY1_r2a1E,2731
  docling_core/types/nlp/qa_labels.py,sha256=YLW2SYM9M1riktCUYctsg83Msb988NV2I754w4ibWzA,5880
@@ -48,16 +41,16 @@ docling_core/types/rec/base.py,sha256=jhTfInNGyB9NUw7o33PElrFGL80TqhU8MLcLZNZYj3
  docling_core/types/rec/predicate.py,sha256=4iDwXl9c4jzHTDIlRNE88yvDzKA9_od0xjPUUUP5IjI,3959
  docling_core/types/rec/record.py,sha256=r1QgPepwH3YjmMHlwwmeK00ZHEJnAsvyOMeXFY_D9_Q,2750
  docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiKRdCZ5o,1701
- docling_core/types/rec/subject.py,sha256=wX9qsihwDbR7ZNSzY3vQymxi0eN1nxxsonrhSZzsMhA,2565
+ docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
  docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
  docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
- docling_core/utils/ds_generate_docs.py,sha256=0xGBagdC_PGjyeHXYZo90VnVrSTMZgHb0SYhFa6X7bQ,4248
- docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvGqg54qBIvUMHTHdA,1647
  docling_core/utils/file.py,sha256=VQgzjyvmJnAIHB6ex7ikcmbDAR4GA1ALreuO7Ubrp50,1895
+ docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
+ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
- docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
- docling_core-1.7.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
- docling_core-1.7.2.dist-info/METADATA,sha256=OfpdHwn-55a4Z-61sx7SZ1yD7jMDaiZ_LEmEIoYKa9I,5383
- docling_core-1.7.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- docling_core-1.7.2.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
- docling_core-1.7.2.dist-info/RECORD,,
+ docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
+ docling_core-2.0.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+ docling_core-2.0.1.dist-info/METADATA,sha256=20SeaCpEJoVqWrXxuFF1a80s9GoW6W4gGZjs1hYGzaM,5459
+ docling_core-2.0.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ docling_core-2.0.1.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
+ docling_core-2.0.1.dist-info/RECORD,,
docling_core-2.0.1.dist-info/entry_points.txt (new)
@@ -0,0 +1,5 @@
+ [console_scripts]
+ generate_docs=docling_core.utils.generate_docs:main
+ generate_jsonschema=docling_core.utils.generate_jsonschema:main
+ validate=docling_core.utils.validate:main
+
docling_core/transforms/id_generator/__init__.py (removed)
@@ -1,12 +0,0 @@
- #
- # Copyright IBM Corp. 2024 - 2024
- # SPDX-License-Identifier: MIT
- #
-
- """Define the ID generator types."""
-
- from docling_core.transforms.id_generator.base import BaseIDGenerator  # noqa
- from docling_core.transforms.id_generator.doc_hash_id_generator import (  # noqa
-     DocHashIDGenerator,
- )
- from docling_core.transforms.id_generator.uuid_generator import UUIDGenerator  # noqa
docling_core/transforms/id_generator/base.py (removed)
@@ -1,30 +0,0 @@
- #
- # Copyright IBM Corp. 2024 - 2024
- # SPDX-License-Identifier: MIT
- #
-
- """Base document ID generator module."""
-
- from abc import ABC, abstractmethod
- from typing import Any
-
- from docling_core.types import Document as DLDocument
-
-
- class BaseIDGenerator(ABC):
-     """Document ID generator base class."""
-
-     @abstractmethod
-     def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
-         """Generate an ID for the given document.
-
-         Args:
-             doc (DLDocument): document to generate ID for
-
-         Raises:
-             NotImplementedError: in this abstract implementation
-
-         Returns:
-             str: the generated ID
-         """
-         raise NotImplementedError()
docling_core/transforms/id_generator/doc_hash_id_generator.py (removed)
@@ -1,27 +0,0 @@
- #
- # Copyright IBM Corp. 2024 - 2024
- # SPDX-License-Identifier: MIT
- #
-
- """Doc-hash-based ID generator module."""
-
-
- from typing import Any
-
- from docling_core.transforms.id_generator import BaseIDGenerator
- from docling_core.types import Document as DLDocument
-
-
- class DocHashIDGenerator(BaseIDGenerator):
-     """Doc-hash-based ID generator class."""
-
-     def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
-         """Generate an ID for the given document.
-
-         Args:
-             doc (DLDocument): document to generate ID for
-
-         Returns:
-             str: the generated ID
-         """
-         return doc.file_info.document_hash
docling_core/transforms/id_generator/uuid_generator.py (removed)
@@ -1,34 +0,0 @@
- #
- # Copyright IBM Corp. 2024 - 2024
- # SPDX-License-Identifier: MIT
- #
-
- """UUID-based ID generator module."""
-
- from random import Random
- from typing import Annotated, Any, Optional
- from uuid import UUID
-
- from pydantic import BaseModel, Field
-
- from docling_core.transforms.id_generator import BaseIDGenerator
- from docling_core.types import Document as DLDocument
-
-
- class UUIDGenerator(BaseModel, BaseIDGenerator):
-     """UUID-based ID generator class."""
-
-     seed: Optional[int] = None
-     uuid_version: Annotated[int, Field(strict=True, ge=1, le=5)] = 4
-
-     def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
-         """Generate an ID for the given document.
-
-         Args:
-             doc (DLDocument): document to generate ID for
-
-         Returns:
-             str: the generated ID
-         """
-         rd = Random(x=self.seed)
-         return str(UUID(int=rd.getrandbits(128), version=self.uuid_version))
docling_core/transforms/metadata_extractor/__init__.py (removed)
@@ -1,13 +0,0 @@
- #
- # Copyright IBM Corp. 2024 - 2024
- # SPDX-License-Identifier: MIT
- #
-
- """Define the metadata extractor types."""
-
- from docling_core.transforms.metadata_extractor.base import (  # noqa
-     BaseMetadataExtractor,
- )
- from docling_core.transforms.metadata_extractor.simple_metadata_extractor import (  # noqa
-     SimpleMetadataExtractor,
- )
docling_core/transforms/metadata_extractor/base.py (removed)
@@ -1,59 +0,0 @@
- #
- # Copyright IBM Corp. 2024 - 2024
- # SPDX-License-Identifier: MIT
- #
-
- """Base metadata extractor module."""
-
-
- from abc import ABC, abstractmethod
- from typing import Any
-
- from pydantic import BaseModel
-
- from docling_core.types import Document as DLDocument
-
-
- class BaseMetadataExtractor(BaseModel, ABC):
-     """Metadata extractor base class."""
-
-     @abstractmethod
-     def get_metadata(
-         self, doc: DLDocument, *args: Any, **kwargs: Any
-     ) -> dict[str, Any]:
-         """Extract metadata for the given document.
-
-         Args:
-             doc (DLDocument): document to extract metadata for
-
-         Raises:
-             NotImplementedError: in this abstract implementation
-
-         Returns:
-             dict[str, Any]: the extracted metadata
-         """
-         raise NotImplementedError()
-
-     @abstractmethod
-     def get_excluded_embed_metadata_keys(self) -> list[str]:
-         """Get metadata keys to exclude from embedding.
-
-         Raises:
-             NotImplementedError: in this abstract implementation
-
-         Returns:
-             list[str]: the metadata to exclude
-         """
-         raise NotImplementedError()
-
-     @abstractmethod
-     def get_excluded_llm_metadata_keys(self) -> list[str]:
-         """Get metadata keys to exclude from LLM generation.
-
-         Raises:
-             NotImplementedError: in this abstract implementation
-
-         Returns:
-             list[str]: the metadata to exclude
-         """
-         raise NotImplementedError()
docling_core/transforms/metadata_extractor/simple_metadata_extractor.py (removed)
@@ -1,59 +0,0 @@
- #
- # Copyright IBM Corp. 2024 - 2024
- # SPDX-License-Identifier: MIT
- #
-
- """Simple metadata extractor module."""
-
-
- from typing import Any, Final
-
- from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
- from docling_core.types import Document as DLDocument
-
- _DL_DOC_HASH: Final[str] = "dl_doc_hash"
- _ORIGIN: Final[str] = "origin"
-
-
- class SimpleMetadataExtractor(BaseMetadataExtractor):
-     """Simple metadata extractor class."""
-
-     include_origin: bool = False
-
-     def get_metadata(
-         self, doc: DLDocument, origin: str, *args: Any, **kwargs: Any
-     ) -> dict[str, Any]:
-         """Extract metadata for the given document.
-
-         Args:
-             doc (DLDocument): document to extract metadata for
-             origin (str): the document origin
-
-         Returns:
-             dict[str, Any]: the extracted metadata
-         """
-         meta: dict[str, Any] = {
-             _DL_DOC_HASH: doc.file_info.document_hash,
-         }
-         if self.include_origin:
-             meta[_ORIGIN] = origin
-         return meta
-
-     def get_excluded_embed_metadata_keys(self) -> list[str]:
-         """Get metadata keys to exclude from embedding.
-
-         Returns:
-             list[str]: the metadata to exclude
-         """
-         excl_keys: list[str] = [_DL_DOC_HASH]
-         if self.include_origin:
-             excl_keys.append(_ORIGIN)
-         return excl_keys
-
-     def get_excluded_llm_metadata_keys(self) -> list[str]:
-         """Get metadata keys to exclude from LLM generation.
-
-         Returns:
-             list[str]: the metadata to exclude
-         """
-         return self.get_excluded_embed_metadata_keys()
docling_core/types/experimental/__init__.py (removed)
@@ -1,30 +0,0 @@
- #
- # Copyright IBM Corp. 2024 - 2024
- # SPDX-License-Identifier: MIT
- #
-
- """Package for models defined by the Document type."""
-
- from .base import BoundingBox, CoordOrigin, Size
- from .document import (
-     BasePictureData,
-     BaseTableData,
-     DescriptionItem,
-     DocItem,
-     DoclingDocument,
-     DocumentOrigin,
-     FloatingItem,
-     GroupItem,
-     ImageRef,
-     KeyValueItem,
-     NodeItem,
-     PageItem,
-     PictureItem,
-     ProvenanceItem,
-     RefItem,
-     SectionHeaderItem,
-     TableCell,
-     TableItem,
-     TextItem,
- )
- from .labels import DocItemLabel, GroupLabel, TableCellLabel
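The names removed here are not dropped from the package: per the file list above, the former experimental modules now back docling_core.types.doc. A hedged sketch of the presumed 2.x equivalents (the exact re-export list of the new doc/__init__.py is not shown in this diff):

```py
# Assumed 2.x locations of the former experimental re-exports; verify against
# docling_core/types/doc/__init__.py in the 2.0.1 wheel before relying on them.
from docling_core.types.doc import DoclingDocument, DocItemLabel
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
```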