docling 1.10.0__tar.gz → 1.12.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.10.0 → docling-1.12.0}/PKG-INFO +21 -5
- {docling-1.10.0 → docling-1.12.0}/README.md +5 -2
- {docling-1.10.0 → docling-1.12.0}/docling/backend/abstract_backend.py +6 -3
- {docling-1.10.0 → docling-1.12.0}/docling/backend/docling_parse_backend.py +4 -5
- {docling-1.10.0 → docling-1.12.0}/docling/backend/pypdfium2_backend.py +4 -4
- docling-1.12.0/docling/cli/main.py +257 -0
- {docling-1.10.0 → docling-1.12.0}/docling/datamodel/base_models.py +3 -3
- {docling-1.10.0 → docling-1.12.0}/docling/datamodel/document.py +72 -3
- {docling-1.10.0 → docling-1.12.0}/docling/pipeline/base_model_pipeline.py +2 -2
- docling-1.12.0/docling/utils/__init__.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/utils/export.py +24 -11
- {docling-1.10.0 → docling-1.12.0}/pyproject.toml +56 -4
- {docling-1.10.0 → docling-1.12.0}/LICENSE +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/__init__.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/backend/__init__.py +0 -0
- {docling-1.10.0/docling/datamodel → docling-1.12.0/docling/cli}/__init__.py +0 -0
- {docling-1.10.0/docling/models → docling-1.12.0/docling/datamodel}/__init__.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/datamodel/settings.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/document_converter.py +0 -0
- {docling-1.10.0/docling/pipeline → docling-1.12.0/docling/models}/__init__.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/models/base_ocr_model.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/models/ds_glm_model.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/models/easyocr_model.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/models/layout_model.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/models/page_assemble_model.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/models/table_structure_model.py +0 -0
- {docling-1.10.0/docling/utils → docling-1.12.0/docling/pipeline}/__init__.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/pipeline/standard_model_pipeline.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/utils/layout_utils.py +0 -0
- {docling-1.10.0 → docling-1.12.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.10.0
|
3
|
+
Version: 1.12.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -19,21 +19,34 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.11
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
+
Provides-Extra: examples
|
22
23
|
Requires-Dist: certifi (>=2024.7.4)
|
23
24
|
Requires-Dist: deepsearch-glm (>=0.21.0,<0.22.0)
|
24
|
-
Requires-Dist: docling-core (>=1.
|
25
|
-
Requires-Dist: docling-ibm-models (>=1.1.
|
25
|
+
Requires-Dist: docling-core (>=1.3.0,<2.0.0)
|
26
|
+
Requires-Dist: docling-ibm-models (>=1.1.7,<2.0.0)
|
26
27
|
Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
|
27
28
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
28
29
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
29
30
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
31
|
+
Requires-Dist: langchain-huggingface (>=0.0.3,<0.0.4) ; extra == "examples"
|
32
|
+
Requires-Dist: langchain-milvus (>=0.1.4,<0.2.0) ; extra == "examples"
|
33
|
+
Requires-Dist: langchain-text-splitters (>=0.2.4,<0.3.0) ; extra == "examples"
|
34
|
+
Requires-Dist: llama-index-embeddings-huggingface (>=0.3.1,<0.4.0) ; extra == "examples"
|
35
|
+
Requires-Dist: llama-index-llms-huggingface-api (>=0.2.0,<0.3.0) ; extra == "examples"
|
36
|
+
Requires-Dist: llama-index-vector-stores-milvus (>=0.2.1,<0.3.0) ; extra == "examples"
|
30
37
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
31
38
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
32
39
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
33
40
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
41
|
+
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0) ; extra == "examples"
|
34
42
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
35
43
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
36
44
|
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
45
|
+
Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
46
|
+
Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
47
|
+
Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
48
|
+
Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
49
|
+
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
37
50
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
38
51
|
Description-Content-Type: text/markdown
|
39
52
|
|
@@ -62,8 +75,7 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
|
|
62
75
|
* 📑 Understands detailed page layout, reading order and recovers table structures
|
63
76
|
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
64
77
|
* 🔍 Optionally applies OCR (use with scanned PDFs)
|
65
|
-
|
66
|
-
For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
|
78
|
+
* 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
|
67
79
|
|
68
80
|
## Installation
|
69
81
|
|
@@ -182,6 +194,10 @@ results = doc_converter.convert(conv_input)
|
|
182
194
|
|
183
195
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
184
196
|
|
197
|
+
### RAG
|
198
|
+
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
|
199
|
+
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
|
200
|
+
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
|
185
201
|
|
186
202
|
## Technical report
|
187
203
|
|
@@ -23,8 +23,7 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
|
|
23
23
|
* 📑 Understands detailed page layout, reading order and recovers table structures
|
24
24
|
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
25
25
|
* 🔍 Optionally applies OCR (use with scanned PDFs)
|
26
|
-
|
27
|
-
For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
|
26
|
+
* 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
|
28
27
|
|
29
28
|
## Installation
|
30
29
|
|
@@ -143,6 +142,10 @@ results = doc_converter.convert(conv_input)
|
|
143
142
|
|
144
143
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
145
144
|
|
145
|
+
### RAG
|
146
|
+
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
|
147
|
+
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
|
148
|
+
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
|
146
149
|
|
147
150
|
## Technical report
|
148
151
|
|
@@ -1,10 +1,13 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Any, Iterable, Optional, Union
|
4
|
+
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
|
5
5
|
|
6
6
|
from PIL import Image
|
7
7
|
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from docling.datamodel.base_models import BoundingBox, Cell, PageSize
|
10
|
+
|
8
11
|
|
9
12
|
class PdfPageBackend(ABC):
|
10
13
|
|
@@ -17,12 +20,12 @@ class PdfPageBackend(ABC):
|
|
17
20
|
pass
|
18
21
|
|
19
22
|
@abstractmethod
|
20
|
-
def get_bitmap_rects(self,
|
23
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable["BoundingBox"]:
    """Abstract hook returning the bitmap rectangles of this page.

    The parameter is named ``scale`` and typed ``float`` so that the abstract
    signature agrees with the concrete page backends, which all declare
    ``scale: float = 1``. (The previous spelling, ``float: int = 1``, also
    shadowed the ``float`` builtin.)

    Args:
        scale: Scaling factor. NOTE(review): presumably applied to the
            returned coordinates; confirm against the concrete backends.

    Returns:
        An iterable of ``BoundingBox`` objects; this abstract version
        returns nothing.
    """
    pass
|
22
25
|
|
23
26
|
@abstractmethod
|
24
27
|
def get_page_image(
|
25
|
-
self, scale:
|
28
|
+
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
|
26
29
|
) -> Image.Image:
|
27
30
|
pass
|
28
31
|
|
@@ -2,7 +2,7 @@ import logging
|
|
2
2
|
import random
|
3
3
|
from io import BytesIO
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import Iterable, Optional, Union
|
5
|
+
from typing import Iterable, List, Optional, Union
|
6
6
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
from docling_parse.docling_parse import pdf_parser
|
@@ -22,7 +22,6 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
22
22
|
self._ppage = page_obj
|
23
23
|
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
24
24
|
|
25
|
-
self._dpage = None
|
26
25
|
self.valid = "pages" in parsed_page
|
27
26
|
if self.valid:
|
28
27
|
self._dpage = parsed_page["pages"][0]
|
@@ -68,7 +67,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
68
67
|
return text_piece
|
69
68
|
|
70
69
|
def get_text_cells(self) -> Iterable[Cell]:
|
71
|
-
cells = []
|
70
|
+
cells: List[Cell] = []
|
72
71
|
cell_counter = 0
|
73
72
|
|
74
73
|
if not self.valid:
|
@@ -130,7 +129,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
130
129
|
|
131
130
|
return cells
|
132
131
|
|
133
|
-
def get_bitmap_rects(self, scale:
|
132
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
134
133
|
AREA_THRESHOLD = 32 * 32
|
135
134
|
|
136
135
|
for i in range(len(self._dpage["images"])):
|
@@ -145,7 +144,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
145
144
|
yield cropbox
|
146
145
|
|
147
146
|
def get_page_image(
|
148
|
-
self, scale:
|
147
|
+
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
149
148
|
) -> Image.Image:
|
150
149
|
|
151
150
|
page_size = self.get_size()
|
@@ -7,7 +7,7 @@ from typing import Iterable, List, Optional, Union
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
import pypdfium2.raw as pdfium_c
|
9
9
|
from PIL import Image, ImageDraw
|
10
|
-
from pypdfium2 import PdfPage
|
10
|
+
from pypdfium2 import PdfPage, PdfTextPage
|
11
11
|
from pypdfium2._helpers.misc import PdfiumError
|
12
12
|
|
13
13
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
@@ -29,12 +29,12 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
29
29
|
exc_info=True,
|
30
30
|
)
|
31
31
|
self.valid = False
|
32
|
-
self.text_page = None
|
32
|
+
self.text_page: Optional[PdfTextPage] = None
|
33
33
|
|
34
34
|
def is_valid(self) -> bool:
|
35
35
|
return self.valid
|
36
36
|
|
37
|
-
def get_bitmap_rects(self, scale:
|
37
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
38
38
|
AREA_THRESHOLD = 32 * 32
|
39
39
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
40
40
|
pos = obj.get_pos()
|
@@ -189,7 +189,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
189
189
|
return cells
|
190
190
|
|
191
191
|
def get_page_image(
|
192
|
-
self, scale:
|
192
|
+
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
193
193
|
) -> Image.Image:
|
194
194
|
|
195
195
|
page_size = self.get_size()
|
@@ -0,0 +1,257 @@
|
|
1
|
+
import importlib
|
2
|
+
import json
|
3
|
+
import logging
|
4
|
+
import time
|
5
|
+
import warnings
|
6
|
+
from enum import Enum
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import Annotated, Iterable, List, Optional
|
9
|
+
|
10
|
+
import typer
|
11
|
+
from pydantic import AnyUrl
|
12
|
+
|
13
|
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
15
|
+
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
16
|
+
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
17
|
+
from docling.document_converter import DocumentConverter
|
18
|
+
|
19
|
+
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
20
|
+
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
21
|
+
|
22
|
+
_log = logging.getLogger(__name__)
|
23
|
+
from rich.console import Console
|
24
|
+
|
25
|
+
err_console = Console(stderr=True)
|
26
|
+
|
27
|
+
|
28
|
+
app = typer.Typer(
|
29
|
+
name="Docling",
|
30
|
+
no_args_is_help=True,
|
31
|
+
add_completion=False,
|
32
|
+
pretty_exceptions_enable=False,
|
33
|
+
)
|
34
|
+
|
35
|
+
|
36
|
+
def version_callback(value: bool):
    """Print the installed versions of Docling and its companion packages.

    Used as the eager callback of the ``--version`` option.

    Args:
        value: Truthy when the version flag was given. Any falsy value turns
            the callback into a no-op so normal processing continues.

    Raises:
        typer.Exit: After the versions have been printed.
    """
    if not value:
        return

    # ``import importlib`` alone does not guarantee that the ``metadata``
    # submodule has been loaded; import it explicitly so the lookups below
    # cannot fail with an AttributeError.
    import importlib.metadata

    # Resolve every version before printing, so a missing distribution
    # fails before any partial output is written.
    versions = [
        (label, importlib.metadata.version(dist_name))
        for label, dist_name in (
            ("Docling", "docling"),
            ("Docling Core", "docling-core"),
            ("Docling IBM Models", "docling-ibm-models"),
            ("Docling Parse", "docling-parse"),
        )
    ]
    for label, found in versions:
        print(f"{label} version: {found}")
    raise typer.Exit()
|
47
|
+
|
48
|
+
|
49
|
+
# Define an enum for the backend options
|
50
|
+
class Backend(str, Enum):
    """PDF backends that can be selected on the command line.

    The ``str`` mix-in makes every member compare equal to its plain string
    value.
    """

    PYPDFIUM2 = "pypdfium2"
    DOCLING = "docling"
|
53
|
+
|
54
|
+
|
55
|
+
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
    export_json: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
):
    """Write every successfully converted document in the requested formats.

    Each output file is named after the stem of the input file and placed
    directly in ``output_dir``. Failed conversions are logged as warnings and
    counted; a summary is logged at the end.

    Args:
        conv_results: Conversion results to export.
        output_dir: Directory that receives the output files. It is not
            created here.
        export_json: Write ``<stem>.json`` from ``render_as_dict()``.
        export_md: Write ``<stem>.md`` from ``render_as_markdown()``.
        export_txt: Write ``<stem>.txt`` from ``render_as_text()``.
        export_doctags: Write ``<stem>.doctags`` from ``render_as_doctags()``.
    """
    # (enabled, file suffix, label used in the log line, renderer)
    exporters = (
        (export_json, ".json", "JSON", lambda res: json.dumps(res.render_as_dict())),
        (export_txt, ".txt", "Text", lambda res: res.render_as_text()),
        (export_md, ".md", "Markdown", lambda res: res.render_as_markdown()),
        (export_doctags, ".doctags", "Doc Tags", lambda res: res.render_as_doctags()),
    )

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        doc_filename = conv_res.input.file.stem

        for enabled, suffix, label, render in exporters:
            if not enabled:
                continue
            fname = output_dir / f"{doc_filename}{suffix}"
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent the document text and would make the output
            # locale-dependent.
            with fname.open("w", encoding="utf-8") as fp:
                _log.info(f"writing {label} output to {fname}")
                fp.write(render(conv_res))

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )
|
107
|
+
|
108
|
+
|
109
|
+
@app.command(no_args_is_help=True)
def convert(
    input_sources: Annotated[
        List[Path],
        typer.Argument(
            ...,
            metavar="source",
            help="PDF files to convert. Directories are also accepted.",
        ),
    ],
    export_json: Annotated[
        bool,
        typer.Option(
            ..., "--json/--no-json", help="If enabled the document is exported as JSON."
        ),
    ] = False,
    export_md: Annotated[
        bool,
        typer.Option(
            ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
        ),
    ] = True,
    export_txt: Annotated[
        bool,
        typer.Option(
            ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
        ),
    ] = False,
    export_doctags: Annotated[
        bool,
        typer.Option(
            ...,
            "--doctags/--no-doctags",
            help="If enabled the document is exported as Doc Tags.",
        ),
    ] = False,
    ocr: Annotated[
        bool,
        typer.Option(
            ..., help="If enabled, the bitmap content will be processed using OCR."
        ),
    ] = True,
    backend: Annotated[
        Backend, typer.Option(..., help="The PDF backend to use.")
    ] = Backend.DOCLING,
    output: Annotated[
        Path, typer.Option(..., help="Output directory where results are saved.")
    ] = Path("."),
    version: Annotated[
        Optional[bool],
        typer.Option(
            "--version",
            callback=version_callback,
            is_eager=True,
            help="Show version information.",
        ),
    ] = None,
):
    """Convert PDF documents and export them in the selected formats."""
    # The docstring above is deliberately a single line: Typer shows it as the
    # command's help text.
    #
    # Flow: collect the input PDFs (directories are searched recursively),
    # build a DocumentConverter for the chosen backend/OCR combination, run
    # the conversion, then export the results into ``output``.
    logging.basicConfig(level=logging.INFO)

    input_doc_paths: List[Path] = []
    for source in input_sources:
        if not source.exists():
            err_console.print(
                f"[red]Error: The input file {source} does not exist.[/red]"
            )
            raise typer.Abort()
        elif source.is_dir():
            # Path.glob(..., case_sensitive=False) only exists on Python 3.12+,
            # while the package supports 3.10 and 3.11 as well. Match the
            # extension case-insensitively by hand instead.
            input_doc_paths.extend(
                path
                for path in source.glob("**/*")
                if path.suffix.lower() == ".pdf"
            )
        else:
            input_doc_paths.append(source)

    pipeline_options = PipelineOptions()
    # Honour --ocr/--no-ocr for every backend. The pypdfium2 + OCR combination
    # used to leave OCR switched off.
    pipeline_options.do_ocr = ocr
    pipeline_options.do_table_structure = True
    # Cell matching is disabled only for pypdfium2 without OCR.
    pipeline_options.table_structure_options.do_cell_matching = not (
        backend == Backend.PYPDFIUM2 and not ocr
    )

    pdf_backends = {
        Backend.PYPDFIUM2: PyPdfiumDocumentBackend,
        Backend.DOCLING: DoclingParseDocumentBackend,
    }
    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=pdf_backends[backend],
    )

    # Named ``conv_input`` to avoid shadowing the ``input`` builtin.
    conv_input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(conv_input)

    output.mkdir(parents=True, exist_ok=True)
    export_documents(
        conv_results,
        output_dir=output,
        export_json=export_json,
        export_md=export_md,
        export_txt=export_txt,
        export_doctags=export_doctags,
    )

    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")
|
254
|
+
|
255
|
+
|
256
|
+
if __name__ == "__main__":
|
257
|
+
app()
|
@@ -87,7 +87,7 @@ class BoundingBox(BaseModel):
|
|
87
87
|
return (self.l, self.b, self.r, self.t)
|
88
88
|
|
89
89
|
@classmethod
|
90
|
-
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
|
90
|
+
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
|
91
91
|
if origin == CoordOrigin.TOPLEFT:
|
92
92
|
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
93
93
|
if r < l:
|
@@ -246,7 +246,7 @@ class EquationPrediction(BaseModel):
|
|
246
246
|
|
247
247
|
|
248
248
|
class PagePredictions(BaseModel):
|
249
|
-
layout: LayoutPrediction = None
|
249
|
+
layout: Optional[LayoutPrediction] = None
|
250
250
|
tablestructure: Optional[TableStructurePrediction] = None
|
251
251
|
figures_classification: Optional[FigureClassificationPrediction] = None
|
252
252
|
equations_prediction: Optional[EquationPrediction] = None
|
@@ -267,7 +267,7 @@ class Page(BaseModel):
|
|
267
267
|
page_no: int
|
268
268
|
page_hash: Optional[str] = None
|
269
269
|
size: Optional[PageSize] = None
|
270
|
-
cells: List[Cell] =
|
270
|
+
cells: List[Cell] = []
|
271
271
|
predictions: PagePredictions = PagePredictions()
|
272
272
|
assembled: Optional[AssembledUnit] = None
|
273
273
|
|
@@ -11,6 +11,7 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
|
|
11
11
|
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
12
12
|
from docling_core.types import Table as DsSchemaTable
|
13
13
|
from docling_core.types import TableCell
|
14
|
+
from docling_core.types.doc.base import Figure
|
14
15
|
from pydantic import BaseModel
|
15
16
|
from typing_extensions import deprecated
|
16
17
|
|
@@ -279,7 +280,7 @@ class ConvertedDocument(BaseModel):
|
|
279
280
|
),
|
280
281
|
)
|
281
282
|
figures.append(
|
282
|
-
|
283
|
+
Figure(
|
283
284
|
prov=[
|
284
285
|
Prov(
|
285
286
|
bbox=target_bbox,
|
@@ -312,8 +313,76 @@ class ConvertedDocument(BaseModel):
|
|
312
313
|
def render_as_dict(self):
|
313
314
|
return self.output.model_dump(by_alias=True, exclude_none=True)
|
314
315
|
|
315
|
-
def render_as_markdown(
|
316
|
-
|
316
|
+
def render_as_markdown(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
    strict_text: bool = False,
):
    """Render the converted document through ``self.output.export_to_markdown``.

    Args:
        delim: Forwarded unchanged to the exporter.
        main_text_start: Forwarded unchanged to the exporter.
        main_text_stop: Forwarded unchanged to the exporter.
        main_text_labels: Labels forwarded to the exporter. ``None`` selects
            the default set: title, subtitle-level-1, paragraph, caption and
            table.
        strict_text: Forwarded unchanged to the exporter.

    Returns:
        Whatever ``export_to_markdown`` returns.
    """
    if main_text_labels is None:
        # Build the default per call: a list literal in the signature would be
        # one shared object that a callee could mutate for all later calls.
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
        ]
    return self.output.export_to_markdown(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        strict_text=strict_text,
    )
|
337
|
+
|
338
|
+
def render_as_text(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
):
    """Render the converted document as text.

    Delegates to ``self.output.export_to_markdown`` with ``strict_text=True``.

    Args:
        delim: Forwarded unchanged to the exporter.
        main_text_start: Forwarded unchanged to the exporter.
        main_text_stop: Forwarded unchanged to the exporter.
        main_text_labels: Labels forwarded to the exporter. ``None`` selects
            the default set: title, subtitle-level-1, paragraph and caption.

    Returns:
        Whatever ``export_to_markdown`` returns.
    """
    if main_text_labels is None:
        # Build the default per call: a list literal in the signature would be
        # one shared object that a callee could mutate for all later calls.
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
        ]
    return self.output.export_to_markdown(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        strict_text=True,
    )
|
357
|
+
|
358
|
+
def render_as_doctags(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
    page_tagging: bool = True,
    location_tagging: bool = True,
    location_dimensions: Tuple[int, int] = (100, 100),
    add_new_line: bool = True,
) -> str:
    """Render the document through ``self.output.export_to_document_tokens``.

    Args:
        delim: Forwarded unchanged to the exporter.
        main_text_start: Forwarded unchanged to the exporter.
        main_text_stop: Forwarded unchanged to the exporter.
        main_text_labels: Labels forwarded to the exporter. ``None`` selects
            the default set: title, subtitle-level-1, paragraph, caption,
            table and figure.
        page_tagging: Forwarded unchanged to the exporter.
        location_tagging: Forwarded unchanged to the exporter.
        location_dimensions: Forwarded unchanged to the exporter.
        add_new_line: Forwarded unchanged to the exporter.

    Returns:
        The value produced by ``export_to_document_tokens``.
    """
    if main_text_labels is None:
        # Build the default per call: a list literal in the signature would be
        # one shared object that a callee could mutate for all later calls.
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ]
    return self.output.export_to_document_tokens(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        page_tagging=page_tagging,
        location_tagging=location_tagging,
        location_dimensions=location_dimensions,
        add_new_line=add_new_line,
    )
|
317
386
|
|
318
387
|
def render_element_images(
|
319
388
|
self, element_types: Tuple[PageElement] = (FigureElement,)
|
@@ -1,12 +1,12 @@
|
|
1
1
|
from pathlib import Path
|
2
|
-
from typing import Iterable
|
2
|
+
from typing import Callable, Iterable, List
|
3
3
|
|
4
4
|
from docling.datamodel.base_models import Page, PipelineOptions
|
5
5
|
|
6
6
|
|
7
7
|
class BaseModelPipeline:
|
8
8
|
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
9
|
-
self.model_pipe = []
|
9
|
+
self.model_pipe: List[Callable] = []
|
10
10
|
self.artifacts_path = artifacts_path
|
11
11
|
self.pipeline_options = pipeline_options
|
12
12
|
|
File without changes
|
@@ -1,10 +1,10 @@
|
|
1
1
|
import logging
|
2
|
-
from typing import Any, Dict, Iterable, List, Tuple
|
2
|
+
from typing import Any, Dict, Iterable, List, Tuple, Union
|
3
3
|
|
4
|
-
from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
|
4
|
+
from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
|
5
5
|
|
6
6
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
|
7
|
-
from docling.datamodel.document import
|
7
|
+
from docling.datamodel.document import ConversionResult, Page
|
8
8
|
|
9
9
|
_log = logging.getLogger(__name__)
|
10
10
|
|
@@ -15,7 +15,10 @@ def _export_table_to_html(table: Table):
|
|
15
15
|
# to the docling-core package.
|
16
16
|
|
17
17
|
def _get_tablecell_span(cell: TableCell, ix):
|
18
|
-
|
18
|
+
if cell.spans is None:
|
19
|
+
span = set()
|
20
|
+
else:
|
21
|
+
span = set([s[ix] for s in cell.spans])
|
19
22
|
if len(span) == 0:
|
20
23
|
return 1, None, None
|
21
24
|
return len(span), min(span), max(span)
|
@@ -24,6 +27,8 @@ def _export_table_to_html(table: Table):
|
|
24
27
|
nrows = table.num_rows
|
25
28
|
ncols = table.num_cols
|
26
29
|
|
30
|
+
if table.data is None:
|
31
|
+
return ""
|
27
32
|
for i in range(nrows):
|
28
33
|
body += "<tr>"
|
29
34
|
for j in range(ncols):
|
@@ -66,7 +71,7 @@ def _export_table_to_html(table: Table):
|
|
66
71
|
|
67
72
|
|
68
73
|
def generate_multimodal_pages(
|
69
|
-
doc_result:
|
74
|
+
doc_result: ConversionResult,
|
70
75
|
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
|
71
76
|
|
72
77
|
label_to_doclaynet = {
|
@@ -94,7 +99,7 @@ def generate_multimodal_pages(
|
|
94
99
|
page_no = 0
|
95
100
|
start_ix = 0
|
96
101
|
end_ix = 0
|
97
|
-
doc_items = []
|
102
|
+
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
|
98
103
|
|
99
104
|
doc = doc_result.output
|
100
105
|
|
@@ -105,11 +110,11 @@ def generate_multimodal_pages(
|
|
105
110
|
item_type = item.obj_type
|
106
111
|
label = label_to_doclaynet.get(item_type, None)
|
107
112
|
|
108
|
-
if label is None:
|
113
|
+
if label is None or item.prov is None or page.size is None:
|
109
114
|
continue
|
110
115
|
|
111
116
|
bbox = BoundingBox.from_tuple(
|
112
|
-
item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
|
117
|
+
tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
|
113
118
|
)
|
114
119
|
new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
|
115
120
|
page_size=page.size
|
@@ -137,13 +142,15 @@ def generate_multimodal_pages(
|
|
137
142
|
return segments
|
138
143
|
|
139
144
|
def _process_page_cells(page: Page):
|
140
|
-
cells = []
|
145
|
+
cells: List[dict] = []
|
146
|
+
if page.size is None:
|
147
|
+
return cells
|
141
148
|
for cell in page.cells:
|
142
149
|
new_bbox = cell.bbox.to_top_left_origin(
|
143
150
|
page_height=page.size.height
|
144
151
|
).normalized(page_size=page.size)
|
145
152
|
is_ocr = isinstance(cell, OcrCell)
|
146
|
-
ocr_confidence = cell.confidence if
|
153
|
+
ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
|
147
154
|
cells.append(
|
148
155
|
{
|
149
156
|
"text": cell.text,
|
@@ -163,9 +170,15 @@ def generate_multimodal_pages(
|
|
163
170
|
content_md = doc.export_to_markdown(
|
164
171
|
main_text_start=start_ix, main_text_stop=end_ix
|
165
172
|
)
|
173
|
+
# No page-tagging since we only do 1 page at the time
|
174
|
+
content_dt = doc.export_to_document_tokens(
|
175
|
+
main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
|
176
|
+
)
|
166
177
|
|
167
|
-
return content_text, content_md, page_cells, page_segments, page
|
178
|
+
return content_text, content_md, content_dt, page_cells, page_segments, page
|
168
179
|
|
180
|
+
if doc.main_text is None:
|
181
|
+
return
|
169
182
|
for ix, orig_item in enumerate(doc.main_text):
|
170
183
|
|
171
184
|
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.
|
3
|
+
version = "1.12.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -23,8 +23,8 @@ packages = [{include = "docling"}]
|
|
23
23
|
[tool.poetry.dependencies]
|
24
24
|
python = "^3.10"
|
25
25
|
pydantic = "^2.0.0"
|
26
|
-
docling-core = "^1.
|
27
|
-
docling-ibm-models = "^1.1.
|
26
|
+
docling-core = "^1.3.0"
|
27
|
+
docling-ibm-models = "^1.1.7"
|
28
28
|
deepsearch-glm = "^0.21.0"
|
29
29
|
filetype = "^1.2.0"
|
30
30
|
pypdfium2 = "^4.30.0"
|
@@ -38,6 +38,30 @@ rtree = "^1.3.0"
|
|
38
38
|
scipy = "^1.14.1"
|
39
39
|
pyarrow = "^16.1.0"
|
40
40
|
|
41
|
+
#########
|
42
|
+
# extras:
|
43
|
+
#########
|
44
|
+
python-dotenv = { version = "^1.0.1", optional = true }
|
45
|
+
llama-index-embeddings-huggingface = { version = "^0.3.1", optional = true }
|
46
|
+
llama-index-llms-huggingface-api = { version = "^0.2.0", optional = true }
|
47
|
+
llama-index-vector-stores-milvus = { version = "^0.2.1", optional = true }
|
48
|
+
langchain-huggingface = { version = "^0.0.3", optional = true}
|
49
|
+
langchain-milvus = { version = "^0.1.4", optional = true }
|
50
|
+
langchain-text-splitters = { version = "^0.2.4", optional = true }
|
51
|
+
|
52
|
+
##############
|
53
|
+
# constraints:
|
54
|
+
##############
|
55
|
+
torch = [
|
56
|
+
{version = "^2.2.2", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
|
57
|
+
{version = "~2.2.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
|
58
|
+
]
|
59
|
+
torchvision = [
|
60
|
+
{version = "^0", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
|
61
|
+
{version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
|
62
|
+
]
|
63
|
+
typer = "^0.12.5"
|
64
|
+
|
41
65
|
[tool.poetry.group.dev.dependencies]
|
42
66
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
43
67
|
pytest = "^7.2.2"
|
@@ -51,11 +75,31 @@ pytest-xdist = "^3.3.1"
|
|
51
75
|
types-requests = "^2.31.0.2"
|
52
76
|
flake8-pyproject = "^1.2.3"
|
53
77
|
pylint = "^2.17.5"
|
54
|
-
|
78
|
+
pandas-stubs = "^2.2.2.240909"
|
79
|
+
ipykernel = "^6.29.5"
|
80
|
+
ipywidgets = "^8.1.5"
|
81
|
+
nbqa = "^1.9.0"
|
55
82
|
|
56
83
|
[tool.poetry.group.examples.dependencies]
|
57
84
|
datasets = "^2.21.0"
|
58
85
|
|
86
|
+
[tool.poetry.extras]
|
87
|
+
examples = [
|
88
|
+
"python-dotenv",
|
89
|
+
# LlamaIndex examples:
|
90
|
+
"llama-index-embeddings-huggingface",
|
91
|
+
"llama-index-llms-huggingface-api",
|
92
|
+
"llama-index-vector-stores-milvus",
|
93
|
+
# LangChain examples:
|
94
|
+
"langchain-huggingface",
|
95
|
+
"langchain-milvus",
|
96
|
+
"langchain-text-splitters",
|
97
|
+
]
|
98
|
+
|
99
|
+
|
100
|
+
[tool.poetry.scripts]
|
101
|
+
docling = "docling.cli.main:app"
|
102
|
+
|
59
103
|
[build-system]
|
60
104
|
requires = ["poetry-core"]
|
61
105
|
build-backend = "poetry.core.masonry.api"
|
@@ -76,6 +120,14 @@ pretty = true
|
|
76
120
|
no_implicit_optional = true
|
77
121
|
python_version = "3.10"
|
78
122
|
|
123
|
+
[[tool.mypy.overrides]]
|
124
|
+
module = [
|
125
|
+
"docling_parse.*",
|
126
|
+
"pypdfium2.*",
|
127
|
+
"networkx.*",
|
128
|
+
]
|
129
|
+
ignore_missing_imports = true
|
130
|
+
|
79
131
|
[tool.flake8]
|
80
132
|
max-line-length = 88
|
81
133
|
extend-ignore = ["E203", "E501"]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|