docling 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +6 -3
- docling/backend/docling_parse_backend.py +4 -5
- docling/backend/pypdfium2_backend.py +4 -4
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +258 -0
- docling/datamodel/base_models.py +3 -3
- docling/pipeline/base_model_pipeline.py +2 -2
- docling/utils/export.py +19 -10
- {docling-1.11.0.dist-info → docling-1.12.1.dist-info}/METADATA +20 -4
- {docling-1.11.0.dist-info → docling-1.12.1.dist-info}/RECORD +13 -10
- docling-1.12.1.dist-info/entry_points.txt +3 -0
- {docling-1.11.0.dist-info → docling-1.12.1.dist-info}/LICENSE +0 -0
- {docling-1.11.0.dist-info → docling-1.12.1.dist-info}/WHEEL +0 -0
@@ -1,10 +1,13 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Any, Iterable, Optional, Union
|
4
|
+
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
|
5
5
|
|
6
6
|
from PIL import Image
|
7
7
|
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from docling.datamodel.base_models import BoundingBox, Cell, PageSize
|
10
|
+
|
8
11
|
|
9
12
|
class PdfPageBackend(ABC):
|
10
13
|
|
@@ -17,12 +20,12 @@ class PdfPageBackend(ABC):
|
|
17
20
|
pass
|
18
21
|
|
19
22
|
@abstractmethod
|
20
|
-
def get_bitmap_rects(self,
|
23
|
+
def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
|
21
24
|
pass
|
22
25
|
|
23
26
|
@abstractmethod
|
24
27
|
def get_page_image(
|
25
|
-
self, scale:
|
28
|
+
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
|
26
29
|
) -> Image.Image:
|
27
30
|
pass
|
28
31
|
|
@@ -2,7 +2,7 @@ import logging
|
|
2
2
|
import random
|
3
3
|
from io import BytesIO
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import Iterable, Optional, Union
|
5
|
+
from typing import Iterable, List, Optional, Union
|
6
6
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
from docling_parse.docling_parse import pdf_parser
|
@@ -22,7 +22,6 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
22
22
|
self._ppage = page_obj
|
23
23
|
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
24
24
|
|
25
|
-
self._dpage = None
|
26
25
|
self.valid = "pages" in parsed_page
|
27
26
|
if self.valid:
|
28
27
|
self._dpage = parsed_page["pages"][0]
|
@@ -68,7 +67,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
68
67
|
return text_piece
|
69
68
|
|
70
69
|
def get_text_cells(self) -> Iterable[Cell]:
|
71
|
-
cells = []
|
70
|
+
cells: List[Cell] = []
|
72
71
|
cell_counter = 0
|
73
72
|
|
74
73
|
if not self.valid:
|
@@ -130,7 +129,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
130
129
|
|
131
130
|
return cells
|
132
131
|
|
133
|
-
def get_bitmap_rects(self, scale:
|
132
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
134
133
|
AREA_THRESHOLD = 32 * 32
|
135
134
|
|
136
135
|
for i in range(len(self._dpage["images"])):
|
@@ -145,7 +144,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
145
144
|
yield cropbox
|
146
145
|
|
147
146
|
def get_page_image(
|
148
|
-
self, scale:
|
147
|
+
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
149
148
|
) -> Image.Image:
|
150
149
|
|
151
150
|
page_size = self.get_size()
|
@@ -7,7 +7,7 @@ from typing import Iterable, List, Optional, Union
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
import pypdfium2.raw as pdfium_c
|
9
9
|
from PIL import Image, ImageDraw
|
10
|
-
from pypdfium2 import PdfPage
|
10
|
+
from pypdfium2 import PdfPage, PdfTextPage
|
11
11
|
from pypdfium2._helpers.misc import PdfiumError
|
12
12
|
|
13
13
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
@@ -29,12 +29,12 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
29
29
|
exc_info=True,
|
30
30
|
)
|
31
31
|
self.valid = False
|
32
|
-
self.text_page = None
|
32
|
+
self.text_page: Optional[PdfTextPage] = None
|
33
33
|
|
34
34
|
def is_valid(self) -> bool:
|
35
35
|
return self.valid
|
36
36
|
|
37
|
-
def get_bitmap_rects(self, scale:
|
37
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
38
38
|
AREA_THRESHOLD = 32 * 32
|
39
39
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
40
40
|
pos = obj.get_pos()
|
@@ -189,7 +189,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
189
189
|
return cells
|
190
190
|
|
191
191
|
def get_page_image(
|
192
|
-
self, scale:
|
192
|
+
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
193
193
|
) -> Image.Image:
|
194
194
|
|
195
195
|
page_size = self.get_size()
|
docling/cli/__init__.py
ADDED
File without changes
|
docling/cli/main.py
ADDED
@@ -0,0 +1,258 @@
|
|
1
|
+
import importlib
|
2
|
+
import json
|
3
|
+
import logging
|
4
|
+
import time
|
5
|
+
import warnings
|
6
|
+
from enum import Enum
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import Annotated, Iterable, List, Optional
|
9
|
+
|
10
|
+
import typer
|
11
|
+
from pydantic import AnyUrl
|
12
|
+
|
13
|
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
15
|
+
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
16
|
+
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
17
|
+
from docling.document_converter import DocumentConverter
|
18
|
+
|
19
|
+
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
20
|
+
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
21
|
+
|
22
|
+
_log = logging.getLogger(__name__)
|
23
|
+
from rich.console import Console
|
24
|
+
|
25
|
+
err_console = Console(stderr=True)
|
26
|
+
|
27
|
+
|
28
|
+
app = typer.Typer(
|
29
|
+
name="Docling",
|
30
|
+
no_args_is_help=True,
|
31
|
+
add_completion=False,
|
32
|
+
pretty_exceptions_enable=False,
|
33
|
+
)
|
34
|
+
|
35
|
+
|
36
|
+
def version_callback(value: bool):
    """Print the installed versions of Docling and its companion packages.

    Intended as the eager callback of the ``--version`` command-line option.

    Args:
        value: Truthy when ``--version`` was given. A falsy value turns the
            callback into a no-op so regular command processing continues.

    Raises:
        typer.Exit: Always raised after the versions have been printed, to
            stop the CLI before the command body runs.
    """
    if not value:
        return

    # A bare ``import importlib`` does not guarantee that the ``metadata``
    # submodule has been loaded; import it explicitly so the attribute
    # lookup below cannot fail with AttributeError.
    import importlib.metadata

    distributions = (
        ("Docling", "docling"),
        ("Docling Core", "docling-core"),
        ("Docling IBM Models", "docling-ibm-models"),
        ("Docling Parse", "docling-parse"),
    )
    # Resolve every version before printing anything, so a missing
    # distribution fails before any partial output is produced.
    resolved = [
        (label, importlib.metadata.version(dist_name))
        for label, dist_name in distributions
    ]
    for label, dist_version in resolved:
        print(f"{label} version: {dist_version}")
    raise typer.Exit()
|
47
|
+
|
48
|
+
|
49
|
+
class Backend(str, Enum):
    """PDF parsing backends that can be selected from the command line.

    The members are also ``str`` instances, so each one compares equal to
    its plain string value.
    """

    PYPDFIUM2 = "pypdfium2"
    DOCLING = "docling"
|
53
|
+
|
54
|
+
|
55
|
+
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
    export_json: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
):
    """Write every successfully converted document to ``output_dir``.

    For each result whose status is ``ConversionStatus.SUCCESS`` one file per
    enabled format is written, named after the stem of the input file
    (``<stem>.json``, ``<stem>.txt``, ``<stem>.md``, ``<stem>.doctags``).
    Results with any other status are logged as warnings and counted as
    failures. A summary line is logged at the end.

    Args:
        conv_results: Conversion results to export; iterated exactly once.
        output_dir: Directory the files are written into. It is not created
            here.
        export_json: Write the ``render_as_dict()`` output serialized as JSON.
        export_md: Write the ``render_as_markdown()`` output.
        export_txt: Write the ``render_as_text()`` output.
        export_doctags: Write the ``render_as_doctags()`` output.
    """
    # (enabled, file suffix, label used in the log message, renderer).
    # The order matches the order in which the files are written.
    exporters = (
        (export_json, "json", "JSON", lambda res: json.dumps(res.render_as_dict())),
        (export_txt, "txt", "Text", lambda res: res.render_as_text()),
        (export_md, "md", "Markdown", lambda res: res.render_as_markdown()),
        (export_doctags, "doctags", "Doc Tags", lambda res: res.render_as_doctags()),
    )

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        doc_filename = conv_res.input.file.stem

        for enabled, suffix, label, render in exporters:
            if not enabled:
                continue
            fname = output_dir / f"{doc_filename}.{suffix}"
            # Explicit UTF-8: the locale default encoding is
            # platform-dependent and can fail on non-ASCII document text.
            with fname.open("w", encoding="utf-8") as fp:
                _log.info(f"writing {label} output to {fname}")
                fp.write(render(conv_res))

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )
|
107
|
+
|
108
|
+
|
109
|
+
# NOTE: Typer shows the function docstring as the command's help text, so it
# is kept short and user-facing; implementation notes live in comments.
@app.command(no_args_is_help=True)
def convert(
    input_sources: Annotated[
        List[Path],
        typer.Argument(
            ...,
            metavar="source",
            help="PDF files to convert. Directories are also accepted.",
        ),
    ],
    export_json: Annotated[
        bool,
        typer.Option(
            ..., "--json/--no-json", help="If enabled the document is exported as JSON."
        ),
    ] = False,
    export_md: Annotated[
        bool,
        typer.Option(
            ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
        ),
    ] = True,
    export_txt: Annotated[
        bool,
        typer.Option(
            ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
        ),
    ] = False,
    export_doctags: Annotated[
        bool,
        typer.Option(
            ...,
            "--doctags/--no-doctags",
            help="If enabled the document is exported as Doc Tags.",
        ),
    ] = False,
    ocr: Annotated[
        bool,
        typer.Option(
            ..., help="If enabled, the bitmap content will be processed using OCR."
        ),
    ] = True,
    backend: Annotated[
        Backend, typer.Option(..., help="The PDF backend to use.")
    ] = Backend.DOCLING,
    output: Annotated[
        Path, typer.Option(..., help="Output directory where results are saved.")
    ] = Path("."),
    version: Annotated[
        Optional[bool],
        typer.Option(
            "--version",
            callback=version_callback,
            is_eager=True,
            help="Show version information.",
        ),
    ] = None,
):
    """Convert PDF documents and export the results to the output directory."""
    # ``version`` is not read here: it is handled by its eager callback.
    logging.basicConfig(level=logging.INFO)

    # Expand the given sources into a flat list of PDF paths.
    input_doc_paths: List[Path] = []
    for source in input_sources:
        if not source.exists():
            err_console.print(
                f"[red]Error: The input file {source} does not exist.[/red]"
            )
            raise typer.Abort()
        if source.is_dir():
            # One case-insensitive pass instead of separate "*.pdf" and
            # "*.PDF" globs, which return every file twice where glob
            # matching is case-insensitive (e.g. on Windows).
            input_doc_paths.extend(
                path
                for path in source.rglob("*")
                if path.is_file() and path.suffix.lower() == ".pdf"
            )
        else:
            input_doc_paths.append(source)

    # Choose the PDF backend. With pypdfium2, table cell matching is only
    # enabled together with OCR; the docling-parse backend always uses it.
    if backend == Backend.PYPDFIUM2:
        pdf_backend = PyPdfiumDocumentBackend
        do_cell_matching = ocr
    else:  # Backend.DOCLING, the only other member of the enum
        pdf_backend = DoclingParseDocumentBackend
        do_cell_matching = True

    pipeline_options = PipelineOptions()
    # Honour the --ocr/--no-ocr flag for every backend (the pypdfium2 + OCR
    # combination previously left OCR switched off).
    pipeline_options.do_ocr = ocr
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=pdf_backend,
    )

    # Define input files
    conv_input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(conv_input)

    output.mkdir(parents=True, exist_ok=True)
    export_documents(
        conv_results,
        output_dir=output,
        export_json=export_json,
        export_md=export_md,
        export_txt=export_txt,
        export_doctags=export_doctags,
    )

    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")
|
255
|
+
|
256
|
+
|
257
|
+
if __name__ == "__main__":
|
258
|
+
app()
|
docling/datamodel/base_models.py
CHANGED
@@ -87,7 +87,7 @@ class BoundingBox(BaseModel):
|
|
87
87
|
return (self.l, self.b, self.r, self.t)
|
88
88
|
|
89
89
|
@classmethod
|
90
|
-
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
|
90
|
+
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
|
91
91
|
if origin == CoordOrigin.TOPLEFT:
|
92
92
|
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
93
93
|
if r < l:
|
@@ -246,7 +246,7 @@ class EquationPrediction(BaseModel):
|
|
246
246
|
|
247
247
|
|
248
248
|
class PagePredictions(BaseModel):
|
249
|
-
layout: LayoutPrediction = None
|
249
|
+
layout: Optional[LayoutPrediction] = None
|
250
250
|
tablestructure: Optional[TableStructurePrediction] = None
|
251
251
|
figures_classification: Optional[FigureClassificationPrediction] = None
|
252
252
|
equations_prediction: Optional[EquationPrediction] = None
|
@@ -267,7 +267,7 @@ class Page(BaseModel):
|
|
267
267
|
page_no: int
|
268
268
|
page_hash: Optional[str] = None
|
269
269
|
size: Optional[PageSize] = None
|
270
|
-
cells: List[Cell] =
|
270
|
+
cells: List[Cell] = []
|
271
271
|
predictions: PagePredictions = PagePredictions()
|
272
272
|
assembled: Optional[AssembledUnit] = None
|
273
273
|
|
@@ -1,12 +1,12 @@
|
|
1
1
|
from pathlib import Path
|
2
|
-
from typing import Iterable
|
2
|
+
from typing import Callable, Iterable, List
|
3
3
|
|
4
4
|
from docling.datamodel.base_models import Page, PipelineOptions
|
5
5
|
|
6
6
|
|
7
7
|
class BaseModelPipeline:
|
8
8
|
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
9
|
-
self.model_pipe = []
|
9
|
+
self.model_pipe: List[Callable] = []
|
10
10
|
self.artifacts_path = artifacts_path
|
11
11
|
self.pipeline_options = pipeline_options
|
12
12
|
|
docling/utils/export.py
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
import logging
|
2
|
-
from typing import Any, Dict, Iterable, List, Tuple
|
2
|
+
from typing import Any, Dict, Iterable, List, Tuple, Union
|
3
3
|
|
4
|
-
from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
|
4
|
+
from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
|
5
5
|
|
6
6
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
|
7
|
-
from docling.datamodel.document import
|
7
|
+
from docling.datamodel.document import ConversionResult, Page
|
8
8
|
|
9
9
|
_log = logging.getLogger(__name__)
|
10
10
|
|
@@ -15,7 +15,10 @@ def _export_table_to_html(table: Table):
|
|
15
15
|
# to the docling-core package.
|
16
16
|
|
17
17
|
def _get_tablecell_span(cell: TableCell, ix):
|
18
|
-
|
18
|
+
if cell.spans is None:
|
19
|
+
span = set()
|
20
|
+
else:
|
21
|
+
span = set([s[ix] for s in cell.spans])
|
19
22
|
if len(span) == 0:
|
20
23
|
return 1, None, None
|
21
24
|
return len(span), min(span), max(span)
|
@@ -24,6 +27,8 @@ def _export_table_to_html(table: Table):
|
|
24
27
|
nrows = table.num_rows
|
25
28
|
ncols = table.num_cols
|
26
29
|
|
30
|
+
if table.data is None:
|
31
|
+
return ""
|
27
32
|
for i in range(nrows):
|
28
33
|
body += "<tr>"
|
29
34
|
for j in range(ncols):
|
@@ -66,7 +71,7 @@ def _export_table_to_html(table: Table):
|
|
66
71
|
|
67
72
|
|
68
73
|
def generate_multimodal_pages(
|
69
|
-
doc_result:
|
74
|
+
doc_result: ConversionResult,
|
70
75
|
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
|
71
76
|
|
72
77
|
label_to_doclaynet = {
|
@@ -94,7 +99,7 @@ def generate_multimodal_pages(
|
|
94
99
|
page_no = 0
|
95
100
|
start_ix = 0
|
96
101
|
end_ix = 0
|
97
|
-
doc_items = []
|
102
|
+
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
|
98
103
|
|
99
104
|
doc = doc_result.output
|
100
105
|
|
@@ -105,11 +110,11 @@ def generate_multimodal_pages(
|
|
105
110
|
item_type = item.obj_type
|
106
111
|
label = label_to_doclaynet.get(item_type, None)
|
107
112
|
|
108
|
-
if label is None:
|
113
|
+
if label is None or item.prov is None or page.size is None:
|
109
114
|
continue
|
110
115
|
|
111
116
|
bbox = BoundingBox.from_tuple(
|
112
|
-
item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
|
117
|
+
tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
|
113
118
|
)
|
114
119
|
new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
|
115
120
|
page_size=page.size
|
@@ -137,13 +142,15 @@ def generate_multimodal_pages(
|
|
137
142
|
return segments
|
138
143
|
|
139
144
|
def _process_page_cells(page: Page):
|
140
|
-
cells = []
|
145
|
+
cells: List[dict] = []
|
146
|
+
if page.size is None:
|
147
|
+
return cells
|
141
148
|
for cell in page.cells:
|
142
149
|
new_bbox = cell.bbox.to_top_left_origin(
|
143
150
|
page_height=page.size.height
|
144
151
|
).normalized(page_size=page.size)
|
145
152
|
is_ocr = isinstance(cell, OcrCell)
|
146
|
-
ocr_confidence = cell.confidence if
|
153
|
+
ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
|
147
154
|
cells.append(
|
148
155
|
{
|
149
156
|
"text": cell.text,
|
@@ -170,6 +177,8 @@ def generate_multimodal_pages(
|
|
170
177
|
|
171
178
|
return content_text, content_md, content_dt, page_cells, page_segments, page
|
172
179
|
|
180
|
+
if doc.main_text is None:
|
181
|
+
return
|
173
182
|
for ix, orig_item in enumerate(doc.main_text):
|
174
183
|
|
175
184
|
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.12.1
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -19,21 +19,34 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.11
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
+
Provides-Extra: examples
|
22
23
|
Requires-Dist: certifi (>=2024.7.4)
|
23
24
|
Requires-Dist: deepsearch-glm (>=0.21.0,<0.22.0)
|
24
|
-
Requires-Dist: docling-core (>=1.
|
25
|
+
Requires-Dist: docling-core (>=1.3.0,<2.0.0)
|
25
26
|
Requires-Dist: docling-ibm-models (>=1.1.7,<2.0.0)
|
26
27
|
Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
|
27
28
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
28
29
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
29
30
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
31
|
+
Requires-Dist: langchain-huggingface (>=0.0.3,<0.0.4) ; extra == "examples"
|
32
|
+
Requires-Dist: langchain-milvus (>=0.1.4,<0.2.0) ; extra == "examples"
|
33
|
+
Requires-Dist: langchain-text-splitters (>=0.2.4,<0.3.0) ; extra == "examples"
|
34
|
+
Requires-Dist: llama-index-embeddings-huggingface (>=0.3.1,<0.4.0) ; extra == "examples"
|
35
|
+
Requires-Dist: llama-index-llms-huggingface-api (>=0.2.0,<0.3.0) ; extra == "examples"
|
36
|
+
Requires-Dist: llama-index-vector-stores-milvus (>=0.2.1,<0.3.0) ; extra == "examples"
|
30
37
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
31
38
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
32
39
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
33
40
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
41
|
+
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0) ; extra == "examples"
|
34
42
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
35
43
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
36
44
|
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
45
|
+
Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
46
|
+
Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
47
|
+
Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
48
|
+
Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
49
|
+
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
37
50
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
38
51
|
Description-Content-Type: text/markdown
|
39
52
|
|
@@ -62,8 +75,7 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
|
|
62
75
|
* 📑 Understands detailed page layout, reading order and recovers table structures
|
63
76
|
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
64
77
|
* 🔍 Optionally applies OCR (use with scanned PDFs)
|
65
|
-
|
66
|
-
For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
|
78
|
+
* 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
|
67
79
|
|
68
80
|
## Installation
|
69
81
|
|
@@ -182,6 +194,10 @@ results = doc_converter.convert(conv_input)
|
|
182
194
|
|
183
195
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
184
196
|
|
197
|
+
### RAG
|
198
|
+
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
|
199
|
+
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
|
200
|
+
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
|
185
201
|
|
186
202
|
## Technical report
|
187
203
|
|
@@ -1,10 +1,12 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
docling/backend/abstract_backend.py,sha256=
|
4
|
-
docling/backend/docling_parse_backend.py,sha256=
|
5
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
3
|
+
docling/backend/abstract_backend.py,sha256=clJtGxLedpLriEhpx7oyxjmlwMLPorkv-1tdfZm9GdA,1546
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=RUWWZbx2cUotZeeTkc-Lbg2k8MVFXFxaDjM4sPfaFZE,7475
|
5
|
+
docling/backend/pypdfium2_backend.py,sha256=bIIImVM73wmcVcKMqjl4JF8CD-Qj2W5rZbI4G7clU4s,8877
|
6
|
+
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
docling/cli/main.py,sha256=VUzm4vOijPo2F2Ht20zTnMI5alJLixfC5WK2NJCbyng,8492
|
6
8
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
docling/datamodel/base_models.py,sha256=
|
9
|
+
docling/datamodel/base_models.py,sha256=tE2Sxoe3e_fBZjq3GDo2NCughDMU5xDeAfkQgT72TRI,9168
|
8
10
|
docling/datamodel/document.py,sha256=oXPitPRd9Gyi7ZU4kfEc4K9eMVtTJDx1T-ellTwF3Ak,15716
|
9
11
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
12
|
docling/document_converter.py,sha256=5OiNafoaVcQhZ8ATF69xRp2KyFyKeSMhmwEFUoCzP-k,10980
|
@@ -16,13 +18,14 @@ docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvV
|
|
16
18
|
docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
|
17
19
|
docling/models/table_structure_model.py,sha256=0wOeiRoma6et7FtoJZw2SA3wBd9-R9ivp5uvXBQqeM4,5768
|
18
20
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
-
docling/pipeline/base_model_pipeline.py,sha256=
|
21
|
+
docling/pipeline/base_model_pipeline.py,sha256=H5XoADpsJEZls8BI3FnppR2ubltkQwf_er4Qr74rdQ8,561
|
20
22
|
docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
|
21
23
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
-
docling/utils/export.py,sha256=
|
24
|
+
docling/utils/export.py,sha256=ast5p8YgPBwaDx5ClOF1iSJHO8BFEWE3EBBsUiD9MIQ,6474
|
23
25
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
24
26
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
25
|
-
docling-1.
|
26
|
-
docling-1.
|
27
|
-
docling-1.
|
28
|
-
docling-1.
|
27
|
+
docling-1.12.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
28
|
+
docling-1.12.1.dist-info/METADATA,sha256=uOBuBvm3hx7K2IS6_iONhO4W-pAywg6kFWgwd106m9k,9544
|
29
|
+
docling-1.12.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
30
|
+
docling-1.12.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
31
|
+
docling-1.12.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|