docling 2.14.0__py3-none-any.whl → 2.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_backend.py +1 -1
- docling/backend/docling_parse_v2_backend.py +1 -1
- docling/backend/html_backend.py +2 -2
- docling/backend/mspowerpoint_backend.py +15 -11
- docling/backend/pypdfium2_backend.py +1 -1
- docling/cli/main.py +14 -2
- docling/datamodel/document.py +6 -1
- docling/datamodel/pipeline_options.py +1 -1
- docling/document_converter.py +4 -2
- docling/models/base_ocr_model.py +29 -13
- docling/models/layout_model.py +18 -25
- docling/models/table_structure_model.py +20 -0
- {docling-2.14.0.dist-info → docling-2.15.1.dist-info}/METADATA +4 -5
- {docling-2.14.0.dist-info → docling-2.15.1.dist-info}/RECORD +17 -17
- {docling-2.14.0.dist-info → docling-2.15.1.dist-info}/LICENSE +0 -0
- {docling-2.14.0.dist-info → docling-2.15.1.dist-info}/WHEEL +0 -0
- {docling-2.14.0.dist-info → docling-2.15.1.dist-info}/entry_points.txt +0 -0
@@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
132
132
|
return cells
|
133
133
|
|
134
134
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
135
|
-
AREA_THRESHOLD = 32 * 32
|
135
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
136
136
|
|
137
137
|
for i in range(len(self._dpage["images"])):
|
138
138
|
bitmap = self._dpage["images"][i]
|
@@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
140
140
|
return cells
|
141
141
|
|
142
142
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
143
|
-
AREA_THRESHOLD = 32 * 32
|
143
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
144
144
|
|
145
145
|
images = self._dpage["sanitized"]["images"]["data"]
|
146
146
|
images_header = self._dpage["sanitized"]["images"]["header"]
|
docling/backend/html_backend.py
CHANGED
@@ -37,10 +37,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
37
37
|
|
38
38
|
try:
|
39
39
|
if isinstance(self.path_or_stream, BytesIO):
|
40
|
-
text_stream = self.path_or_stream.getvalue()
|
40
|
+
text_stream = self.path_or_stream.getvalue()
|
41
41
|
self.soup = BeautifulSoup(text_stream, "html.parser")
|
42
42
|
if isinstance(self.path_or_stream, Path):
|
43
|
-
with open(self.path_or_stream, "
|
43
|
+
with open(self.path_or_stream, "rb") as f:
|
44
44
|
html_content = f.read()
|
45
45
|
self.soup = BeautifulSoup(html_content, "html.parser")
|
46
46
|
except Exception as e:
|
@@ -16,7 +16,7 @@ from docling_core.types.doc import (
|
|
16
16
|
TableCell,
|
17
17
|
TableData,
|
18
18
|
)
|
19
|
-
from PIL import Image
|
19
|
+
from PIL import Image, UnidentifiedImageError
|
20
20
|
from pptx import Presentation
|
21
21
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
22
22
|
|
@@ -120,6 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
120
120
|
bullet_type = "None"
|
121
121
|
list_text = ""
|
122
122
|
list_label = GroupLabel.LIST
|
123
|
+
doc_label = DocItemLabel.LIST_ITEM
|
123
124
|
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
124
125
|
|
125
126
|
# Identify if shape contains lists
|
@@ -276,16 +277,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
276
277
|
im_dpi, _ = image.dpi
|
277
278
|
|
278
279
|
# Open it with PIL
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
280
|
+
try:
|
281
|
+
pil_image = Image.open(BytesIO(image_bytes))
|
282
|
+
|
283
|
+
# shape has picture
|
284
|
+
prov = self.generate_prov(shape, slide_ind, "")
|
285
|
+
doc.add_picture(
|
286
|
+
parent=parent_slide,
|
287
|
+
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
|
288
|
+
caption=None,
|
289
|
+
prov=prov,
|
290
|
+
)
|
291
|
+
except (UnidentifiedImageError, OSError) as e:
|
292
|
+
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
|
289
293
|
return
|
290
294
|
|
291
295
|
def handle_tables(self, shape, parent_slide, slide_ind, doc):
|
@@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
39
39
|
return self.valid
|
40
40
|
|
41
41
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
42
|
-
AREA_THRESHOLD = 32 * 32
|
42
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
43
43
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
44
44
|
pos = obj.get_pos()
|
45
45
|
cropbox = BoundingBox.from_tuple(
|
docling/cli/main.py
CHANGED
@@ -164,6 +164,11 @@ def convert(
|
|
164
164
|
to_formats: List[OutputFormat] = typer.Option(
|
165
165
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
166
166
|
),
|
167
|
+
headers: str = typer.Option(
|
168
|
+
None,
|
169
|
+
"--headers",
|
170
|
+
help="Specify http request headers used when fetching url input sources in the form of a JSON string",
|
171
|
+
),
|
167
172
|
image_export_mode: Annotated[
|
168
173
|
ImageRefMode,
|
169
174
|
typer.Option(
|
@@ -279,12 +284,19 @@ def convert(
|
|
279
284
|
if from_formats is None:
|
280
285
|
from_formats = [e for e in InputFormat]
|
281
286
|
|
287
|
+
parsed_headers: Optional[Dict[str, str]] = None
|
288
|
+
if headers is not None:
|
289
|
+
headers_t = TypeAdapter(Dict[str, str])
|
290
|
+
parsed_headers = headers_t.validate_json(headers)
|
291
|
+
|
282
292
|
with tempfile.TemporaryDirectory() as tempdir:
|
283
293
|
input_doc_paths: List[Path] = []
|
284
294
|
for src in input_sources:
|
285
295
|
try:
|
286
296
|
# check if we can fetch some remote url
|
287
|
-
source = resolve_source_to_path(
|
297
|
+
source = resolve_source_to_path(
|
298
|
+
source=src, headers=parsed_headers, workdir=Path(tempdir)
|
299
|
+
)
|
288
300
|
input_doc_paths.append(source)
|
289
301
|
except FileNotFoundError:
|
290
302
|
err_console.print(
|
@@ -390,7 +402,7 @@ def convert(
|
|
390
402
|
start_time = time.time()
|
391
403
|
|
392
404
|
conv_results = doc_converter.convert_all(
|
393
|
-
input_doc_paths, raises_on_error=abort_on_error
|
405
|
+
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
|
394
406
|
)
|
395
407
|
|
396
408
|
output.mkdir(parents=True, exist_ok=True)
|
docling/datamodel/document.py
CHANGED
@@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
|
|
227
227
|
class _DocumentConversionInput(BaseModel):
|
228
228
|
|
229
229
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
230
|
+
headers: Optional[Dict[str, str]] = None
|
230
231
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
231
232
|
|
232
233
|
def docs(
|
233
234
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
234
235
|
) -> Iterable[InputDocument]:
|
235
236
|
for item in self.path_or_stream_iterator:
|
236
|
-
obj =
|
237
|
+
obj = (
|
238
|
+
resolve_source_to_stream(item, self.headers)
|
239
|
+
if isinstance(item, str)
|
240
|
+
else item
|
241
|
+
)
|
237
242
|
format = self._guess_format(obj)
|
238
243
|
backend: Type[AbstractDocumentBackend]
|
239
244
|
if format not in format_options.keys():
|
@@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions):
|
|
139
139
|
|
140
140
|
use_gpu: Optional[bool] = None
|
141
141
|
|
142
|
-
confidence_threshold: float = 0.
|
142
|
+
confidence_threshold: float = 0.5
|
143
143
|
|
144
144
|
model_storage_directory: Optional[str] = None
|
145
145
|
recog_network: Optional[str] = "standard"
|
docling/document_converter.py
CHANGED
@@ -176,6 +176,7 @@ class DocumentConverter:
|
|
176
176
|
def convert(
|
177
177
|
self,
|
178
178
|
source: Union[Path, str, DocumentStream], # TODO review naming
|
179
|
+
headers: Optional[Dict[str, str]] = None,
|
179
180
|
raises_on_error: bool = True,
|
180
181
|
max_num_pages: int = sys.maxsize,
|
181
182
|
max_file_size: int = sys.maxsize,
|
@@ -185,6 +186,7 @@ class DocumentConverter:
|
|
185
186
|
raises_on_error=raises_on_error,
|
186
187
|
max_num_pages=max_num_pages,
|
187
188
|
max_file_size=max_file_size,
|
189
|
+
headers=headers,
|
188
190
|
)
|
189
191
|
return next(all_res)
|
190
192
|
|
@@ -192,6 +194,7 @@ class DocumentConverter:
|
|
192
194
|
def convert_all(
|
193
195
|
self,
|
194
196
|
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
197
|
+
headers: Optional[Dict[str, str]] = None,
|
195
198
|
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
196
199
|
max_num_pages: int = sys.maxsize,
|
197
200
|
max_file_size: int = sys.maxsize,
|
@@ -201,8 +204,7 @@ class DocumentConverter:
|
|
201
204
|
max_file_size=max_file_size,
|
202
205
|
)
|
203
206
|
conv_input = _DocumentConversionInput(
|
204
|
-
path_or_stream_iterator=source,
|
205
|
-
limits=limits,
|
207
|
+
path_or_stream_iterator=source, limits=limits, headers=headers
|
206
208
|
)
|
207
209
|
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
208
210
|
|
docling/models/base_ocr_model.py
CHANGED
@@ -8,7 +8,7 @@ import numpy as np
|
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
9
|
from PIL import Image, ImageDraw
|
10
10
|
from rtree import index
|
11
|
-
from scipy.ndimage import find_objects, label
|
11
|
+
from scipy.ndimage import binary_dilation, find_objects, label
|
12
12
|
|
13
13
|
from docling.datamodel.base_models import Cell, OcrCell, Page
|
14
14
|
from docling.datamodel.document import ConversionResult
|
@@ -43,6 +43,12 @@ class BaseOcrModel(BasePageModel):
|
|
43
43
|
|
44
44
|
np_image = np.array(image)
|
45
45
|
|
46
|
+
# Dilate the image by 10 pixels to merge nearby bitmap rectangles
|
47
|
+
structure = np.ones(
|
48
|
+
(20, 20)
|
49
|
+
) # Create a 20x20 structure element (10 pixels in all directions)
|
50
|
+
np_image = binary_dilation(np_image > 0, structure=structure)
|
51
|
+
|
46
52
|
# Find the connected components
|
47
53
|
labeled_image, num_features = label(
|
48
54
|
np_image > 0
|
@@ -72,7 +78,7 @@ class BaseOcrModel(BasePageModel):
|
|
72
78
|
bitmap_rects = []
|
73
79
|
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
74
80
|
|
75
|
-
# return full-page rectangle if
|
81
|
+
# return full-page rectangle if page is dominantly covered with bitmaps
|
76
82
|
if self.options.force_full_page_ocr or coverage > max(
|
77
83
|
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
|
78
84
|
):
|
@@ -85,17 +91,11 @@ class BaseOcrModel(BasePageModel):
|
|
85
91
|
coord_origin=CoordOrigin.TOPLEFT,
|
86
92
|
)
|
87
93
|
]
|
88
|
-
# return individual rectangles if the bitmap coverage is
|
89
|
-
|
90
|
-
|
91
|
-
# skip OCR if the bitmap area on the page is smaller than the options threshold
|
92
|
-
ocr_rects = [
|
93
|
-
rect
|
94
|
-
for rect in ocr_rects
|
95
|
-
if rect.area() / (page.size.width * page.size.height)
|
96
|
-
> self.options.bitmap_area_threshold
|
97
|
-
]
|
94
|
+
# return individual rectangles if the bitmap coverage is above the threshold
|
95
|
+
elif coverage > self.options.bitmap_area_threshold:
|
98
96
|
return ocr_rects
|
97
|
+
else: # overall coverage of bitmaps is too low, drop all bitmap rectangles.
|
98
|
+
return []
|
99
99
|
|
100
100
|
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
101
101
|
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
@@ -138,18 +138,34 @@ class BaseOcrModel(BasePageModel):
|
|
138
138
|
|
139
139
|
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
140
140
|
image = copy.deepcopy(page.image)
|
141
|
+
scale_x = image.width / page.size.width
|
142
|
+
scale_y = image.height / page.size.height
|
143
|
+
|
141
144
|
draw = ImageDraw.Draw(image, "RGBA")
|
142
145
|
|
143
146
|
# Draw OCR rectangles as yellow filled rect
|
144
147
|
for rect in ocr_rects:
|
145
148
|
x0, y0, x1, y1 = rect.as_tuple()
|
149
|
+
y0 *= scale_x
|
150
|
+
y1 *= scale_y
|
151
|
+
x0 *= scale_x
|
152
|
+
x1 *= scale_x
|
153
|
+
|
146
154
|
shade_color = (255, 255, 0, 40) # transparent yellow
|
147
155
|
draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
|
148
156
|
|
149
157
|
# Draw OCR and programmatic cells
|
150
158
|
for tc in page.cells:
|
151
159
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
152
|
-
|
160
|
+
y0 *= scale_x
|
161
|
+
y1 *= scale_y
|
162
|
+
x0 *= scale_x
|
163
|
+
x1 *= scale_x
|
164
|
+
|
165
|
+
if y1 <= y0:
|
166
|
+
y1, y0 = y0, y1
|
167
|
+
|
168
|
+
color = "gray"
|
153
169
|
if isinstance(tc, OcrCell):
|
154
170
|
color = "magenta"
|
155
171
|
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
docling/models/layout_model.py
CHANGED
@@ -67,29 +67,9 @@ class LayoutModel(BasePageModel):
|
|
67
67
|
- Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
|
68
68
|
Includes label names and confidence scores for each cluster.
|
69
69
|
"""
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple
|
74
|
-
DocItemLabel.FORMULA: (192, 192, 192), # Gray
|
75
|
-
DocItemLabel.TABLE: (255, 204, 204), # Light Pink
|
76
|
-
DocItemLabel.PICTURE: (255, 204, 164), # Light Beige
|
77
|
-
DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red
|
78
|
-
DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green
|
79
|
-
DocItemLabel.PAGE_FOOTER: (
|
80
|
-
204,
|
81
|
-
255,
|
82
|
-
204,
|
83
|
-
), # Light Green (same as Page-Header)
|
84
|
-
DocItemLabel.TITLE: (255, 153, 153), # Light Red (same as Section-Header)
|
85
|
-
DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
|
86
|
-
DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
|
87
|
-
DocItemLabel.CODE: (125, 125, 125), # Gray
|
88
|
-
DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193), # Pale Green
|
89
|
-
DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193), # Light Pink
|
90
|
-
DocItemLabel.FORM: (200, 255, 255), # Light Cyan
|
91
|
-
DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange
|
92
|
-
}
|
70
|
+
scale_x = page.image.width / page.size.width
|
71
|
+
scale_y = page.image.height / page.size.height
|
72
|
+
|
93
73
|
# Filter clusters for left and right images
|
94
74
|
exclude_labels = {
|
95
75
|
DocItemLabel.FORM,
|
@@ -118,6 +98,11 @@ class LayoutModel(BasePageModel):
|
|
118
98
|
cell_color = (0, 0, 0, 40) # Transparent black for cells
|
119
99
|
for tc in c.cells:
|
120
100
|
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
|
101
|
+
cx0 *= scale_x
|
102
|
+
cx1 *= scale_x
|
103
|
+
cy0 *= scale_x
|
104
|
+
cy1 *= scale_y
|
105
|
+
|
121
106
|
draw.rectangle(
|
122
107
|
[(cx0, cy0), (cx1, cy1)],
|
123
108
|
outline=None,
|
@@ -125,8 +110,16 @@ class LayoutModel(BasePageModel):
|
|
125
110
|
)
|
126
111
|
# Draw cluster rectangle
|
127
112
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
128
|
-
|
129
|
-
|
113
|
+
x0 *= scale_x
|
114
|
+
x1 *= scale_x
|
115
|
+
y0 *= scale_x
|
116
|
+
y1 *= scale_y
|
117
|
+
|
118
|
+
cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
|
119
|
+
cluster_outline_color = (
|
120
|
+
*list(DocItemLabel.get_color(c.label)),
|
121
|
+
255,
|
122
|
+
)
|
130
123
|
draw.rectangle(
|
131
124
|
[(x0, y0), (x1, y1)],
|
132
125
|
outline=cluster_outline_color,
|
@@ -66,23 +66,43 @@ class TableStructureModel(BasePageModel):
|
|
66
66
|
show: bool = False,
|
67
67
|
):
|
68
68
|
assert page._backend is not None
|
69
|
+
assert page.size is not None
|
69
70
|
|
70
71
|
image = (
|
71
72
|
page._backend.get_page_image()
|
72
73
|
) # make new image to avoid drawing on the saved ones
|
74
|
+
|
75
|
+
scale_x = image.width / page.size.width
|
76
|
+
scale_y = image.height / page.size.height
|
77
|
+
|
73
78
|
draw = ImageDraw.Draw(image)
|
74
79
|
|
75
80
|
for table_element in tbl_list:
|
76
81
|
x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
|
82
|
+
y0 *= scale_x
|
83
|
+
y1 *= scale_y
|
84
|
+
x0 *= scale_x
|
85
|
+
x1 *= scale_x
|
86
|
+
|
77
87
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
78
88
|
|
79
89
|
for cell in table_element.cluster.cells:
|
80
90
|
x0, y0, x1, y1 = cell.bbox.as_tuple()
|
91
|
+
x0 *= scale_x
|
92
|
+
x1 *= scale_x
|
93
|
+
y0 *= scale_x
|
94
|
+
y1 *= scale_y
|
95
|
+
|
81
96
|
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
82
97
|
|
83
98
|
for tc in table_element.table_cells:
|
84
99
|
if tc.bbox is not None:
|
85
100
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
101
|
+
x0 *= scale_x
|
102
|
+
x1 *= scale_x
|
103
|
+
y0 *= scale_x
|
104
|
+
y1 *= scale_y
|
105
|
+
|
86
106
|
if tc.column_header:
|
87
107
|
width = 3
|
88
108
|
else:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.15.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
29
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
29
|
+
Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
|
30
30
|
Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
|
31
31
|
Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -45,7 +45,7 @@ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
|
45
45
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
46
46
|
Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
47
47
|
Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
|
48
|
-
Requires-Dist: requests (>=2.32.
|
48
|
+
Requires-Dist: requests (>=2.32.2,<3.0.0)
|
49
49
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
50
50
|
Requires-Dist: scipy (>=1.6.0,<2.0.0)
|
51
51
|
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
@@ -84,7 +84,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
84
84
|
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
85
85
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
86
86
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
87
|
-
* 🤖
|
87
|
+
* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
88
88
|
* 🔍 OCR support for scanned PDFs
|
89
89
|
* 💻 Simple and convenient CLI
|
90
90
|
|
@@ -94,7 +94,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
|
|
94
94
|
|
95
95
|
* ♾️ Equation & code extraction
|
96
96
|
* 📝 Metadata extraction, including title, authors, references & language
|
97
|
-
* 🦜🔗 Native LangChain extension
|
98
97
|
|
99
98
|
## Installation
|
100
99
|
|
@@ -2,39 +2,39 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
|
4
4
|
docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
|
5
|
-
docling/backend/docling_parse_backend.py,sha256=
|
6
|
-
docling/backend/docling_parse_v2_backend.py,sha256
|
7
|
-
docling/backend/html_backend.py,sha256=
|
5
|
+
docling/backend/docling_parse_backend.py,sha256=cJLkuOmfCtshRrwsv7WWayRNeMQASZv76v3nUHucqgM,7636
|
6
|
+
docling/backend/docling_parse_v2_backend.py,sha256=-lLsorxhK_Awrql_zXPen2LX0Gt9UvcDLMcmXf7_LKc,8642
|
7
|
+
docling/backend/html_backend.py,sha256=O8qXaw7MzOIdaxbBcjHieM9Ce4GEdtBj9YW0vpJspuA,15560
|
8
8
|
docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
|
9
9
|
docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
|
10
|
-
docling/backend/mspowerpoint_backend.py,sha256=
|
10
|
+
docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
|
11
11
|
docling/backend/msword_backend.py,sha256=K1D_h0ulLA6KQsPe62327cDVkQqV1f7EetCHo66wCKw,19233
|
12
12
|
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
13
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
13
|
+
docling/backend/pypdfium2_backend.py,sha256=Exb3NBp3x2YSLoNfmXq4NefShgooJXsxTXrJ4JbTzcc,9001
|
14
14
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
|
16
16
|
docling/backend/xml/uspto_backend.py,sha256=2YsnB-WRARIAaHPL6gxHePP24GQGi-Up2_K8ZapD3k4,70974
|
17
17
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
18
18
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
-
docling/cli/main.py,sha256=
|
19
|
+
docling/cli/main.py,sha256=NR7NEt8Sf3FE9D7sHpEmABM9mFMTMO5w0VPwYIIvVsk,15481
|
20
20
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
docling/datamodel/base_models.py,sha256=50Jf5zk9c4-zmnOzZLoPBnHQhTX0_OFQzIkKgnKK1o4,6229
|
22
|
-
docling/datamodel/document.py,sha256=
|
23
|
-
docling/datamodel/pipeline_options.py,sha256=
|
22
|
+
docling/datamodel/document.py,sha256=OHM6bm0a-62xnAZ8DFlMHzATmbgNcfMxQoQO2udaW5Q,13071
|
23
|
+
docling/datamodel/pipeline_options.py,sha256=wKFzw8sAim6emQGsjuS12n7FfpMo8HVNoMOPhkXTkVo,7734
|
24
24
|
docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
|
25
|
-
docling/document_converter.py,sha256=
|
25
|
+
docling/document_converter.py,sha256=_pk0sHuPXJ14NEutatf5bK2VyNiU5cvYsVbh1HIgrIw,12431
|
26
26
|
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
27
27
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
28
|
docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
|
29
|
-
docling/models/base_ocr_model.py,sha256=
|
29
|
+
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
30
30
|
docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
|
31
31
|
docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
|
32
|
-
docling/models/layout_model.py,sha256=
|
32
|
+
docling/models/layout_model.py,sha256=Xo8sclRTOO_V8Cr4RwuxB67vSWKF0LZ5nJRYU1WI--k,9063
|
33
33
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
34
34
|
docling/models/page_assemble_model.py,sha256=qdEX0AIb76ZOqJV6O9j-7r67WmuIkUlwbb2PsL7eFK4,7608
|
35
35
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
36
36
|
docling/models/rapid_ocr_model.py,sha256=LOIvczJs3_db2o8mtrKk-pIXgC-xqWqRLu2cjA3wvy4,4980
|
37
|
-
docling/models/table_structure_model.py,sha256=
|
37
|
+
docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
|
38
38
|
docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
|
39
39
|
docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
|
40
40
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -49,8 +49,8 @@ docling/utils/glm_utils.py,sha256=IB19wToGath97gD3jAA3G_rQSptnZKhQCWLvPUCnkww,11
|
|
49
49
|
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
50
50
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
51
51
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
52
|
-
docling-2.
|
53
|
-
docling-2.
|
54
|
-
docling-2.
|
55
|
-
docling-2.
|
56
|
-
docling-2.
|
52
|
+
docling-2.15.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
53
|
+
docling-2.15.1.dist-info/METADATA,sha256=6WRzA633us43nw7RHwhX_jwizh2JSpGWxNh0pJq2ZYs,7739
|
54
|
+
docling-2.15.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
55
|
+
docling-2.15.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
56
|
+
docling-2.15.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|