docling 2.27.0__py3-none-any.whl → 2.28.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v4_backend.py +20 -13
- docling/backend/mspowerpoint_backend.py +18 -0
- docling/backend/msword_backend.py +56 -14
- docling/cli/main.py +81 -38
- docling/datamodel/pipeline_options.py +28 -2
- docling/document_converter.py +29 -17
- docling/models/hf_mlx_model.py +137 -0
- docling/models/page_preprocessing_model.py +7 -1
- docling/pipeline/vlm_pipeline.py +78 -398
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/METADATA +27 -32
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/RECORD +14 -13
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/LICENSE +0 -0
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/WHEEL +0 -0
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/entry_points.txt +0 -0
docling/pipeline/vlm_pipeline.py
CHANGED
@@ -1,30 +1,13 @@
-import itertools
 import logging
-import re
 import warnings
 from io import BytesIO
-
-# from io import BytesIO
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Union, cast
 
-from docling_core.types import DoclingDocument
-from docling_core.types.doc import (
-    BoundingBox,
-    DocItem,
-    DocItemLabel,
-    DoclingDocument,
-    GroupLabel,
-    ImageRef,
-    ImageRefMode,
-    PictureItem,
-    ProvenanceItem,
-    Size,
-    TableCell,
-    TableData,
-    TableItem,
-)
-from docling_core.types.doc.tokens import DocumentToken, TableToken
+# from docling_core.types import DoclingDocument
+from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
+from docling_core.types.doc.document import DocTagsDocument
+from PIL import Image as PILImage
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
@@ -32,11 +15,12 @@ from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
-
+    InferenceFramework,
     ResponseFormat,
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
+from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -50,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
         super().__init__(pipeline_options)
         self.keep_backend = True
 
-        warnings.warn(
-            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
-            category=UserWarning,
-            stacklevel=2,
-        )
-
         self.pipeline_options: VlmPipelineOptions
 
         artifacts_path: Optional[Path] = None
@@ -79,14 +57,27 @@
 
         self.keep_images = self.pipeline_options.generate_page_images
 
-        self.build_pipe = [
-            HuggingFaceVlmModel(
-                enabled=True,  # must be always enabled for this pipeline to make sense.
-                artifacts_path=artifacts_path,
-                accelerator_options=pipeline_options.accelerator_options,
-                vlm_options=self.pipeline_options.vlm_options,
-            ),
-        ]
+        if (
+            self.pipeline_options.vlm_options.inference_framework
+            == InferenceFramework.MLX
+        ):
+            self.build_pipe = [
+                HuggingFaceMlxModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=self.pipeline_options.vlm_options,
+                ),
+            ]
+        else:
+            self.build_pipe = [
+                HuggingFaceVlmModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=self.pipeline_options.vlm_options,
+                ),
+            ]
 
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
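The framework dispatch above keys entirely off `vlm_options.inference_framework`, so a caller opts into the MLX build stage simply by setting that field. A minimal sketch of wiring the pipeline into a conversion, assuming `PdfFormatOption` and its `pipeline_cls` parameter from `docling.document_converter` (not shown in this diff):

```python
# Sketch only: route PDF conversion through the VLM pipeline.
# PdfFormatOption / pipeline_cls are assumptions not shown in this diff.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Default options; vlm_options.inference_framework selects the MLX or
# transformers build stage in VlmPipeline.__init__ above.
pipeline_options = VlmPipelineOptions()

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("https://arxiv.org/pdf/2206.01062")
print(result.document.export_to_markdown())
```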
@@ -100,6 +91,17 @@
 
         return page
 
+    def extract_text_from_backend(
+        self, page: Page, bbox: Union[BoundingBox, None]
+    ) -> str:
+        # Convert bounding box normalized to 0-100 into page coordinates for cropping
+        text = ""
+        if bbox:
+            if page.size:
+                if page._backend:
+                    text = page._backend.get_text_in_rect(bbox)
+        return text
+
     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
 
@@ -107,7 +109,45 @@
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.DOCTAGS
             ):
-                conv_res.document = self._turn_tags_into_doc(conv_res.pages)
+                doctags_list = []
+                image_list = []
+                for page in conv_res.pages:
+                    predicted_doctags = ""
+                    img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
+                    if page.predictions.vlm_response:
+                        predicted_doctags = page.predictions.vlm_response.text
+                    if page.image:
+                        img = page.image
+                    image_list.append(img)
+                    doctags_list.append(predicted_doctags)
+
+                doctags_list_c = cast(List[Union[Path, str]], doctags_list)
+                image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
+                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+                    doctags_list_c, image_list_c
+                )
+                conv_res.document.load_from_doctags(doctags_doc)
+
+                # If forced backend text, replace model predicted text with backend one
+                if page.size:
+                    if self.force_backend_text:
+                        scale = self.pipeline_options.images_scale
+                        for element, _level in conv_res.document.iterate_items():
+                            if (
+                                not isinstance(element, TextItem)
+                                or len(element.prov) == 0
+                            ):
+                                continue
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )
+                            txt = self.extract_text_from_backend(page, crop_bbox)
+                            element.text = txt
+                            element.orig = txt
             elif (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.MARKDOWN
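Two details of the DOCTAGS branch above are easy to miss: `extract_text_from_backend` no longer rescales the box itself (despite its carried-over comment), and the caller therefore converts each provenance box to scaled top-left page coordinates via `scaled()` and `to_top_left_origin()` before extracting text. A small worked example of that conversion, with illustrative numbers only:

```python
# Worked example of the crop_bbox conversion used above.
# Values are illustrative; BoundingBox/CoordOrigin come from docling-core.
from docling_core.types.doc import BoundingBox, CoordOrigin

page_height = 800.0
scale = 2.0

# A provenance bbox as stored on a TextItem (bottom-left origin).
bbox = BoundingBox(l=100, t=700, r=300, b=650, coord_origin=CoordOrigin.BOTTOMLEFT)

# scaled() multiplies all four coordinates; to_top_left_origin() flips the
# vertical axis against the scaled page height.
crop_bbox = bbox.scaled(scale=scale).to_top_left_origin(page_height=page_height * scale)

# crop_bbox is now l=200.0, t=200.0, r=600.0, b=300.0 in top-left origin,
# the coordinate system extract_text_from_backend hands to get_text_in_rect().
```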
@@ -165,366 +205,6 @@
             )
             return backend.convert()
 
-    def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
-        ###############################################
-        # Tag definitions and color mappings
-        ###############################################
-
-        # Maps the recognized tag to a Docling label.
-        # Code items will be given DocItemLabel.CODE
-        tag_to_doclabel = {
-            "title": DocItemLabel.TITLE,
-            "document_index": DocItemLabel.DOCUMENT_INDEX,
-            "otsl": DocItemLabel.TABLE,
-            "section_header_level_1": DocItemLabel.SECTION_HEADER,
-            "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
-            "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
-            "text": DocItemLabel.TEXT,
-            "page_header": DocItemLabel.PAGE_HEADER,
-            "page_footer": DocItemLabel.PAGE_FOOTER,
-            "formula": DocItemLabel.FORMULA,
-            "caption": DocItemLabel.CAPTION,
-            "picture": DocItemLabel.PICTURE,
-            "list_item": DocItemLabel.LIST_ITEM,
-            "footnote": DocItemLabel.FOOTNOTE,
-            "code": DocItemLabel.CODE,
-        }
-
-        # Maps each tag to an associated bounding box color.
-        tag_to_color = {
-            "title": "blue",
-            "document_index": "darkblue",
-            "otsl": "green",
-            "section_header_level_1": "purple",
-            "checkbox_selected": "black",
-            "checkbox_unselected": "gray",
-            "text": "red",
-            "page_header": "orange",
-            "page_footer": "cyan",
-            "formula": "pink",
-            "caption": "magenta",
-            "picture": "yellow",
-            "list_item": "brown",
-            "footnote": "darkred",
-            "code": "lightblue",
-        }
-
-        def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
-            """Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
-            coords = re.findall(r"<loc_(\d+)>", text_chunk)
-            if len(coords) == 4:
-                l, t, r, b = map(float, coords)
-                return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
-            return None
-
-        def extract_inner_text(text_chunk: str) -> str:
-            """Strips all <...> tags inside the chunk to get the raw text content."""
-            return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
-
-        def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
-            # Convert bounding box normalized to 0-100 into page coordinates for cropping
-            text = ""
-            if bbox:
-                if page.size:
-                    bbox.l = bbox.l * page.size.width
-                    bbox.t = bbox.t * page.size.height
-                    bbox.r = bbox.r * page.size.width
-                    bbox.b = bbox.b * page.size.height
-                    if page._backend:
-                        text = page._backend.get_text_in_rect(bbox)
-            return text
-
-        def otsl_parse_texts(texts, tokens):
-            split_word = TableToken.OTSL_NL.value
-            split_row_tokens = [
-                list(y)
-                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
-                if not x
-            ]
-            table_cells = []
-            r_idx = 0
-            c_idx = 0
-
-            def count_right(tokens, c_idx, r_idx, which_tokens):
-                span = 0
-                c_idx_iter = c_idx
-                while tokens[r_idx][c_idx_iter] in which_tokens:
-                    c_idx_iter += 1
-                    span += 1
-                    if c_idx_iter >= len(tokens[r_idx]):
-                        return span
-                return span
-
-            def count_down(tokens, c_idx, r_idx, which_tokens):
-                span = 0
-                r_idx_iter = r_idx
-                while tokens[r_idx_iter][c_idx] in which_tokens:
-                    r_idx_iter += 1
-                    span += 1
-                    if r_idx_iter >= len(tokens):
-                        return span
-                return span
-
-            for i, text in enumerate(texts):
-                cell_text = ""
-                if text in [
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                ]:
-                    row_span = 1
-                    col_span = 1
-                    right_offset = 1
-                    if text != TableToken.OTSL_ECEL.value:
-                        cell_text = texts[i + 1]
-                        right_offset = 2
-
-                    # Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
-                    next_right_cell = ""
-                    if i + right_offset < len(texts):
-                        next_right_cell = texts[i + right_offset]
-
-                    next_bottom_cell = ""
-                    if r_idx + 1 < len(split_row_tokens):
-                        if c_idx < len(split_row_tokens[r_idx + 1]):
-                            next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
-
-                    if next_right_cell in [
-                        TableToken.OTSL_LCEL.value,
-                        TableToken.OTSL_XCEL.value,
-                    ]:
-                        # we have horisontal spanning cell or 2d spanning cell
-                        col_span += count_right(
-                            split_row_tokens,
-                            c_idx + 1,
-                            r_idx,
-                            [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
-                        )
-                    if next_bottom_cell in [
-                        TableToken.OTSL_UCEL.value,
-                        TableToken.OTSL_XCEL.value,
-                    ]:
-                        # we have a vertical spanning cell or 2d spanning cell
-                        row_span += count_down(
-                            split_row_tokens,
-                            c_idx,
-                            r_idx + 1,
-                            [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
-                        )
-
-                    table_cells.append(
-                        TableCell(
-                            text=cell_text.strip(),
-                            row_span=row_span,
-                            col_span=col_span,
-                            start_row_offset_idx=r_idx,
-                            end_row_offset_idx=r_idx + row_span,
-                            start_col_offset_idx=c_idx,
-                            end_col_offset_idx=c_idx + col_span,
-                        )
-                    )
-                if text in [
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                    TableToken.OTSL_LCEL.value,
-                    TableToken.OTSL_UCEL.value,
-                    TableToken.OTSL_XCEL.value,
-                ]:
-                    c_idx += 1
-                if text == TableToken.OTSL_NL.value:
-                    r_idx += 1
-                    c_idx = 0
-            return table_cells, split_row_tokens
-
-        def otsl_extract_tokens_and_text(s: str):
-            # Pattern to match anything enclosed by < > (including the angle brackets themselves)
-            pattern = r"(<[^>]+>)"
-            # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
-            tokens = re.findall(pattern, s)
-            # Remove any tokens that start with "<loc_"
-            tokens = [
-                token
-                for token in tokens
-                if not (
-                    token.startswith(rf"<{DocumentToken.LOC.value}")
-                    or token
-                    in [
-                        rf"<{DocumentToken.OTSL.value}>",
-                        rf"</{DocumentToken.OTSL.value}>",
-                    ]
-                )
-            ]
-            # Split the string by those tokens to get the in-between text
-            text_parts = re.split(pattern, s)
-            text_parts = [
-                token
-                for token in text_parts
-                if not (
-                    token.startswith(rf"<{DocumentToken.LOC.value}")
-                    or token
-                    in [
-                        rf"<{DocumentToken.OTSL.value}>",
-                        rf"</{DocumentToken.OTSL.value}>",
-                    ]
-                )
-            ]
-            # Remove any empty or purely whitespace strings from text_parts
-            text_parts = [part for part in text_parts if part.strip()]
-
-            return tokens, text_parts
-
-        def parse_table_content(otsl_content: str) -> TableData:
-            tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
-            table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
-
-            return TableData(
-                num_rows=len(split_row_tokens),
-                num_cols=(
-                    max(len(row) for row in split_row_tokens) if split_row_tokens else 0
-                ),
-                table_cells=table_cells,
-            )
-
-        doc = DoclingDocument(name="Document")
-        for pg_idx, page in enumerate(pages):
-            xml_content = ""
-            predicted_text = ""
-            if page.predictions.vlm_response:
-                predicted_text = page.predictions.vlm_response.text
-            image = page.image
-
-            page_no = pg_idx + 1
-            bounding_boxes = []
-
-            if page.size:
-                pg_width = page.size.width
-                pg_height = page.size.height
-                size = Size(width=pg_width, height=pg_height)
-                parent_page = doc.add_page(page_no=page_no, size=size)
-
-            """
-            1. Finds all <tag>...</tag> blocks in the entire string (multi-line friendly) in the order they appear.
-            2. For each chunk, extracts bounding box (if any) and inner text.
-            3. Adds the item to a DoclingDocument structure with the right label.
-            4. Tracks bounding boxes + color in a separate list for later visualization.
-            """
-
-            # Regex for all recognized tags
-            tag_pattern = (
-                rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
-                rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
-                rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
-                rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
-                rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
-                rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
-                rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
-            )
-
-            # DocumentToken.OTSL
-            pattern = re.compile(tag_pattern, re.DOTALL)
-
-            # Go through each match in order
-            for match in pattern.finditer(predicted_text):
-                full_chunk = match.group(0)
-                tag_name = match.group("tag")
-
-                bbox = extract_bounding_box(full_chunk)
-                doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
-                color = tag_to_color.get(tag_name, "white")
-
-                # Store bounding box + color
-                if bbox:
-                    bounding_boxes.append((bbox, color))
-
-                if tag_name == DocumentToken.OTSL.value:
-                    table_data = parse_table_content(full_chunk)
-                    bbox = extract_bounding_box(full_chunk)
-
-                    if bbox:
-                        prov = ProvenanceItem(
-                            bbox=bbox.resize_by_scale(pg_width, pg_height),
-                            charspan=(0, 0),
-                            page_no=page_no,
-                        )
-                        doc.add_table(data=table_data, prov=prov)
-                    else:
-                        doc.add_table(data=table_data)
-
-                elif tag_name == DocItemLabel.PICTURE:
-                    text_caption_content = extract_inner_text(full_chunk)
-                    if image:
-                        if bbox:
-                            im_width, im_height = image.size
-
-                            crop_box = (
-                                int(bbox.l * im_width),
-                                int(bbox.t * im_height),
-                                int(bbox.r * im_width),
-                                int(bbox.b * im_height),
-                            )
-                            cropped_image = image.crop(crop_box)
-                            pic = doc.add_picture(
-                                parent=None,
-                                image=ImageRef.from_pil(image=cropped_image, dpi=72),
-                                prov=(
-                                    ProvenanceItem(
-                                        bbox=bbox.resize_by_scale(pg_width, pg_height),
-                                        charspan=(0, 0),
-                                        page_no=page_no,
-                                    )
-                                ),
-                            )
-                            # If there is a caption to an image, add it as well
-                            if len(text_caption_content) > 0:
-                                caption_item = doc.add_text(
-                                    label=DocItemLabel.CAPTION,
-                                    text=text_caption_content,
-                                    parent=None,
-                                )
-                                pic.captions.append(caption_item.get_ref())
-                    else:
-                        if bbox:
-                            # In case we don't have access to an binary of an image
-                            doc.add_picture(
-                                parent=None,
-                                prov=ProvenanceItem(
-                                    bbox=bbox, charspan=(0, 0), page_no=page_no
-                                ),
-                            )
-                            # If there is a caption to an image, add it as well
-                            if len(text_caption_content) > 0:
-                                caption_item = doc.add_text(
-                                    label=DocItemLabel.CAPTION,
-                                    text=text_caption_content,
-                                    parent=None,
-                                )
-                                pic.captions.append(caption_item.get_ref())
-                else:
-                    # For everything else, treat as text
-                    if self.force_backend_text:
-                        text_content = extract_text_from_backend(page, bbox)
-                    else:
-                        text_content = extract_inner_text(full_chunk)
-                    doc.add_text(
-                        label=doc_label,
-                        text=text_content,
-                        prov=(
-                            ProvenanceItem(
-                                bbox=bbox.resize_by_scale(pg_width, pg_height),
-                                charspan=(0, len(text_content)),
-                                page_no=page_no,
-                            )
-                            if bbox
-                            else None
-                        ),
-                    )
-        return doc
-
     @classmethod
     def get_default_options(cls) -> VlmPipelineOptions:
         return VlmPipelineOptions()
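Taken together, the changes above swap the hand-rolled regex/OTSL parser for docling-core's DocTags support. A minimal, self-contained sketch of the same round trip in isolation; the DocTags string below is a hypothetical example, not captured model output:

```python
# Sketch of the DocTags round trip now used by _assemble_document.
# The doctags string is a hypothetical example of SmolDocling-style output.
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from PIL import Image as PILImage

doctags = "<doctag><text><loc_10><loc_10><loc_90><loc_20>Hello world</text></doctag>"
page_image = PILImage.new("RGB", (640, 480), "white")  # stand-in for page.image

# Pair each page's DocTags string with its page image, then materialize
# a DoclingDocument from the pairs.
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [page_image])
doc = DoclingDocument(name="Document")
doc.load_from_doctags(doctags_doc)
print(doc.export_to_markdown())
```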
{docling-2.27.0.dist-info → docling-2.28.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.27.0
+Version: 2.28.1
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
 Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: docling-core[chunking] (>=2.23.0,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.23.1,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
 Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
@@ -86,6 +86,7 @@ Description-Content-Type: text/markdown
 [](https://opensource.org/licenses/MIT)
 [](https://pepy.tech/projects/docling)
 [](https://apify.com/vancura/docling)
+[](https://lfaidata.foundation/projects/)
 
 Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
 
@@ -98,12 +99,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 🔒 Local execution capabilities for sensitive data and air-gapped environments
 * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 Extensive OCR support for scanned PDFs and images
+* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
 * 💻 Simple and convenient CLI
 
 ### Coming soon
 
 * 📝 Metadata extraction, including title, authors, references & language
-* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
 
@@ -120,7 +121,7 @@ More [detailed installation instructions](https://docling-project.github.io/docl
 
 ## Getting started
 
-To convert individual documents, use `convert()`, for example:
+To convert individual documents with python, use `convert()`, for example:
 
 ```python
 from docling.document_converter import DocumentConverter
@@ -134,6 +135,22 @@ print(result.document.export_to_markdown())  # output: "## Docling Technical Rep
 More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
 the docs.
 
+## CLI
+
+Docling has a built-in CLI to run conversions.
+
+```bash
+docling https://arxiv.org/pdf/2206.01062
+```
+
+You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
+```bash
+docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
+```
+This will use MLX acceleration on supported Apple Silicon hardware.
+
+Read more [here](https://docling-project.github.io/docling/usage/)
+
 ## Documentation
 
 Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
@@ -150,32 +167,6 @@ To further accelerate your AI application development, check out Docling's nativ
 [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
 and tools.
 
-## Apify Actor
-
-<a href="https://apify.com/vancura/docling?fpr=docling"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Docling Actor on Apify" width="176" height="39" /></a>
-
-You can run Docling in the cloud without installation using the [Docling Actor](https://apify.com/vancura/docling?fpr=docling) on Apify platform. Simply provide a document URL and get the processed result:
-
-```bash
-apify call vancura/docling -i '{
-    "options": {
-        "to_formats": ["md", "json", "html", "text", "doctags"]
-    },
-    "http_sources": [
-        {"url": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf"},
-        {"url": "https://arxiv.org/pdf/2408.09869"}
-    ]
-}'
-```
-
-The Actor stores results in:
-
-* Processed document in key-value store (`OUTPUT_RESULT`)
-* Processing logs (`DOCLING_LOG`)
-* Dataset record with result URL and status
-
-Read more about the [Docling Actor](.actor/README.md), including how to use it via the Apify API and CLI.
-
 ## Get help and support
 
 Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
@@ -210,9 +201,13 @@ If you use Docling in your projects, please consider citing the following:
 The Docling codebase is under MIT license.
 For individual model usage, please refer to the model licenses found in the original packages.
 
-## IBM ❤️ Open Source AI
+## LF AI & Data
+
+Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
+
+### IBM ❤️ Open Source AI
 
-
+The project was started by the AI for knowledge team at IBM Research Zurich.
 
 [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
 [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/