docling 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,276 @@
1
+ import functools
2
+ import logging
3
+ import tempfile
4
+ import time
5
+ import traceback
6
+ from pathlib import Path
7
+ from typing import Iterable, Optional, Type, Union
8
+
9
+ import requests
10
+ from docling_core.types import Document
11
+ from PIL import ImageDraw
12
+ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
13
+
14
+ from docling.backend.abstract_backend import PdfDocumentBackend
15
+ from docling.datamodel.base_models import (
16
+ AssembledUnit,
17
+ AssembleOptions,
18
+ ConversionStatus,
19
+ Page,
20
+ PipelineOptions,
21
+ )
22
+ from docling.datamodel.document import (
23
+ ConvertedDocument,
24
+ DocumentConversionInput,
25
+ InputDocument,
26
+ )
27
+ from docling.datamodel.settings import settings
28
+ from docling.models.ds_glm_model import GlmModel
29
+ from docling.models.page_assemble_model import PageAssembleModel
30
+ from docling.pipeline.base_model_pipeline import BaseModelPipeline
31
+ from docling.pipeline.standard_model_pipeline import StandardModelPipeline
32
+ from docling.utils.utils import chunkify, create_hash
33
+
34
+ _log = logging.getLogger(__name__)
35
+
36
+
37
class DocumentConverter:
    """Convert PDF documents into docling documents.

    Every document is processed in batches of pages: each page is loaded from
    the PDF backend, rendered to an image, parsed for text cells, sent through
    the model pipeline and assembled. The assembled document is then handed to
    the GLM model, whose result becomes the output of the converted document.
    """

    _default_download_filename = "file.pdf"

    def __init__(
        self,
        artifacts_path: Optional[Union[Path, str]] = None,
        pipeline_options: Optional[PipelineOptions] = None,
        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
        assemble_options: Optional[AssembleOptions] = None,
    ):
        """Set up the model pipeline and the helper models.

        Args:
            artifacts_path: Directory holding the model artifacts. When it is
                empty or None, the artifacts are fetched with
                ``download_models_hf()``.
            pipeline_options: Options forwarded to ``pipeline_cls``. A fresh
                ``PipelineOptions()`` is created when None.
            pdf_backend: Backend class used to open the input PDFs.
            pipeline_cls: Pipeline class instantiated with the artifacts path
                and the pipeline options.
            assemble_options: Options consulted while pages are rendered and
                released. A fresh ``AssembleOptions()`` is created when None.
        """
        # Create the option objects per instance: a default instance in the
        # signature would be built once and shared by every converter.
        if pipeline_options is None:
            pipeline_options = PipelineOptions()
        if assemble_options is None:
            assemble_options = AssembleOptions()

        if not artifacts_path:
            artifacts_path = self.download_models_hf()

        artifacts_path = Path(artifacts_path)

        self.model_pipeline = pipeline_cls(
            artifacts_path=artifacts_path, pipeline_options=pipeline_options
        )

        self.page_assemble_model = PageAssembleModel(config={})
        self.glm_model = GlmModel(config={})
        self.pdf_backend = pdf_backend
        self.assemble_options = assemble_options

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        """Download a snapshot of the ``ds4sd/docling-models`` repository.

        Args:
            local_dir: Passed to ``snapshot_download`` as ``local_dir``.
            force: Passed to ``snapshot_download`` as ``force_download``.

        Returns:
            Path: The path reported by ``snapshot_download``.
        """
        # Imported lazily so the dependency is only needed when downloading.
        from huggingface_hub import snapshot_download

        download_path = snapshot_download(
            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
        )

        return Path(download_path)

    def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
        """Lazily convert all documents of ``input``.

        Args:
            input: The conversion input whose documents are opened with the
                configured PDF backend.

        Yields:
            ConvertedDocument: One result per input document, in input order.
        """
        for input_batch in chunkify(
            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
        ):
            _log.info("Going to convert document batch...")
            # Note: Pdfium backend is not thread-safe, so the documents of a
            # batch are processed sequentially instead of in a thread pool.
            yield from map(self.process_document, input_batch)

    def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
        """Convert a single document.

        Args:
            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.

        Raises:
            ValueError: If source is of unexpected type.
            RuntimeError: If conversion fails or yields no document.

        Returns:
            Document: The converted document object.
        """
        # The temporary directory must stay alive until the conversion is
        # done, because a downloaded file lives inside it.
        with tempfile.TemporaryDirectory() as temp_dir:
            try:
                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
            except ValidationError:
                # Not a URL: interpret the source as a local path instead.
                try:
                    local_path = TypeAdapter(Path).validate_python(source)
                except ValidationError as exc:
                    raise ValueError(
                        f"Unexpected file path type encountered: {type(source)}"
                    ) from exc
            else:
                local_path = self._download_to_dir(http_url, Path(temp_dir))

            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
            # A default avoids leaking a bare StopIteration to the caller
            # when the input yields no document at all.
            converted_doc = next(iter(self.convert(conv_inp)), None)
            if converted_doc is None:
                raise RuntimeError("Conversion failed: no document was produced")
            if converted_doc.status not in {
                ConversionStatus.SUCCESS,
                ConversionStatus.SUCCESS_WITH_ERRORS,
            }:
                raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
            return converted_doc.to_ds_document()

    def _download_to_dir(self, http_url: AnyHttpUrl, target_dir: Path) -> Path:
        """Download ``http_url`` into ``target_dir`` and return the file path."""
        # NOTE(review): no timeout is passed, so a stalled server blocks
        # indefinitely; consider adding one if callers can tolerate it.
        # The context manager closes the streamed response on every path.
        with requests.get(http_url, stream=True) as res:
            res.raise_for_status()
            fname = None
            # try to get filename from response header
            if cont_disp := res.headers.get("Content-Disposition"):
                for par in cont_disp.strip().split(";"):
                    # currently only handling directive "filename" (not "*filename")
                    if (split := par.split("=")) and split[0].strip() == "filename":
                        fname = "=".join(split[1:]).strip().strip("'\"") or None
                        break
            # otherwise, use name from URL:
            if fname is None:
                fname = Path(http_url.path or "").name
            local_path = target_dir / self._safe_download_filename(fname)
            with open(local_path, "wb") as f:
                for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
                    f.write(chunk)
        return local_path

    @classmethod
    def _safe_download_filename(cls, fname: str) -> str:
        """Reduce a server-supplied name to a bare file name."""
        # The header value is untrusted: an absolute path or "../" segments
        # would otherwise make the download land outside the target directory.
        candidate = fname.replace("\\", "/").rsplit("/", 1)[-1]
        if candidate in {"", ".", ".."}:
            return cls._default_download_filename
        return candidate

    def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
        """Run the full page pipeline on one document.

        Errors raised during processing are caught and logged, and reported
        through ``ConversionStatus.FAILURE`` on the returned object.

        Args:
            in_doc: The input document to convert.

        Returns:
            ConvertedDocument: The result carrying the status, the pages and,
            on success, the assembled output.
        """
        start_doc_time = time.time()
        converted_doc = ConvertedDocument(input=in_doc)

        if not in_doc.valid:
            converted_doc.status = ConversionStatus.FAILURE
            return converted_doc

        for i in range(0, in_doc.page_count):
            converted_doc.pages.append(Page(page_no=i))

        all_assembled_pages = []
        # Tracks whether the regular unload was reached, so the failure path
        # releases the backend exactly when the success path did not.
        unload_attempted = False

        try:
            # Iterate batches of pages (page_batch_size) in the doc
            for page_batch in chunkify(
                converted_doc.pages, settings.perf.page_batch_size
            ):
                start_pb_time = time.time()
                # Pipeline: the stages below are lazy and only run while
                # assembled_pages is being consumed.

                # 1. Initialise the page resources
                init_pages = map(
                    functools.partial(self.initialize_page, in_doc), page_batch
                )

                # 2. Populate page image
                pages_with_images = map(
                    functools.partial(self.populate_page_images, in_doc), init_pages
                )

                # 3. Populate programmatic page cells
                pages_with_cells = map(
                    functools.partial(self.parse_page_cells, in_doc),
                    pages_with_images,
                )

                # 4. Run pipeline stages
                pipeline_pages = self.model_pipeline.apply(pages_with_cells)

                # 5. Assemble page elements (per page)
                assembled_pages = self.page_assemble_model(pipeline_pages)

                # exhaust assembled_pages
                for assembled_page in assembled_pages:
                    # Free up mem resources before moving on with next batch

                    # Remove page images (can be disabled)
                    if self.assemble_options.images_scale is None:
                        assembled_page._image_cache = {}

                    # Unload backend
                    assembled_page._backend.unload()

                    all_assembled_pages.append(assembled_page)

                end_pb_time = time.time() - start_pb_time
                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")

            # Free up mem resources of PDF backend
            unload_attempted = True
            in_doc._backend.unload()

            converted_doc.pages = all_assembled_pages
            self.assemble_doc(converted_doc)

            converted_doc.status = ConversionStatus.SUCCESS

        except Exception as e:
            converted_doc.status = ConversionStatus.FAILURE
            # format_exception lines already end with a newline.
            trace = "".join(traceback.format_exception(e))
            # Logged as an error so a failed conversion is visible with the
            # default logging configuration.
            _log.error(f"Encountered an error during conversion: {trace}")
        finally:
            # Release the PDF backend also when the conversion failed midway.
            if not unload_attempted:
                try:
                    in_doc._backend.unload()
                except Exception:
                    _log.warning(
                        "Could not unload the PDF backend after a failed conversion",
                        exc_info=True,
                    )

        end_doc_time = time.time() - start_doc_time
        _log.info(
            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
        )

        return converted_doc

    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        """Attach the page backend, the page size and the page hash to ``page``."""
        page._backend = doc._backend.load_page(page.page_no)
        page.size = page._backend.get_size()
        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))

        return page

    # Generate the page image and store it in the page object
    def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
        """Request the page image at scale 1.0 and at the user-requested scale."""
        # default scale
        page.get_image(scale=1.0)

        # user requested scales
        if self.assemble_options.images_scale is not None:
            page._default_image_scale = self.assemble_options.images_scale
            page.get_image(
                scale=self.assemble_options.images_scale
            )  # this will trigger storing the image in the internal cache

        return page

    # Extract and populate the page cells and store it in the page object
    def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
        """Store the text cells reported by the page backend on ``page``."""
        page.cells = page._backend.get_text_cells()

        return page

    def assemble_doc(self, converted_doc: ConvertedDocument):
        """Merge the per-page assembled units and compute the document output.

        The body, header and element lists of all pages are concatenated in
        page order into one ``AssembledUnit``; the GLM model is then applied
        and its result stored in ``converted_doc.output``.
        """
        all_elements = []
        all_headers = []
        all_body = []

        for p in converted_doc.pages:
            all_body.extend(p.assembled.body)
            all_headers.extend(p.assembled.headers)
            all_elements.extend(p.assembled.elements)

        converted_doc.assembled = AssembledUnit(
            elements=all_elements, headers=all_headers, body=all_body
        )

        converted_doc.output = self.glm_model(converted_doc)
File without changes
@@ -0,0 +1,124 @@
1
+ import copy
2
+ import logging
3
+ from abc import abstractmethod
4
+ from typing import Iterable, List, Tuple
5
+
6
+ import numpy
7
+ import numpy as np
8
+ from PIL import Image, ImageDraw
9
+ from rtree import index
10
+ from scipy.ndimage import find_objects, label
11
+
12
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
13
+
14
+ _log = logging.getLogger(__name__)
15
+
16
+
17
class BaseOcrModel:
    """Common helpers for OCR models.

    Subclasses are expected to implement ``__call__``. Note that the class
    does not derive from ``abc.ABC``, so the ``@abstractmethod`` marker on
    ``__call__`` is not enforced at instantiation time.
    """

    def __init__(self, config):
        """Store ``config`` and read its mandatory ``"enabled"`` entry."""
        self.config = config
        self.enabled = config["enabled"]

    # Computes the optimum amount and coordinates of rectangles to OCR on a given page
    def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
        """Return the rectangles of ``page`` that should be sent to OCR.

        The bitmap rectangles reported by the page backend are rasterised into
        a binary mask. When the mask covers more than 75% of the page, a single
        full-page rectangle is returned; otherwise one bounding box per
        connected bitmap region is returned.

        Args:
            page: The page to inspect; its backend and size are used.

        Returns:
            List[BoundingBox]: Rectangles with top-left coordinate origin.
        """
        BITMAP_COVERAGE_THRESHOLD = 0.75

        def find_ocr_rects(size, bitmap_rects):
            image = Image.new(
                "1", (round(size.width), round(size.height))
            )  # '1' mode is binary

            # Draw all bitmap rects into a binary image
            draw = ImageDraw.Draw(image)
            for rect in bitmap_rects:
                x0, y0, x1, y1 = rect.as_tuple()
                x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
                draw.rectangle([(x0, y0), (x1, y1)], fill=1)

            np_image = np.array(image)

            # Find the connected components of the drawn (non-zero) pixels
            labeled_image, num_features = label(np_image > 0)

            # Find enclosing bounding boxes for each connected component.
            # Slices are half-open, hence the "- 1" to get the last pixel.
            slices = find_objects(labeled_image)
            bounding_boxes = [
                BoundingBox(
                    l=slc[1].start,
                    t=slc[0].start,
                    r=slc[1].stop - 1,
                    b=slc[0].stop - 1,
                    coord_origin=CoordOrigin.TOPLEFT,
                )
                for slc in slices
            ]

            # Compute area fraction on page covered by bitmaps
            area_frac = np.sum(np_image > 0) / (size.width * size.height)

            return (area_frac, bounding_boxes)  # fraction covered  # boxes

        bitmap_rects = page._backend.get_bitmap_rects()
        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

        # return full-page rectangle if sufficiently covered with bitmaps
        if coverage > BITMAP_COVERAGE_THRESHOLD:
            return [
                BoundingBox(
                    l=0,
                    t=0,
                    r=page.size.width,
                    b=page.size.height,
                    coord_origin=CoordOrigin.TOPLEFT,
                )
            ]

        # Otherwise return the individual rectangles. This is the fallback for
        # every remaining case, so a coverage exactly at the threshold (or a
        # NaN coverage) no longer makes the method return None.
        return ocr_rects

    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
    def filter_ocr_cells(self, ocr_cells, programmatic_cells):
        """Return the OCR cells that do not intersect any programmatic cell.

        Args:
            ocr_cells: Cells produced by OCR.
            programmatic_cells: Cells already present on the page.

        Returns:
            list: The OCR cells whose bounding box hits no programmatic cell.
        """
        # Create R-tree index for programmatic cells
        p = index.Property()
        p.dimension = 2
        idx = index.Index(properties=p)
        for i, cell in enumerate(programmatic_cells):
            idx.insert(i, cell.bbox.as_tuple())

        def is_overlapping_with_existing_cells(ocr_cell):
            # Query the R-tree to get overlapping rectangles
            possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))

            return (
                len(possible_matches_index) > 0
            )  # this is a weak criterion but it works.

        filtered_ocr_cells = [
            rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
        ]
        return filtered_ocr_cells

    def draw_ocr_rects_and_cells(self, page, ocr_rects):
        """Debug helper: show the page image with OCR rects and cells drawn on it."""
        # Work on a copy so the page image itself stays untouched.
        image = copy.deepcopy(page.image)
        draw = ImageDraw.Draw(image, "RGBA")

        # Draw OCR rectangles as yellow filled rect
        for rect in ocr_rects:
            x0, y0, x1, y1 = rect.as_tuple()
            shade_color = (255, 255, 0, 40)  # transparent yellow
            draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)

        # Draw OCR and programmatic cells
        for tc in page.cells:
            x0, y0, x1, y1 = tc.bbox.as_tuple()
            color = "red"
            if isinstance(tc, OcrCell):
                color = "magenta"
            draw.rectangle([(x0, y0), (x1, y1)], outline=color)
        image.show()

    @abstractmethod
    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Process a batch of pages; to be implemented by subclasses."""
        pass
@@ -0,0 +1,82 @@
1
+ import copy
2
+ import random
3
+
4
+ from deepsearch_glm.nlp_utils import init_nlp_model
5
+ from deepsearch_glm.utils.ds_utils import to_legacy_document_format
6
+ from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
7
+ from docling_core.types import BaseText
8
+ from docling_core.types import Document as DsDocument
9
+ from docling_core.types import Ref
10
+ from PIL import ImageDraw
11
+
12
+ from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
13
+ from docling.datamodel.document import ConvertedDocument
14
+
15
+
16
class GlmModel:
    """Apply the deepsearch-glm NLP models to a converted document."""

    def __init__(self, config):
        """Store ``config``, load the pretrained NLP models and initialise the model."""
        self.config = config
        load_pretrained_nlp_models()
        model = init_nlp_model(model_names="language;term;reference")
        self.model = model

    def __call__(self, document: ConvertedDocument) -> DsDocument:
        """Run the GLM model over ``document`` and return the resulting document.

        Args:
            document: The converted document to process.

        Returns:
            DsDocument: The document validated from the legacy-format
            dictionary produced out of the GLM result.
        """
        ds_doc = document.to_ds_document()
        ds_doc_dict = ds_doc.model_dump(by_alias=True)

        glm_doc = self.model.apply_on_doc(ds_doc_dict)
        ds_doc_dict = to_legacy_document_format(
            glm_doc, ds_doc_dict, update_name_label=True
        )

        exported_doc = DsDocument.model_validate(ds_doc_dict)

        # DEBUG: to inspect a page visually, call
        # self._draw_clusters_and_cells(document, ds_doc, 0) or
        # self._draw_clusters_and_cells(document, exported_doc, 0) here.

        return exported_doc

    def _draw_clusters_and_cells(self, document, ds_document, page_no):
        """Debug helper: show page ``page_no`` with the main-text elements drawn.

        Args:
            document: The converted document providing the page image and size.
            ds_document: The document whose ``main_text`` elements are drawn.
            page_no: Index into ``document.pages``; also compared with the
                page recorded in each element's provenance.
        """
        clusters_to_draw = []
        # Draw on a copy so the stored page image is not modified.
        image = copy.deepcopy(document.pages[page_no].image)
        for ix, elem in enumerate(ds_document.main_text):
            # Reset per element so an unhandled element type neither raises
            # nor reuses the provenance of the previous element.
            prov = None
            if isinstance(elem, BaseText):
                prov = elem.prov[0]
            elif isinstance(elem, Ref):
                _, arr, index = elem.ref.split("/")
                index = int(index)
                if arr == "tables":
                    prov = ds_document.tables[index].prov[0]
                elif arr == "figures":
                    prov = ds_document.figures[index].prov[0]

            if prov and prov.page == page_no:
                clusters_to_draw.append(
                    Cluster(
                        id=ix,
                        label=elem.name,
                        bbox=BoundingBox.from_tuple(
                            coord=prov.bbox,
                            origin=CoordOrigin.BOTTOMLEFT,
                        ).to_top_left_origin(document.pages[page_no].size.height),
                    )
                )

        draw = ImageDraw.Draw(image)
        for c in clusters_to_draw:
            x0, y0, x1, y1 = c.bbox.as_tuple()
            draw.rectangle([(x0, y0), (x1, y1)], outline="red")
            draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))

            # One random colour per cluster for its cells.
            cell_color = (
                random.randint(30, 140),
                random.randint(30, 140),
                random.randint(30, 140),
            )
            for tc in c.cells:
                x0, y0, x1, y1 = tc.bbox.as_tuple()
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
        image.show()
@@ -0,0 +1,70 @@
1
+ import logging
2
+ from typing import Iterable
3
+
4
+ import numpy
5
+
6
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
+ from docling.models.base_ocr_model import BaseOcrModel
8
+
9
+ _log = logging.getLogger(__name__)
10
+
11
+
12
class EasyOcrModel(BaseOcrModel):
    """OCR model that reads text from page regions with EasyOCR."""

    def __init__(self, config):
        """Initialise the base model and, when enabled, the EasyOCR reader.

        ``config["lang"]` is only read (and ``easyocr`` only imported) when
        the model is enabled.
        """
        super().__init__(config)

        # Render factor for OCR crops: 3 x 72 dpi == 216 dpi.
        self.scale = 3

        if self.enabled:
            import easyocr

            self.reader = easyocr.Reader(config["lang"])

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Add OCR cells to every page of ``page_batch``.

        When the model is disabled the pages are passed through untouched.
        Otherwise each OCR rectangle of a page is rendered, read with EasyOCR
        and converted into ``OcrCell`` objects in page coordinates; cells that
        intersect existing page cells are dropped before the rest is appended
        to ``page.cells``.

        Yields:
            Page: The processed pages, in input order.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            rects = self.get_ocr_rects(page)
            collected = []

            for rect in rects:
                scale = self.scale
                crop = page._backend.get_page_image(scale=scale, cropbox=rect)
                detections = self.reader.readtext(numpy.array(crop))

                # Drop the rendered crop as soon as it has been read.
                del crop

                for ix, det in enumerate(detections):
                    # det[0] holds the corner points; corners 0 and 2 are used
                    # as the two opposite corners of the box.
                    first_corner, opposite_corner = det[0][0], det[0][2]
                    # Scale back to page units and shift by the crop offset.
                    box = BoundingBox.from_tuple(
                        coord=(
                            (first_corner[0] / scale) + rect.l,
                            (first_corner[1] / scale) + rect.t,
                            (opposite_corner[0] / scale) + rect.l,
                            (opposite_corner[1] / scale) + rect.t,
                        ),
                        origin=CoordOrigin.TOPLEFT,
                    )
                    collected.append(
                        OcrCell(id=ix, text=det[1], confidence=det[2], bbox=box)
                    )

            # Keep only the OCR cells that do not overlap programmatic cells.
            page.cells.extend(self.filter_ocr_cells(collected, page.cells))

            # DEBUG: self.draw_ocr_rects_and_cells(page, rects) visualises the result.

            yield page