docling 1.6.2__py3-none-any.whl → 1.19.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,14 +4,16 @@ from pathlib import Path, PurePath
  from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union

  from docling_core.types import BaseCell, BaseText
- from docling_core.types import BoundingBox as DsBoundingBox
  from docling_core.types import Document as DsDocument
  from docling_core.types import DocumentDescription as DsDocumentDescription
  from docling_core.types import FileInfoObject as DsFileInfoObject
  from docling_core.types import PageDimensions, PageReference, Prov, Ref
  from docling_core.types import Table as DsSchemaTable
  from docling_core.types import TableCell
+ from docling_core.types.doc.base import BoundingBox as DsBoundingBox
+ from docling_core.types.doc.base import Figure
  from pydantic import BaseModel
+ from typing_extensions import deprecated

  from docling.backend.abstract_backend import PdfDocumentBackend
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@@ -19,6 +21,7 @@ from docling.datamodel.base_models import (
  AssembledUnit,
  ConversionStatus,
  DocumentStream,
+ ErrorItem,
  FigureElement,
  Page,
  PageElement,
@@ -48,6 +51,15 @@ layout_label_to_ds_type = {
  "Text": "paragraph",
  }

+ _EMPTY_DOC = DsDocument(
+ _name="",
+ description=DsDocumentDescription(logs=[]),
+ file_info=DsFileInfoObject(
+ filename="",
+ document_hash="",
+ ),
+ )
+

  class InputDocument(BaseModel):
  file: PurePath = None
@@ -79,7 +91,9 @@ class InputDocument(BaseModel):
  self.valid = False
  else:
  self.document_hash = create_file_hash(path_or_stream)
- self._backend = pdf_backend(path_or_stream=path_or_stream)
+ self._backend = pdf_backend(
+ path_or_stream=path_or_stream, document_hash=self.document_hash
+ )

  elif isinstance(path_or_stream, BytesIO):
  self.file = PurePath(filename)
@@ -89,7 +103,9 @@ class InputDocument(BaseModel):
  self.valid = False
  else:
  self.document_hash = create_file_hash(path_or_stream)
- self._backend = pdf_backend(path_or_stream=path_or_stream)
+ self._backend = pdf_backend(
+ path_or_stream=path_or_stream, document_hash=self.document_hash
+ )

  if self.document_hash and self._backend.page_count() > 0:
  self.page_count = self._backend.page_count()
@@ -110,18 +126,19 @@ class InputDocument(BaseModel):
  # raise


+ @deprecated("Use `ConversionResult` instead.")
  class ConvertedDocument(BaseModel):
  input: InputDocument

  status: ConversionStatus = ConversionStatus.PENDING # failure, success
- errors: List[Dict] = [] # structure to keep errors
+ errors: List[ErrorItem] = [] # structure to keep errors

  pages: List[Page] = []
- assembled: Optional[AssembledUnit] = None
+ assembled: AssembledUnit = AssembledUnit()

- output: Optional[DsDocument] = None
+ output: DsDocument = _EMPTY_DOC

- def to_ds_document(self) -> DsDocument:
+ def _to_ds_document(self) -> DsDocument:
  title = ""
  desc = DsDocumentDescription(logs=[])

@@ -206,6 +223,8 @@ class ConvertedDocument(BaseModel):
  celltype = "col_header"
  elif cell.row_header:
  celltype = "row_header"
+ elif cell.row_section:
+ celltype = "row_section"

  def make_spans(cell):
  for rspan in range(
@@ -261,7 +280,7 @@ class ConvertedDocument(BaseModel):
  ),
  )
  figures.append(
- BaseCell(
+ Figure(
  prov=[
  Prov(
  bbox=target_bbox,
@@ -292,16 +311,91 @@ class ConvertedDocument(BaseModel):
  return ds_doc

  def render_as_dict(self):
- if self.output:
- return self.output.model_dump(by_alias=True, exclude_none=True)
- else:
- return {}
+ return self.output.model_dump(by_alias=True, exclude_none=True)
+
+ def render_as_markdown(
+ self,
+ delim: str = "\n\n",
+ main_text_start: int = 0,
+ main_text_stop: Optional[int] = None,
+ main_text_labels: list[str] = [
+ "title",
+ "subtitle-level-1",
+ "paragraph",
+ "caption",
+ "table",
+ "figure",
+ ],
+ strict_text: bool = False,
+ image_placeholder: str = "<!-- image -->",
+ ):
+ return self.output.export_to_markdown(
+ delim=delim,
+ main_text_start=main_text_start,
+ main_text_stop=main_text_stop,
+ main_text_labels=main_text_labels,
+ strict_text=strict_text,
+ image_placeholder=image_placeholder,
+ )

- def render_as_markdown(self):
- if self.output:
- return self.output.export_to_markdown()
- else:
- return ""
+ def render_as_text(
+ self,
+ delim: str = "\n\n",
+ main_text_start: int = 0,
+ main_text_stop: Optional[int] = None,
+ main_text_labels: list[str] = [
+ "title",
+ "subtitle-level-1",
+ "paragraph",
+ "caption",
+ ],
+ ):
+ return self.output.export_to_markdown(
+ delim=delim,
+ main_text_start=main_text_start,
+ main_text_stop=main_text_stop,
+ main_text_labels=main_text_labels,
+ strict_text=True,
+ )
+
+ def render_as_doctags(
+ self,
+ delim: str = "\n\n",
+ main_text_start: int = 0,
+ main_text_stop: Optional[int] = None,
+ main_text_labels: list[str] = [
+ "title",
+ "subtitle-level-1",
+ "paragraph",
+ "caption",
+ "table",
+ "figure",
+ ],
+ xsize: int = 100,
+ ysize: int = 100,
+ add_location: bool = True,
+ add_content: bool = True,
+ add_page_index: bool = True,
+ # table specific flags
+ add_table_cell_location: bool = False,
+ add_table_cell_label: bool = True,
+ add_table_cell_text: bool = True,
+ ) -> str:
+ return self.output.export_to_document_tokens(
+ delim=delim,
+ main_text_start=main_text_start,
+ main_text_stop=main_text_stop,
+ main_text_labels=main_text_labels,
+ xsize=xsize,
+ ysize=ysize,
+ add_location=add_location,
+ add_content=add_content,
+ add_page_index=add_page_index,
+ # table specific flags
+ add_table_cell_location=add_table_cell_location,
+ add_table_cell_label=add_table_cell_label,
+ add_table_cell_text=add_table_cell_text,
+ )

  def render_element_images(
  self, element_types: Tuple[PageElement] = (FigureElement,)
@@ -318,6 +412,10 @@ class ConvertedDocument(BaseModel):
  yield element, cropped_im


+ class ConversionResult(ConvertedDocument):
+ pass
+
+
  class DocumentConversionInput(BaseModel):

  _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
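
The render methods above now forward their keyword arguments straight to the docling-core exporters. A minimal usage sketch of the new signatures, assuming `conv_res` is a ConversionResult obtained from DocumentConverter; the trimmed label list is illustrative only:

    # Markdown export with the new keyword arguments (defaults as in the diff above)
    md = conv_res.render_as_markdown(
        main_text_labels=["title", "paragraph", "table"],  # illustrative subset
        image_placeholder="<!-- image -->",
    )
    # Plain-text export: the same markdown exporter, forced to strict_text=True
    txt = conv_res.render_as_text()
    # DocTags export added in this release
    tags = conv_res.render_as_doctags(add_page_index=True)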
@@ -0,0 +1,67 @@
+ from enum import Enum, auto
+ from typing import List, Literal, Optional, Union
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+
+ class TableFormerMode(str, Enum):
+ FAST = auto()
+ ACCURATE = auto()
+
+
+ class TableStructureOptions(BaseModel):
+ do_cell_matching: bool = (
+ True
+ # True: Matches predictions back to PDF cells. Can break table output if PDF cells
+ # are merged across table columns.
+ # False: Let table structure model define the text cells, ignore PDF cells.
+ )
+ mode: TableFormerMode = TableFormerMode.FAST
+
+
+ class OcrOptions(BaseModel):
+ kind: str
+
+
+ class EasyOcrOptions(OcrOptions):
+ kind: Literal["easyocr"] = "easyocr"
+ lang: List[str] = ["fr", "de", "es", "en"]
+ use_gpu: bool = True # same default as easyocr.Reader
+ model_storage_directory: Optional[str] = None
+ download_enabled: bool = True # same default as easyocr.Reader
+
+ model_config = ConfigDict(
+ extra="forbid",
+ protected_namespaces=(),
+ )
+
+
+ class TesseractCliOcrOptions(OcrOptions):
+ kind: Literal["tesseract"] = "tesseract"
+ lang: List[str] = ["fra", "deu", "spa", "eng"]
+ tesseract_cmd: str = "tesseract"
+ path: Optional[str] = None
+
+ model_config = ConfigDict(
+ extra="forbid",
+ )
+
+
+ class TesseractOcrOptions(OcrOptions):
+ kind: Literal["tesserocr"] = "tesserocr"
+ lang: List[str] = ["fra", "deu", "spa", "eng"]
+ path: Optional[str] = None
+
+ model_config = ConfigDict(
+ extra="forbid",
+ )
+
+
+ class PipelineOptions(BaseModel):
+ do_table_structure: bool = True # True: perform table structure extraction
+ do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
+
+ table_structure_options: TableStructureOptions = TableStructureOptions()
+ ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
+ Field(EasyOcrOptions(), discriminator="kind")
+ )
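
This new file is the docling.datamodel.pipeline_options module (the import path appears in the converter diff below). A minimal sketch of building a pipeline configuration from these classes; how the object is handed to DocumentConverter is not shown in this hunk:

    from docling.datamodel.pipeline_options import (
        PipelineOptions,
        TableFormerMode,
        TesseractCliOcrOptions,
    )

    pipeline_options = PipelineOptions(do_table_structure=True, do_ocr=True)
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
    # swap the default EasyOCR engine for the tesseract CLI engine
    pipeline_options.ocr_options = TesseractCliOcrOptions(lang=["eng", "deu"])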
@@ -7,7 +7,6 @@ from pathlib import Path
  from typing import Iterable, Optional, Type, Union

  import requests
- from docling_core.types import Document
  from PIL import ImageDraw
  from pydantic import AnyHttpUrl, TypeAdapter, ValidationError

@@ -16,14 +15,16 @@ from docling.datamodel.base_models import (
  AssembledUnit,
  AssembleOptions,
  ConversionStatus,
+ DoclingComponentType,
+ ErrorItem,
  Page,
- PipelineOptions,
  )
  from docling.datamodel.document import (
- ConvertedDocument,
+ ConversionResult,
  DocumentConversionInput,
  InputDocument,
  )
+ from docling.datamodel.pipeline_options import PipelineOptions
  from docling.datamodel.settings import settings
  from docling.models.ds_glm_model import GlmModel
  from docling.models.page_assemble_model import PageAssembleModel
@@ -66,12 +67,15 @@ class DocumentConverter:
  from huggingface_hub import snapshot_download

  download_path = snapshot_download(
- repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
+ repo_id="ds4sd/docling-models",
+ force_download=force,
+ local_dir=local_dir,
+ revision="v2.0.0",
  )

  return Path(download_path)

- def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
+ def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:

  for input_batch in chunkify(
  input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
@@ -84,9 +88,9 @@ class DocumentConverter:
  # yield from pool.map(self.process_document, input_batch)

  # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
- yield from map(self.process_document, input_batch)
+ yield from map(self._process_document, input_batch)

- def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
+ def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
  """Convert a single document.

  Args:
@@ -97,7 +101,7 @@ class DocumentConverter:
  RuntimeError: If conversion fails.

  Returns:
- Document: The converted document object.
+ ConversionResult: The conversion result object.
  """
  with tempfile.TemporaryDirectory() as temp_dir:
  try:
@@ -127,51 +131,49 @@ class DocumentConverter:
  f"Unexpected file path type encountered: {type(source)}"
  )
  conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
- converted_docs_iter = self.convert(conv_inp)
- converted_doc: ConvertedDocument = next(converted_docs_iter)
- if converted_doc.status not in {
+ conv_res_iter = self.convert(conv_inp)
+ conv_res: ConversionResult = next(conv_res_iter)
+ if conv_res.status not in {
  ConversionStatus.SUCCESS,
- ConversionStatus.SUCCESS_WITH_ERRORS,
+ ConversionStatus.PARTIAL_SUCCESS,
  }:
- raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
- doc = converted_doc.to_ds_document()
- return doc
+ raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
+ return conv_res

- def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
+ def _process_document(self, in_doc: InputDocument) -> ConversionResult:
  start_doc_time = time.time()
- converted_doc = ConvertedDocument(input=in_doc)
+ conv_res = ConversionResult(input=in_doc)
+
+ _log.info(f"Processing document {in_doc.file.name}")

  if not in_doc.valid:
- converted_doc.status = ConversionStatus.FAILURE
- return converted_doc
+ conv_res.status = ConversionStatus.FAILURE
+ return conv_res

  for i in range(0, in_doc.page_count):
- converted_doc.pages.append(Page(page_no=i))
+ conv_res.pages.append(Page(page_no=i))

  all_assembled_pages = []

  try:
  # Iterate batches of pages (page_batch_size) in the doc
- for page_batch in chunkify(
- converted_doc.pages, settings.perf.page_batch_size
- ):
-
+ for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
  start_pb_time = time.time()
  # Pipeline

  # 1. Initialise the page resources
  init_pages = map(
- functools.partial(self.initialize_page, in_doc), page_batch
+ functools.partial(self._initialize_page, in_doc), page_batch
  )

  # 2. Populate page image
  pages_with_images = map(
- functools.partial(self.populate_page_images, in_doc), init_pages
+ functools.partial(self._populate_page_images, in_doc), init_pages
  )

  # 3. Populate programmatic page cells
  pages_with_cells = map(
- functools.partial(self.parse_page_cells, in_doc),
+ functools.partial(self._parse_page_cells, in_doc),
  pages_with_images,
  )

@@ -197,28 +199,45 @@ class DocumentConverter:
  end_pb_time = time.time() - start_pb_time
  _log.info(f"Finished converting page batch time={end_pb_time:.3f}")

- # Free up mem resources of PDF backend
- in_doc._backend.unload()
-
- converted_doc.pages = all_assembled_pages
- self.assemble_doc(converted_doc)
+ conv_res.pages = all_assembled_pages
+ self._assemble_doc(conv_res)
+
+ status = ConversionStatus.SUCCESS
+ for page in conv_res.pages:
+ if not page._backend.is_valid():
+ conv_res.errors.append(
+ ErrorItem(
+ component_type=DoclingComponentType.PDF_BACKEND,
+ module_name=type(page._backend).__name__,
+ error_message=f"Page {page.page_no} failed to parse.",
+ )
+ )
+ status = ConversionStatus.PARTIAL_SUCCESS

- converted_doc.status = ConversionStatus.SUCCESS
+ conv_res.status = status

  except Exception as e:
- converted_doc.status = ConversionStatus.FAILURE
+ conv_res.status = ConversionStatus.FAILURE
  trace = "\n".join(traceback.format_exception(e))
- _log.info(f"Encountered an error during conversion: {trace}")
+ _log.info(
+ f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
+ f"{trace}"
+ )
+
+ finally:
+ # Always unload the PDF backend, even in case of failure
+ if in_doc._backend:
+ in_doc._backend.unload()

  end_doc_time = time.time() - start_doc_time
  _log.info(
  f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
  )

- return converted_doc
+ return conv_res

  # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
- def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+ def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
  page._backend = doc._backend.load_page(page.page_no)
  page.size = page._backend.get_size()
  page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
@@ -226,9 +245,11 @@ class DocumentConverter:
  return page

  # Generate the page image and store it in the page object
- def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
+ def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
  # default scale
- page.get_image(scale=1.0)
+ page.get_image(
+ scale=1.0
+ ) # puts the page image on the image cache at default scale

  # user requested scales
  if self.assemble_options.images_scale is not None:
@@ -240,7 +261,7 @@ class DocumentConverter:
  return page

  # Extract and populate the page cells and store it in the page object
- def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
+ def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
  page.cells = page._backend.get_text_cells()

  # DEBUG code:
@@ -255,12 +276,12 @@ class DocumentConverter:

  return page

- def assemble_doc(self, converted_doc: ConvertedDocument):
+ def _assemble_doc(self, conv_res: ConversionResult):
  all_elements = []
  all_headers = []
  all_body = []

- for p in converted_doc.pages:
+ for p in conv_res.pages:

  for el in p.assembled.body:
  all_body.append(el)
@@ -269,8 +290,8 @@ class DocumentConverter:
  for el in p.assembled.elements:
  all_elements.append(el)

- converted_doc.assembled = AssembledUnit(
+ conv_res.assembled = AssembledUnit(
  elements=all_elements, headers=all_headers, body=all_body
  )

- converted_doc.output = self.glm_model(converted_doc)
+ conv_res.output = self.glm_model(conv_res)
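
convert_single() now returns the ConversionResult itself rather than a bare DsDocument, and pages whose backend failed to parse are reported as ErrorItem entries under PARTIAL_SUCCESS. A hedged sketch of the updated calling code, assuming the public docling.document_converter module path and a hypothetical input file:

    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()
    conv_res = converter.convert_single("report.pdf")  # hypothetical file
    print(conv_res.render_as_markdown())
    for err in conv_res.errors:  # ErrorItem entries for pages that failed to parse
        print(err.error_message)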
@@ -3,21 +3,21 @@ import logging
  from abc import abstractmethod
  from typing import Iterable, List, Tuple

- import numpy
  import numpy as np
  from PIL import Image, ImageDraw
  from rtree import index
  from scipy.ndimage import find_objects, label

  from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+ from docling.datamodel.pipeline_options import OcrOptions

  _log = logging.getLogger(__name__)


  class BaseOcrModel:
- def __init__(self, config):
- self.config = config
- self.enabled = config["enabled"]
+ def __init__(self, enabled: bool, options: OcrOptions):
+ self.enabled = enabled
+ self.options = options

  # Computes the optimum amount and coordinates of rectangles to OCR on a given page
  def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
@@ -2,7 +2,7 @@ import copy
  import random

  from deepsearch_glm.nlp_utils import init_nlp_model
- from deepsearch_glm.utils.ds_utils import to_legacy_document_format
+ from deepsearch_glm.utils.doc_utils import to_legacy_document_format
  from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
  from docling_core.types import BaseText
  from docling_core.types import Document as DsDocument
@@ -10,18 +10,22 @@ from docling_core.types import Ref
  from PIL import ImageDraw

  from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
- from docling.datamodel.document import ConvertedDocument
+ from docling.datamodel.document import ConversionResult


  class GlmModel:
  def __init__(self, config):
  self.config = config
+ self.model_names = self.config.get(
+ "model_names", ""
+ ) # "language;term;reference"
  load_pretrained_nlp_models()
- model = init_nlp_model(model_names="language;term;reference")
+ # model = init_nlp_model(model_names="language;term;reference")
+ model = init_nlp_model(model_names=self.model_names)
  self.model = model

- def __call__(self, document: ConvertedDocument) -> DsDocument:
- ds_doc = document.to_ds_document()
+ def __call__(self, conv_res: ConversionResult) -> DsDocument:
+ ds_doc = conv_res._to_ds_document()
  ds_doc_dict = ds_doc.model_dump(by_alias=True)

  glm_doc = self.model.apply_on_doc(ds_doc_dict)
@@ -34,7 +38,7 @@ class GlmModel:
  # DEBUG code:
  def draw_clusters_and_cells(ds_document, page_no):
  clusters_to_draw = []
- image = copy.deepcopy(document.pages[page_no].image)
+ image = copy.deepcopy(conv_res.pages[page_no].image)
  for ix, elem in enumerate(ds_document.main_text):
  if isinstance(elem, BaseText):
  prov = elem.prov[0]
@@ -56,7 +60,7 @@ class GlmModel:
  bbox=BoundingBox.from_tuple(
  coord=prov.bbox,
  origin=CoordOrigin.BOTTOMLEFT,
- ).to_top_left_origin(document.pages[page_no].size.height),
+ ).to_top_left_origin(conv_res.pages[page_no].size.height),
  )
  )

@@ -4,21 +4,33 @@ from typing import Iterable
  import numpy

  from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+ from docling.datamodel.pipeline_options import EasyOcrOptions
  from docling.models.base_ocr_model import BaseOcrModel

  _log = logging.getLogger(__name__)


  class EasyOcrModel(BaseOcrModel):
- def __init__(self, config):
- super().__init__(config)
+ def __init__(self, enabled: bool, options: EasyOcrOptions):
+ super().__init__(enabled=enabled, options=options)
+ self.options: EasyOcrOptions

  self.scale = 3 # multiplier for 72 dpi == 216 dpi.

  if self.enabled:
- import easyocr
+ try:
+ import easyocr
+ except ImportError:
+ raise ImportError(
+ "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
+ "Alternatively, Docling has support for other OCR engines. See the documentation."
+ )

- self.reader = easyocr.Reader(config["lang"])
+ self.reader = easyocr.Reader(
+ lang_list=self.options.lang,
+ model_storage_directory=self.options.model_storage_directory,
+ download_enabled=self.options.download_enabled,
+ )

  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:

@@ -31,6 +43,9 @@ class EasyOcrModel(BaseOcrModel):

  all_ocr_cells = []
  for ocr_rect in ocr_rects:
+ # Skip zero area boxes
+ if ocr_rect.area() == 0:
+ continue
  high_res_image = page._backend.get_page_image(
  scale=self.scale, cropbox=ocr_rect
  )
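
The OCR models no longer take a plain config dict (`config["enabled"]`, `config["lang"]`); they are built from an enabled flag plus a typed options object. A sketch of the new constructor call, assuming the docling.models.easyocr_model module path, which is not shown in this diff:

    from docling.datamodel.pipeline_options import EasyOcrOptions
    from docling.models.easyocr_model import EasyOcrModel  # assumed module path

    ocr_model = EasyOcrModel(
        enabled=True,
        options=EasyOcrOptions(lang=["en"], download_enabled=True),
    )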
@@ -33,6 +33,7 @@ class LayoutModel:
  "Page-footer",
  "Code",
  "List-item",
+ # "Title"
  # "Formula",
  ]
  PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
@@ -69,9 +70,7 @@ class LayoutModel:
  "Key-Value Region": 0.45,
  }

- CLASS_REMAPPINGS = {
- "Document Index": "Table",
- }
+ CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}

  _log.debug("================= Start postprocess function ====================")
  start_time = time.time()
@@ -277,6 +276,7 @@ class LayoutModel:
  bbox=BoundingBox.model_validate(pred_item),
  cells=[],
  )
+
  clusters.append(cluster)

  # Map cells to clusters