docling 2.0.0__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {docling-2.0.0 → docling-2.1.0}/PKG-INFO +4 -4
  2. {docling-2.0.0 → docling-2.1.0}/README.md +3 -3
  3. {docling-2.0.0 → docling-2.1.0}/docling/datamodel/document.py +16 -7
  4. {docling-2.0.0 → docling-2.1.0}/docling/datamodel/pipeline_options.py +3 -0
  5. {docling-2.0.0 → docling-2.1.0}/docling/models/base_ocr_model.py +9 -1
  6. {docling-2.0.0 → docling-2.1.0}/docling/models/ds_glm_model.py +16 -7
  7. docling-2.1.0/docling/models/easyocr_model.py +90 -0
  8. {docling-2.0.0 → docling-2.1.0}/docling/models/layout_model.py +63 -59
  9. docling-2.1.0/docling/models/page_assemble_model.py +172 -0
  10. {docling-2.0.0 → docling-2.1.0}/docling/models/page_preprocessing_model.py +7 -3
  11. docling-2.1.0/docling/models/table_structure_model.py +171 -0
  12. {docling-2.0.0 → docling-2.1.0}/docling/models/tesseract_ocr_cli_model.py +56 -52
  13. {docling-2.0.0 → docling-2.1.0}/docling/models/tesseract_ocr_model.py +50 -45
  14. {docling-2.0.0 → docling-2.1.0}/docling/pipeline/standard_pdf_pipeline.py +7 -7
  15. {docling-2.0.0 → docling-2.1.0}/pyproject.toml +3 -1
  16. docling-2.0.0/docling/models/easyocr_model.py +0 -88
  17. docling-2.0.0/docling/models/page_assemble_model.py +0 -164
  18. docling-2.0.0/docling/models/table_structure_model.py +0 -162
  19. {docling-2.0.0 → docling-2.1.0}/LICENSE +0 -0
  20. {docling-2.0.0 → docling-2.1.0}/docling/__init__.py +0 -0
  21. {docling-2.0.0 → docling-2.1.0}/docling/backend/__init__.py +0 -0
  22. {docling-2.0.0 → docling-2.1.0}/docling/backend/abstract_backend.py +0 -0
  23. {docling-2.0.0 → docling-2.1.0}/docling/backend/docling_parse_backend.py +0 -0
  24. {docling-2.0.0 → docling-2.1.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  25. {docling-2.0.0 → docling-2.1.0}/docling/backend/html_backend.py +0 -0
  26. {docling-2.0.0 → docling-2.1.0}/docling/backend/mspowerpoint_backend.py +0 -0
  27. {docling-2.0.0 → docling-2.1.0}/docling/backend/msword_backend.py +0 -0
  28. {docling-2.0.0 → docling-2.1.0}/docling/backend/pdf_backend.py +0 -0
  29. {docling-2.0.0 → docling-2.1.0}/docling/backend/pypdfium2_backend.py +0 -0
  30. {docling-2.0.0 → docling-2.1.0}/docling/cli/__init__.py +0 -0
  31. {docling-2.0.0 → docling-2.1.0}/docling/cli/main.py +0 -0
  32. {docling-2.0.0 → docling-2.1.0}/docling/datamodel/__init__.py +0 -0
  33. {docling-2.0.0 → docling-2.1.0}/docling/datamodel/base_models.py +0 -0
  34. {docling-2.0.0 → docling-2.1.0}/docling/datamodel/settings.py +0 -0
  35. {docling-2.0.0 → docling-2.1.0}/docling/document_converter.py +0 -0
  36. {docling-2.0.0 → docling-2.1.0}/docling/models/__init__.py +0 -0
  37. {docling-2.0.0 → docling-2.1.0}/docling/models/base_model.py +0 -0
  38. {docling-2.0.0 → docling-2.1.0}/docling/pipeline/__init__.py +0 -0
  39. {docling-2.0.0 → docling-2.1.0}/docling/pipeline/base_pipeline.py +0 -0
  40. {docling-2.0.0 → docling-2.1.0}/docling/pipeline/simple_pipeline.py +0 -0
  41. {docling-2.0.0 → docling-2.1.0}/docling/utils/__init__.py +0 -0
  42. {docling-2.0.0 → docling-2.1.0}/docling/utils/export.py +0 -0
  43. {docling-2.0.0 → docling-2.1.0}/docling/utils/layout_utils.py +0 -0
  44. {docling-2.0.0 → docling-2.1.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.0.0
3
+ Version: 2.1.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -50,7 +50,7 @@ Description-Content-Type: text/markdown
50
50
 
51
51
  <p align="center">
52
52
  <a href="https://github.com/ds4sd/docling">
53
- <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
53
+ <img loading="lazy" alt="Docling" src="docs/assets/docling_processing.png" width="100%"/>
54
54
  </a>
55
55
  </p>
56
56
 
@@ -69,6 +69,7 @@ Description-Content-Type: text/markdown
69
69
 
70
70
  Docling parses documents and exports them to the desired format with ease and speed.
71
71
 
72
+
72
73
  ## Features
73
74
 
74
75
  * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
@@ -94,7 +95,7 @@ More [detailed installation instructions](https://ds4sd.github.io/docling/instal
94
95
 
95
96
  ## Getting started
96
97
 
97
- To convert invidual documents, use `convert()`, for example:
98
+ To convert individual documents, use `convert()`, for example:
98
99
 
99
100
  ```python
100
101
  from docling.document_converter import DocumentConverter
@@ -103,7 +104,6 @@ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
103
104
  converter = DocumentConverter()
104
105
  result = converter.convert(source)
105
106
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
106
- print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
107
107
  ```
108
108
 
109
109
 
@@ -1,6 +1,6 @@
1
1
  <p align="center">
2
2
  <a href="https://github.com/ds4sd/docling">
3
- <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
3
+ <img loading="lazy" alt="Docling" src="docs/assets/docling_processing.png" width="100%"/>
4
4
  </a>
5
5
  </p>
6
6
 
@@ -19,6 +19,7 @@
19
19
 
20
20
  Docling parses documents and exports them to the desired format with ease and speed.
21
21
 
22
+
22
23
  ## Features
23
24
 
24
25
  * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
@@ -44,7 +45,7 @@ More [detailed installation instructions](https://ds4sd.github.io/docling/instal
44
45
 
45
46
  ## Getting started
46
47
 
47
- To convert invidual documents, use `convert()`, for example:
48
+ To convert individual documents, use `convert()`, for example:
48
49
 
49
50
  ```python
50
51
  from docling.document_converter import DocumentConverter
@@ -53,7 +54,6 @@ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
53
54
  converter = DocumentConverter()
54
55
  result = converter.convert(source)
55
56
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
56
- print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
57
57
  ```
58
58
 
59
59
 
@@ -6,12 +6,6 @@ from pathlib import Path, PurePath
6
6
  from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
7
7
 
8
8
  import filetype
9
- from docling_core.types import BaseText
10
- from docling_core.types import Document as DsDocument
11
- from docling_core.types import DocumentDescription as DsDocumentDescription
12
- from docling_core.types import FileInfoObject as DsFileInfoObject
13
- from docling_core.types import PageDimensions, PageReference, Prov, Ref
14
- from docling_core.types import Table as DsSchemaTable
15
9
  from docling_core.types.doc import (
16
10
  DocItem,
17
11
  DocItemLabel,
@@ -22,7 +16,22 @@ from docling_core.types.doc import (
22
16
  TextItem,
23
17
  )
24
18
  from docling_core.types.doc.document import ListItem
25
- from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
19
+ from docling_core.types.legacy_doc.base import (
20
+ BaseText,
21
+ Figure,
22
+ GlmTableCell,
23
+ PageDimensions,
24
+ PageReference,
25
+ Prov,
26
+ Ref,
27
+ )
28
+ from docling_core.types.legacy_doc.base import Table as DsSchemaTable
29
+ from docling_core.types.legacy_doc.base import TableCell
30
+ from docling_core.types.legacy_doc.document import (
31
+ CCSDocumentDescription as DsDocumentDescription,
32
+ )
33
+ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
34
+ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
26
35
  from docling_core.utils.file import resolve_file_source
27
36
  from pydantic import BaseModel
28
37
  from typing_extensions import deprecated
@@ -22,6 +22,9 @@ class TableStructureOptions(BaseModel):
22
22
 
23
23
  class OcrOptions(BaseModel):
24
24
  kind: str
25
+ bitmap_area_threshold: float = (
26
+ 0.05 # fraction of the page area (0.05 = 5%) a bitmap must exceed to be processed with OCR
27
+ )
25
28
 
26
29
 
27
30
  class EasyOcrOptions(OcrOptions):
@@ -69,7 +69,7 @@ class BaseOcrModel:
69
69
  coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
70
70
 
71
71
  # return full-page rectangle if sufficiently covered with bitmaps
72
- if coverage > BITMAP_COVERAGE_TRESHOLD:
72
+ if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
73
73
  return [
74
74
  BoundingBox(
75
75
  l=0,
@@ -81,6 +81,14 @@ class BaseOcrModel:
81
81
  ]
82
82
  # return individual rectangles if the bitmap coverage is smaller
83
83
  else: # coverage <= BITMAP_COVERAGE_TRESHOLD:
84
+
85
+ # skip OCR if the bitmap area on the page is smaller than the options threshold
86
+ ocr_rects = [
87
+ rect
88
+ for rect in ocr_rects
89
+ if rect.area() / (page.size.width * page.size.height)
90
+ > self.options.bitmap_area_threshold
91
+ ]
84
92
  return ocr_rects
85
93
 
86
94
  # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
@@ -5,15 +5,23 @@ from typing import List, Union
5
5
  from deepsearch_glm.nlp_utils import init_nlp_model
6
6
  from deepsearch_glm.utils.doc_utils import to_docling_document
7
7
  from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
8
- from docling_core.types import BaseText
9
- from docling_core.types import Document as DsDocument
10
- from docling_core.types import DocumentDescription as DsDocumentDescription
11
- from docling_core.types import FileInfoObject as DsFileInfoObject
12
- from docling_core.types import PageDimensions, PageReference, Prov, Ref
13
- from docling_core.types import Table as DsSchemaTable
14
8
  from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
15
9
  from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
16
- from docling_core.types.legacy_doc.base import Figure, TableCell
10
+ from docling_core.types.legacy_doc.base import (
11
+ Figure,
12
+ PageDimensions,
13
+ PageReference,
14
+ Prov,
15
+ Ref,
16
+ )
17
+ from docling_core.types.legacy_doc.base import Table as DsSchemaTable
18
+ from docling_core.types.legacy_doc.base import TableCell
19
+ from docling_core.types.legacy_doc.document import BaseText
20
+ from docling_core.types.legacy_doc.document import (
21
+ CCSDocumentDescription as DsDocumentDescription,
22
+ )
23
+ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
24
+ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
17
25
  from PIL import ImageDraw
18
26
  from pydantic import BaseModel, ConfigDict
19
27
 
@@ -202,6 +210,7 @@ class GlmModel:
202
210
  page_dimensions = [
203
211
  PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
204
212
  for p in conv_res.pages
213
+ if p.size is not None
205
214
  ]
206
215
 
207
216
  ds_doc: DsDocument = DsDocument(
@@ -0,0 +1,90 @@
1
+ import logging
2
+ from typing import Iterable
3
+
4
+ import numpy
5
+ from docling_core.types.doc import BoundingBox, CoordOrigin
6
+
7
+ from docling.datamodel.base_models import OcrCell, Page
8
+ from docling.datamodel.pipeline_options import EasyOcrOptions
9
+ from docling.models.base_ocr_model import BaseOcrModel
10
+
11
+ _log = logging.getLogger(__name__)
12
+
13
+
14
+ class EasyOcrModel(BaseOcrModel):
15
+ def __init__(self, enabled: bool, options: EasyOcrOptions):
16
+ super().__init__(enabled=enabled, options=options)
17
+ self.options: EasyOcrOptions
18
+
19
+ self.scale = 3 # multiplier for 72 dpi == 216 dpi.
20
+
21
+ if self.enabled:
22
+ try:
23
+ import easyocr
24
+ except ImportError:
25
+ raise ImportError(
26
+ "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
27
+ "Alternatively, Docling has support for other OCR engines. See the documentation."
28
+ )
29
+
30
+ self.reader = easyocr.Reader(
31
+ lang_list=self.options.lang,
32
+ model_storage_directory=self.options.model_storage_directory,
33
+ download_enabled=self.options.download_enabled,
34
+ )
35
+
36
+ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
37
+
38
+ if not self.enabled:
39
+ yield from page_batch
40
+ return
41
+
42
+ for page in page_batch:
43
+ assert page._backend is not None
44
+ if not page._backend.is_valid():
45
+ yield page
46
+ else:
47
+ ocr_rects = self.get_ocr_rects(page)
48
+
49
+ all_ocr_cells = []
50
+ for ocr_rect in ocr_rects:
51
+ # Skip zero area boxes
52
+ if ocr_rect.area() == 0:
53
+ continue
54
+ high_res_image = page._backend.get_page_image(
55
+ scale=self.scale, cropbox=ocr_rect
56
+ )
57
+ im = numpy.array(high_res_image)
58
+ result = self.reader.readtext(im)
59
+
60
+ del high_res_image
61
+ del im
62
+
63
+ cells = [
64
+ OcrCell(
65
+ id=ix,
66
+ text=line[1],
67
+ confidence=line[2],
68
+ bbox=BoundingBox.from_tuple(
69
+ coord=(
70
+ (line[0][0][0] / self.scale) + ocr_rect.l,
71
+ (line[0][0][1] / self.scale) + ocr_rect.t,
72
+ (line[0][2][0] / self.scale) + ocr_rect.l,
73
+ (line[0][2][1] / self.scale) + ocr_rect.t,
74
+ ),
75
+ origin=CoordOrigin.TOPLEFT,
76
+ ),
77
+ )
78
+ for ix, line in enumerate(result)
79
+ ]
80
+ all_ocr_cells.extend(cells)
81
+
82
+ ## Remove OCR cells which overlap with programmatic cells.
83
+ filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
84
+
85
+ page.cells.extend(filtered_ocr_cells)
86
+
87
+ # DEBUG code:
88
+ # self.draw_ocr_rects_and_cells(page, ocr_rects)
89
+
90
+ yield page
@@ -273,68 +273,72 @@ class LayoutModel(BasePageModel):
273
273
 
274
274
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
275
275
  for page in page_batch:
276
- assert page.size is not None
277
-
278
- clusters = []
279
- for ix, pred_item in enumerate(
280
- self.layout_predictor.predict(page.get_image(scale=1.0))
281
- ):
282
- label = DocItemLabel(
283
- pred_item["label"].lower().replace(" ", "_").replace("-", "_")
284
- ) # Temporary, until docling-ibm-model uses docling-core types
285
- cluster = Cluster(
286
- id=ix,
287
- label=label,
288
- confidence=pred_item["confidence"],
289
- bbox=BoundingBox.model_validate(pred_item),
290
- cells=[],
291
- )
292
- clusters.append(cluster)
293
-
294
- # Map cells to clusters
295
- # TODO: Remove, postprocess should take care of it anyway.
296
- for cell in page.cells:
297
- for cluster in clusters:
298
- if not cell.bbox.area() > 0:
299
- overlap_frac = 0.0
300
- else:
301
- overlap_frac = (
302
- cell.bbox.intersection_area_with(cluster.bbox)
303
- / cell.bbox.area()
304
- )
305
-
306
- if overlap_frac > 0.5:
307
- cluster.cells.append(cell)
308
-
309
- # Pre-sort clusters
310
- # clusters = self.sort_clusters_by_cell_order(clusters)
311
-
312
- # DEBUG code:
313
- def draw_clusters_and_cells():
314
- image = copy.deepcopy(page.image)
315
- draw = ImageDraw.Draw(image)
316
- for c in clusters:
317
- x0, y0, x1, y1 = c.bbox.as_tuple()
318
- draw.rectangle([(x0, y0), (x1, y1)], outline="green")
319
-
320
- cell_color = (
321
- random.randint(30, 140),
322
- random.randint(30, 140),
323
- random.randint(30, 140),
276
+ assert page._backend is not None
277
+ if not page._backend.is_valid():
278
+ yield page
279
+ else:
280
+ assert page.size is not None
281
+
282
+ clusters = []
283
+ for ix, pred_item in enumerate(
284
+ self.layout_predictor.predict(page.get_image(scale=1.0))
285
+ ):
286
+ label = DocItemLabel(
287
+ pred_item["label"].lower().replace(" ", "_").replace("-", "_")
288
+ ) # Temporary, until docling-ibm-model uses docling-core types
289
+ cluster = Cluster(
290
+ id=ix,
291
+ label=label,
292
+ confidence=pred_item["confidence"],
293
+ bbox=BoundingBox.model_validate(pred_item),
294
+ cells=[],
324
295
  )
325
- for tc in c.cells: # [:1]:
326
- x0, y0, x1, y1 = tc.bbox.as_tuple()
327
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
328
- image.show()
296
+ clusters.append(cluster)
297
+
298
+ # Map cells to clusters
299
+ # TODO: Remove, postprocess should take care of it anyway.
300
+ for cell in page.cells:
301
+ for cluster in clusters:
302
+ if not cell.bbox.area() > 0:
303
+ overlap_frac = 0.0
304
+ else:
305
+ overlap_frac = (
306
+ cell.bbox.intersection_area_with(cluster.bbox)
307
+ / cell.bbox.area()
308
+ )
309
+
310
+ if overlap_frac > 0.5:
311
+ cluster.cells.append(cell)
312
+
313
+ # Pre-sort clusters
314
+ # clusters = self.sort_clusters_by_cell_order(clusters)
315
+
316
+ # DEBUG code:
317
+ def draw_clusters_and_cells():
318
+ image = copy.deepcopy(page.image)
319
+ draw = ImageDraw.Draw(image)
320
+ for c in clusters:
321
+ x0, y0, x1, y1 = c.bbox.as_tuple()
322
+ draw.rectangle([(x0, y0), (x1, y1)], outline="green")
323
+
324
+ cell_color = (
325
+ random.randint(30, 140),
326
+ random.randint(30, 140),
327
+ random.randint(30, 140),
328
+ )
329
+ for tc in c.cells: # [:1]:
330
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
331
+ draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
332
+ image.show()
329
333
 
330
- # draw_clusters_and_cells()
334
+ # draw_clusters_and_cells()
331
335
 
332
- clusters, page.cells = self.postprocess(
333
- clusters, page.cells, page.size.height
334
- )
336
+ clusters, page.cells = self.postprocess(
337
+ clusters, page.cells, page.size.height
338
+ )
335
339
 
336
- # draw_clusters_and_cells()
340
+ # draw_clusters_and_cells()
337
341
 
338
- page.predictions.layout = LayoutPrediction(clusters=clusters)
342
+ page.predictions.layout = LayoutPrediction(clusters=clusters)
339
343
 
340
- yield page
344
+ yield page
@@ -0,0 +1,172 @@
1
+ import logging
2
+ import re
3
+ from typing import Iterable, List
4
+
5
+ from pydantic import BaseModel
6
+
7
+ from docling.datamodel.base_models import (
8
+ AssembledUnit,
9
+ FigureElement,
10
+ Page,
11
+ PageElement,
12
+ Table,
13
+ TextElement,
14
+ )
15
+ from docling.models.base_model import BasePageModel
16
+ from docling.models.layout_model import LayoutModel
17
+
18
+ _log = logging.getLogger(__name__)
19
+
20
+
21
+ class PageAssembleOptions(BaseModel):
22
+ keep_images: bool = False
23
+
24
+
25
+ class PageAssembleModel(BasePageModel):
26
+ def __init__(self, options: PageAssembleOptions):
27
+ self.options = options
28
+
29
+ def sanitize_text(self, lines):
30
+ if len(lines) <= 1:
31
+ return " ".join(lines)
32
+
33
+ for ix, line in enumerate(lines[1:]):
34
+ prev_line = lines[ix]
35
+
36
+ if prev_line.endswith("-"):
37
+ prev_words = re.findall(r"\b[\w]+\b", prev_line)
38
+ line_words = re.findall(r"\b[\w]+\b", line)
39
+
40
+ if (
41
+ len(prev_words)
42
+ and len(line_words)
43
+ and prev_words[-1].isalnum()
44
+ and line_words[0].isalnum()
45
+ ):
46
+ lines[ix] = prev_line[:-1]
47
+ else:
48
+ lines[ix] += " "
49
+
50
+ sanitized_text = "".join(lines)
51
+
52
+ return sanitized_text.strip() # Strip any leading or trailing whitespace
53
+
54
+ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
55
+ for page in page_batch:
56
+ assert page._backend is not None
57
+ if not page._backend.is_valid():
58
+ yield page
59
+ else:
60
+ assert page.predictions.layout is not None
61
+
62
+ # assembles some JSON output page by page.
63
+
64
+ elements: List[PageElement] = []
65
+ headers: List[PageElement] = []
66
+ body: List[PageElement] = []
67
+
68
+ for cluster in page.predictions.layout.clusters:
69
+ # _log.info("Cluster label seen:", cluster.label)
70
+ if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
71
+
72
+ textlines = [
73
+ cell.text.replace("\x02", "-").strip()
74
+ for cell in cluster.cells
75
+ if len(cell.text.strip()) > 0
76
+ ]
77
+ text = self.sanitize_text(textlines)
78
+ text_el = TextElement(
79
+ label=cluster.label,
80
+ id=cluster.id,
81
+ text=text,
82
+ page_no=page.page_no,
83
+ cluster=cluster,
84
+ )
85
+ elements.append(text_el)
86
+
87
+ if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
88
+ headers.append(text_el)
89
+ else:
90
+ body.append(text_el)
91
+ elif cluster.label == LayoutModel.TABLE_LABEL:
92
+ tbl = None
93
+ if page.predictions.tablestructure:
94
+ tbl = page.predictions.tablestructure.table_map.get(
95
+ cluster.id, None
96
+ )
97
+ if (
98
+ not tbl
99
+ ): # fallback: add table without structure, if it isn't present
100
+ tbl = Table(
101
+ label=cluster.label,
102
+ id=cluster.id,
103
+ text="",
104
+ otsl_seq=[],
105
+ table_cells=[],
106
+ cluster=cluster,
107
+ page_no=page.page_no,
108
+ )
109
+
110
+ elements.append(tbl)
111
+ body.append(tbl)
112
+ elif cluster.label == LayoutModel.FIGURE_LABEL:
113
+ fig = None
114
+ if page.predictions.figures_classification:
115
+ fig = (
116
+ page.predictions.figures_classification.figure_map.get(
117
+ cluster.id, None
118
+ )
119
+ )
120
+ if (
121
+ not fig
122
+ ): # fallback: add figure without classification, if it isn't present
123
+ fig = FigureElement(
124
+ label=cluster.label,
125
+ id=cluster.id,
126
+ text="",
127
+ data=None,
128
+ cluster=cluster,
129
+ page_no=page.page_no,
130
+ )
131
+ elements.append(fig)
132
+ body.append(fig)
133
+ elif cluster.label == LayoutModel.FORMULA_LABEL:
134
+ equation = None
135
+ if page.predictions.equations_prediction:
136
+ equation = (
137
+ page.predictions.equations_prediction.equation_map.get(
138
+ cluster.id, None
139
+ )
140
+ )
141
+ if (
142
+ not equation
143
+ ): # fallback: build the formula from the cluster's cell text, if it isn't present
144
+ text = self.sanitize_text(
145
+ [
146
+ cell.text.replace("\x02", "-").strip()
147
+ for cell in cluster.cells
148
+ if len(cell.text.strip()) > 0
149
+ ]
150
+ )
151
+ equation = TextElement(
152
+ label=cluster.label,
153
+ id=cluster.id,
154
+ cluster=cluster,
155
+ page_no=page.page_no,
156
+ text=text,
157
+ )
158
+ elements.append(equation)
159
+ body.append(equation)
160
+
161
+ page.assembled = AssembledUnit(
162
+ elements=elements, headers=headers, body=body
163
+ )
164
+
165
+ # Remove page images (can be disabled)
166
+ if not self.options.keep_images:
167
+ page._image_cache = {}
168
+
169
+ # Unload backend
170
+ page._backend.unload()
171
+
172
+ yield page
@@ -17,9 +17,13 @@ class PagePreprocessingModel(BasePageModel):
17
17
 
18
18
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
19
19
  for page in page_batch:
20
- page = self._populate_page_images(page)
21
- page = self._parse_page_cells(page)
22
- yield page
20
+ assert page._backend is not None
21
+ if not page._backend.is_valid():
22
+ yield page
23
+ else:
24
+ page = self._populate_page_images(page)
25
+ page = self._parse_page_cells(page)
26
+ yield page
23
27
 
24
28
  # Generate the page image and store it in the page object
25
29
  def _populate_page_images(self, page: Page) -> Page: