docling 2.0.0__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.0.0 → docling-2.1.0}/PKG-INFO +4 -4
- {docling-2.0.0 → docling-2.1.0}/README.md +3 -3
- {docling-2.0.0 → docling-2.1.0}/docling/datamodel/document.py +16 -7
- {docling-2.0.0 → docling-2.1.0}/docling/datamodel/pipeline_options.py +3 -0
- {docling-2.0.0 → docling-2.1.0}/docling/models/base_ocr_model.py +9 -1
- {docling-2.0.0 → docling-2.1.0}/docling/models/ds_glm_model.py +16 -7
- docling-2.1.0/docling/models/easyocr_model.py +90 -0
- {docling-2.0.0 → docling-2.1.0}/docling/models/layout_model.py +63 -59
- docling-2.1.0/docling/models/page_assemble_model.py +172 -0
- {docling-2.0.0 → docling-2.1.0}/docling/models/page_preprocessing_model.py +7 -3
- docling-2.1.0/docling/models/table_structure_model.py +171 -0
- {docling-2.0.0 → docling-2.1.0}/docling/models/tesseract_ocr_cli_model.py +56 -52
- {docling-2.0.0 → docling-2.1.0}/docling/models/tesseract_ocr_model.py +50 -45
- {docling-2.0.0 → docling-2.1.0}/docling/pipeline/standard_pdf_pipeline.py +7 -7
- {docling-2.0.0 → docling-2.1.0}/pyproject.toml +3 -1
- docling-2.0.0/docling/models/easyocr_model.py +0 -88
- docling-2.0.0/docling/models/page_assemble_model.py +0 -164
- docling-2.0.0/docling/models/table_structure_model.py +0 -162
- {docling-2.0.0 → docling-2.1.0}/LICENSE +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/__init__.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/backend/__init__.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/backend/html_backend.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/cli/__init__.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/cli/main.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/datamodel/settings.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/document_converter.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/models/__init__.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/models/base_model.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/utils/__init__.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/utils/export.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/utils/layout_utils.py +0 -0
- {docling-2.0.0 → docling-2.1.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.0.0
|
3
|
+
Version: 2.1.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -50,7 +50,7 @@ Description-Content-Type: text/markdown
|
|
50
50
|
|
51
51
|
<p align="center">
|
52
52
|
<a href="https://github.com/ds4sd/docling">
|
53
|
-
<img loading="lazy" alt="Docling" src="
|
53
|
+
<img loading="lazy" alt="Docling" src="docs/assets/docling_processing.png" width="100%"/>
|
54
54
|
</a>
|
55
55
|
</p>
|
56
56
|
|
@@ -69,6 +69,7 @@ Description-Content-Type: text/markdown
|
|
69
69
|
|
70
70
|
Docling parses documents and exports them to the desired format with ease and speed.
|
71
71
|
|
72
|
+
|
72
73
|
## Features
|
73
74
|
|
74
75
|
* 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
|
@@ -94,7 +95,7 @@ More [detailed installation instructions](https://ds4sd.github.io/docling/instal
|
|
94
95
|
|
95
96
|
## Getting started
|
96
97
|
|
97
|
-
To convert
|
98
|
+
To convert individual documents, use `convert()`, for example:
|
98
99
|
|
99
100
|
```python
|
100
101
|
from docling.document_converter import DocumentConverter
|
@@ -103,7 +104,6 @@ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
|
103
104
|
converter = DocumentConverter()
|
104
105
|
result = converter.convert(source)
|
105
106
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
106
|
-
print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
|
107
107
|
```
|
108
108
|
|
109
109
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
<p align="center">
|
2
2
|
<a href="https://github.com/ds4sd/docling">
|
3
|
-
<img loading="lazy" alt="Docling" src="
|
3
|
+
<img loading="lazy" alt="Docling" src="docs/assets/docling_processing.png" width="100%"/>
|
4
4
|
</a>
|
5
5
|
</p>
|
6
6
|
|
@@ -19,6 +19,7 @@
|
|
19
19
|
|
20
20
|
Docling parses documents and exports them to the desired format with ease and speed.
|
21
21
|
|
22
|
+
|
22
23
|
## Features
|
23
24
|
|
24
25
|
* 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
|
@@ -44,7 +45,7 @@ More [detailed installation instructions](https://ds4sd.github.io/docling/instal
|
|
44
45
|
|
45
46
|
## Getting started
|
46
47
|
|
47
|
-
To convert
|
48
|
+
To convert individual documents, use `convert()`, for example:
|
48
49
|
|
49
50
|
```python
|
50
51
|
from docling.document_converter import DocumentConverter
|
@@ -53,7 +54,6 @@ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
|
53
54
|
converter = DocumentConverter()
|
54
55
|
result = converter.convert(source)
|
55
56
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
56
|
-
print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
|
57
57
|
```
|
58
58
|
|
59
59
|
|
@@ -6,12 +6,6 @@ from pathlib import Path, PurePath
|
|
6
6
|
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
|
7
7
|
|
8
8
|
import filetype
|
9
|
-
from docling_core.types import BaseText
|
10
|
-
from docling_core.types import Document as DsDocument
|
11
|
-
from docling_core.types import DocumentDescription as DsDocumentDescription
|
12
|
-
from docling_core.types import FileInfoObject as DsFileInfoObject
|
13
|
-
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
14
|
-
from docling_core.types import Table as DsSchemaTable
|
15
9
|
from docling_core.types.doc import (
|
16
10
|
DocItem,
|
17
11
|
DocItemLabel,
|
@@ -22,7 +16,22 @@ from docling_core.types.doc import (
|
|
22
16
|
TextItem,
|
23
17
|
)
|
24
18
|
from docling_core.types.doc.document import ListItem
|
25
|
-
from docling_core.types.legacy_doc.base import
|
19
|
+
from docling_core.types.legacy_doc.base import (
|
20
|
+
BaseText,
|
21
|
+
Figure,
|
22
|
+
GlmTableCell,
|
23
|
+
PageDimensions,
|
24
|
+
PageReference,
|
25
|
+
Prov,
|
26
|
+
Ref,
|
27
|
+
)
|
28
|
+
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
29
|
+
from docling_core.types.legacy_doc.base import TableCell
|
30
|
+
from docling_core.types.legacy_doc.document import (
|
31
|
+
CCSDocumentDescription as DsDocumentDescription,
|
32
|
+
)
|
33
|
+
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
34
|
+
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
26
35
|
from docling_core.utils.file import resolve_file_source
|
27
36
|
from pydantic import BaseModel
|
28
37
|
from typing_extensions import deprecated
|
@@ -69,7 +69,7 @@ class BaseOcrModel:
|
|
69
69
|
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
70
70
|
|
71
71
|
# return full-page rectangle if sufficiently covered with bitmaps
|
72
|
-
if coverage > BITMAP_COVERAGE_TRESHOLD:
|
72
|
+
if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
|
73
73
|
return [
|
74
74
|
BoundingBox(
|
75
75
|
l=0,
|
@@ -81,6 +81,14 @@ class BaseOcrModel:
|
|
81
81
|
]
|
82
82
|
# return individual rectangles if the bitmap coverage is smaller
|
83
83
|
else: # coverage <= BITMAP_COVERAGE_TRESHOLD:
|
84
|
+
|
85
|
+
# skip OCR if the bitmap area on the page is smaller than the options threshold
|
86
|
+
ocr_rects = [
|
87
|
+
rect
|
88
|
+
for rect in ocr_rects
|
89
|
+
if rect.area() / (page.size.width * page.size.height)
|
90
|
+
> self.options.bitmap_area_threshold
|
91
|
+
]
|
84
92
|
return ocr_rects
|
85
93
|
|
86
94
|
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
@@ -5,15 +5,23 @@ from typing import List, Union
|
|
5
5
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
6
6
|
from deepsearch_glm.utils.doc_utils import to_docling_document
|
7
7
|
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
8
|
-
from docling_core.types import BaseText
|
9
|
-
from docling_core.types import Document as DsDocument
|
10
|
-
from docling_core.types import DocumentDescription as DsDocumentDescription
|
11
|
-
from docling_core.types import FileInfoObject as DsFileInfoObject
|
12
|
-
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
13
|
-
from docling_core.types import Table as DsSchemaTable
|
14
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
|
15
9
|
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
16
|
-
from docling_core.types.legacy_doc.base import
|
10
|
+
from docling_core.types.legacy_doc.base import (
|
11
|
+
Figure,
|
12
|
+
PageDimensions,
|
13
|
+
PageReference,
|
14
|
+
Prov,
|
15
|
+
Ref,
|
16
|
+
)
|
17
|
+
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
18
|
+
from docling_core.types.legacy_doc.base import TableCell
|
19
|
+
from docling_core.types.legacy_doc.document import BaseText
|
20
|
+
from docling_core.types.legacy_doc.document import (
|
21
|
+
CCSDocumentDescription as DsDocumentDescription,
|
22
|
+
)
|
23
|
+
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
24
|
+
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
17
25
|
from PIL import ImageDraw
|
18
26
|
from pydantic import BaseModel, ConfigDict
|
19
27
|
|
@@ -202,6 +210,7 @@ class GlmModel:
|
|
202
210
|
page_dimensions = [
|
203
211
|
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
204
212
|
for p in conv_res.pages
|
213
|
+
if p.size is not None
|
205
214
|
]
|
206
215
|
|
207
216
|
ds_doc: DsDocument = DsDocument(
|
@@ -0,0 +1,90 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Iterable
|
3
|
+
|
4
|
+
import numpy
|
5
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
6
|
+
|
7
|
+
from docling.datamodel.base_models import OcrCell, Page
|
8
|
+
from docling.datamodel.pipeline_options import EasyOcrOptions
|
9
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
10
|
+
|
11
|
+
_log = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class EasyOcrModel(BaseOcrModel):
|
15
|
+
def __init__(self, enabled: bool, options: EasyOcrOptions):
|
16
|
+
super().__init__(enabled=enabled, options=options)
|
17
|
+
self.options: EasyOcrOptions
|
18
|
+
|
19
|
+
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
20
|
+
|
21
|
+
if self.enabled:
|
22
|
+
try:
|
23
|
+
import easyocr
|
24
|
+
except ImportError:
|
25
|
+
raise ImportError(
|
26
|
+
"EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
|
27
|
+
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
28
|
+
)
|
29
|
+
|
30
|
+
self.reader = easyocr.Reader(
|
31
|
+
lang_list=self.options.lang,
|
32
|
+
model_storage_directory=self.options.model_storage_directory,
|
33
|
+
download_enabled=self.options.download_enabled,
|
34
|
+
)
|
35
|
+
|
36
|
+
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
37
|
+
|
38
|
+
if not self.enabled:
|
39
|
+
yield from page_batch
|
40
|
+
return
|
41
|
+
|
42
|
+
for page in page_batch:
|
43
|
+
assert page._backend is not None
|
44
|
+
if not page._backend.is_valid():
|
45
|
+
yield page
|
46
|
+
else:
|
47
|
+
ocr_rects = self.get_ocr_rects(page)
|
48
|
+
|
49
|
+
all_ocr_cells = []
|
50
|
+
for ocr_rect in ocr_rects:
|
51
|
+
# Skip zero area boxes
|
52
|
+
if ocr_rect.area() == 0:
|
53
|
+
continue
|
54
|
+
high_res_image = page._backend.get_page_image(
|
55
|
+
scale=self.scale, cropbox=ocr_rect
|
56
|
+
)
|
57
|
+
im = numpy.array(high_res_image)
|
58
|
+
result = self.reader.readtext(im)
|
59
|
+
|
60
|
+
del high_res_image
|
61
|
+
del im
|
62
|
+
|
63
|
+
cells = [
|
64
|
+
OcrCell(
|
65
|
+
id=ix,
|
66
|
+
text=line[1],
|
67
|
+
confidence=line[2],
|
68
|
+
bbox=BoundingBox.from_tuple(
|
69
|
+
coord=(
|
70
|
+
(line[0][0][0] / self.scale) + ocr_rect.l,
|
71
|
+
(line[0][0][1] / self.scale) + ocr_rect.t,
|
72
|
+
(line[0][2][0] / self.scale) + ocr_rect.l,
|
73
|
+
(line[0][2][1] / self.scale) + ocr_rect.t,
|
74
|
+
),
|
75
|
+
origin=CoordOrigin.TOPLEFT,
|
76
|
+
),
|
77
|
+
)
|
78
|
+
for ix, line in enumerate(result)
|
79
|
+
]
|
80
|
+
all_ocr_cells.extend(cells)
|
81
|
+
|
82
|
+
## Remove OCR cells which overlap with programmatic cells.
|
83
|
+
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
84
|
+
|
85
|
+
page.cells.extend(filtered_ocr_cells)
|
86
|
+
|
87
|
+
# DEBUG code:
|
88
|
+
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
89
|
+
|
90
|
+
yield page
|
@@ -273,68 +273,72 @@ class LayoutModel(BasePageModel):
|
|
273
273
|
|
274
274
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
275
275
|
for page in page_batch:
|
276
|
-
assert page.
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
# TODO: Remove, postprocess should take care of it anyway.
|
296
|
-
for cell in page.cells:
|
297
|
-
for cluster in clusters:
|
298
|
-
if not cell.bbox.area() > 0:
|
299
|
-
overlap_frac = 0.0
|
300
|
-
else:
|
301
|
-
overlap_frac = (
|
302
|
-
cell.bbox.intersection_area_with(cluster.bbox)
|
303
|
-
/ cell.bbox.area()
|
304
|
-
)
|
305
|
-
|
306
|
-
if overlap_frac > 0.5:
|
307
|
-
cluster.cells.append(cell)
|
308
|
-
|
309
|
-
# Pre-sort clusters
|
310
|
-
# clusters = self.sort_clusters_by_cell_order(clusters)
|
311
|
-
|
312
|
-
# DEBUG code:
|
313
|
-
def draw_clusters_and_cells():
|
314
|
-
image = copy.deepcopy(page.image)
|
315
|
-
draw = ImageDraw.Draw(image)
|
316
|
-
for c in clusters:
|
317
|
-
x0, y0, x1, y1 = c.bbox.as_tuple()
|
318
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
319
|
-
|
320
|
-
cell_color = (
|
321
|
-
random.randint(30, 140),
|
322
|
-
random.randint(30, 140),
|
323
|
-
random.randint(30, 140),
|
276
|
+
assert page._backend is not None
|
277
|
+
if not page._backend.is_valid():
|
278
|
+
yield page
|
279
|
+
else:
|
280
|
+
assert page.size is not None
|
281
|
+
|
282
|
+
clusters = []
|
283
|
+
for ix, pred_item in enumerate(
|
284
|
+
self.layout_predictor.predict(page.get_image(scale=1.0))
|
285
|
+
):
|
286
|
+
label = DocItemLabel(
|
287
|
+
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
|
288
|
+
) # Temporary, until docling-ibm-model uses docling-core types
|
289
|
+
cluster = Cluster(
|
290
|
+
id=ix,
|
291
|
+
label=label,
|
292
|
+
confidence=pred_item["confidence"],
|
293
|
+
bbox=BoundingBox.model_validate(pred_item),
|
294
|
+
cells=[],
|
324
295
|
)
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
296
|
+
clusters.append(cluster)
|
297
|
+
|
298
|
+
# Map cells to clusters
|
299
|
+
# TODO: Remove, postprocess should take care of it anyway.
|
300
|
+
for cell in page.cells:
|
301
|
+
for cluster in clusters:
|
302
|
+
if not cell.bbox.area() > 0:
|
303
|
+
overlap_frac = 0.0
|
304
|
+
else:
|
305
|
+
overlap_frac = (
|
306
|
+
cell.bbox.intersection_area_with(cluster.bbox)
|
307
|
+
/ cell.bbox.area()
|
308
|
+
)
|
309
|
+
|
310
|
+
if overlap_frac > 0.5:
|
311
|
+
cluster.cells.append(cell)
|
312
|
+
|
313
|
+
# Pre-sort clusters
|
314
|
+
# clusters = self.sort_clusters_by_cell_order(clusters)
|
315
|
+
|
316
|
+
# DEBUG code:
|
317
|
+
def draw_clusters_and_cells():
|
318
|
+
image = copy.deepcopy(page.image)
|
319
|
+
draw = ImageDraw.Draw(image)
|
320
|
+
for c in clusters:
|
321
|
+
x0, y0, x1, y1 = c.bbox.as_tuple()
|
322
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
323
|
+
|
324
|
+
cell_color = (
|
325
|
+
random.randint(30, 140),
|
326
|
+
random.randint(30, 140),
|
327
|
+
random.randint(30, 140),
|
328
|
+
)
|
329
|
+
for tc in c.cells: # [:1]:
|
330
|
+
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
331
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
332
|
+
image.show()
|
329
333
|
|
330
|
-
|
334
|
+
# draw_clusters_and_cells()
|
331
335
|
|
332
|
-
|
333
|
-
|
334
|
-
|
336
|
+
clusters, page.cells = self.postprocess(
|
337
|
+
clusters, page.cells, page.size.height
|
338
|
+
)
|
335
339
|
|
336
|
-
|
340
|
+
# draw_clusters_and_cells()
|
337
341
|
|
338
|
-
|
342
|
+
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
339
343
|
|
340
|
-
|
344
|
+
yield page
|
@@ -0,0 +1,172 @@
|
|
1
|
+
import logging
|
2
|
+
import re
|
3
|
+
from typing import Iterable, List
|
4
|
+
|
5
|
+
from pydantic import BaseModel
|
6
|
+
|
7
|
+
from docling.datamodel.base_models import (
|
8
|
+
AssembledUnit,
|
9
|
+
FigureElement,
|
10
|
+
Page,
|
11
|
+
PageElement,
|
12
|
+
Table,
|
13
|
+
TextElement,
|
14
|
+
)
|
15
|
+
from docling.models.base_model import BasePageModel
|
16
|
+
from docling.models.layout_model import LayoutModel
|
17
|
+
|
18
|
+
_log = logging.getLogger(__name__)
|
19
|
+
|
20
|
+
|
21
|
+
class PageAssembleOptions(BaseModel):
|
22
|
+
keep_images: bool = False
|
23
|
+
|
24
|
+
|
25
|
+
class PageAssembleModel(BasePageModel):
|
26
|
+
def __init__(self, options: PageAssembleOptions):
|
27
|
+
self.options = options
|
28
|
+
|
29
|
+
def sanitize_text(self, lines):
|
30
|
+
if len(lines) <= 1:
|
31
|
+
return " ".join(lines)
|
32
|
+
|
33
|
+
for ix, line in enumerate(lines[1:]):
|
34
|
+
prev_line = lines[ix]
|
35
|
+
|
36
|
+
if prev_line.endswith("-"):
|
37
|
+
prev_words = re.findall(r"\b[\w]+\b", prev_line)
|
38
|
+
line_words = re.findall(r"\b[\w]+\b", line)
|
39
|
+
|
40
|
+
if (
|
41
|
+
len(prev_words)
|
42
|
+
and len(line_words)
|
43
|
+
and prev_words[-1].isalnum()
|
44
|
+
and line_words[0].isalnum()
|
45
|
+
):
|
46
|
+
lines[ix] = prev_line[:-1]
|
47
|
+
else:
|
48
|
+
lines[ix] += " "
|
49
|
+
|
50
|
+
sanitized_text = "".join(lines)
|
51
|
+
|
52
|
+
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
53
|
+
|
54
|
+
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
55
|
+
for page in page_batch:
|
56
|
+
assert page._backend is not None
|
57
|
+
if not page._backend.is_valid():
|
58
|
+
yield page
|
59
|
+
else:
|
60
|
+
assert page.predictions.layout is not None
|
61
|
+
|
62
|
+
# assembles some JSON output page by page.
|
63
|
+
|
64
|
+
elements: List[PageElement] = []
|
65
|
+
headers: List[PageElement] = []
|
66
|
+
body: List[PageElement] = []
|
67
|
+
|
68
|
+
for cluster in page.predictions.layout.clusters:
|
69
|
+
# _log.info("Cluster label seen:", cluster.label)
|
70
|
+
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
71
|
+
|
72
|
+
textlines = [
|
73
|
+
cell.text.replace("\x02", "-").strip()
|
74
|
+
for cell in cluster.cells
|
75
|
+
if len(cell.text.strip()) > 0
|
76
|
+
]
|
77
|
+
text = self.sanitize_text(textlines)
|
78
|
+
text_el = TextElement(
|
79
|
+
label=cluster.label,
|
80
|
+
id=cluster.id,
|
81
|
+
text=text,
|
82
|
+
page_no=page.page_no,
|
83
|
+
cluster=cluster,
|
84
|
+
)
|
85
|
+
elements.append(text_el)
|
86
|
+
|
87
|
+
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
88
|
+
headers.append(text_el)
|
89
|
+
else:
|
90
|
+
body.append(text_el)
|
91
|
+
elif cluster.label == LayoutModel.TABLE_LABEL:
|
92
|
+
tbl = None
|
93
|
+
if page.predictions.tablestructure:
|
94
|
+
tbl = page.predictions.tablestructure.table_map.get(
|
95
|
+
cluster.id, None
|
96
|
+
)
|
97
|
+
if (
|
98
|
+
not tbl
|
99
|
+
): # fallback: add table without structure, if it isn't present
|
100
|
+
tbl = Table(
|
101
|
+
label=cluster.label,
|
102
|
+
id=cluster.id,
|
103
|
+
text="",
|
104
|
+
otsl_seq=[],
|
105
|
+
table_cells=[],
|
106
|
+
cluster=cluster,
|
107
|
+
page_no=page.page_no,
|
108
|
+
)
|
109
|
+
|
110
|
+
elements.append(tbl)
|
111
|
+
body.append(tbl)
|
112
|
+
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
113
|
+
fig = None
|
114
|
+
if page.predictions.figures_classification:
|
115
|
+
fig = (
|
116
|
+
page.predictions.figures_classification.figure_map.get(
|
117
|
+
cluster.id, None
|
118
|
+
)
|
119
|
+
)
|
120
|
+
if (
|
121
|
+
not fig
|
122
|
+
): # fallback: add figure without classification, if it isn't present
|
123
|
+
fig = FigureElement(
|
124
|
+
label=cluster.label,
|
125
|
+
id=cluster.id,
|
126
|
+
text="",
|
127
|
+
data=None,
|
128
|
+
cluster=cluster,
|
129
|
+
page_no=page.page_no,
|
130
|
+
)
|
131
|
+
elements.append(fig)
|
132
|
+
body.append(fig)
|
133
|
+
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
134
|
+
equation = None
|
135
|
+
if page.predictions.equations_prediction:
|
136
|
+
equation = (
|
137
|
+
page.predictions.equations_prediction.equation_map.get(
|
138
|
+
cluster.id, None
|
139
|
+
)
|
140
|
+
)
|
141
|
+
if (
|
142
|
+
not equation
|
143
|
+
): # fallback: add empty formula, if it isn't present
|
144
|
+
text = self.sanitize_text(
|
145
|
+
[
|
146
|
+
cell.text.replace("\x02", "-").strip()
|
147
|
+
for cell in cluster.cells
|
148
|
+
if len(cell.text.strip()) > 0
|
149
|
+
]
|
150
|
+
)
|
151
|
+
equation = TextElement(
|
152
|
+
label=cluster.label,
|
153
|
+
id=cluster.id,
|
154
|
+
cluster=cluster,
|
155
|
+
page_no=page.page_no,
|
156
|
+
text=text,
|
157
|
+
)
|
158
|
+
elements.append(equation)
|
159
|
+
body.append(equation)
|
160
|
+
|
161
|
+
page.assembled = AssembledUnit(
|
162
|
+
elements=elements, headers=headers, body=body
|
163
|
+
)
|
164
|
+
|
165
|
+
# Remove page images (can be disabled)
|
166
|
+
if not self.options.keep_images:
|
167
|
+
page._image_cache = {}
|
168
|
+
|
169
|
+
# Unload backend
|
170
|
+
page._backend.unload()
|
171
|
+
|
172
|
+
yield page
|
@@ -17,9 +17,13 @@ class PagePreprocessingModel(BasePageModel):
|
|
17
17
|
|
18
18
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
19
19
|
for page in page_batch:
|
20
|
-
page
|
21
|
-
|
22
|
-
|
20
|
+
assert page._backend is not None
|
21
|
+
if not page._backend.is_valid():
|
22
|
+
yield page
|
23
|
+
else:
|
24
|
+
page = self._populate_page_images(page)
|
25
|
+
page = self._parse_page_cells(page)
|
26
|
+
yield page
|
23
27
|
|
24
28
|
# Generate the page image and store it in the page object
|
25
29
|
def _populate_page_images(self, page: Page) -> Page:
|