docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import copy
2
+ from pathlib import Path
2
3
  from typing import Iterable, List
3
4
 
4
5
  import numpy
@@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
12
13
  TableElement,
13
14
  TableStructurePrediction,
14
15
  )
16
+ from docling.datamodel.pipeline_options import TableFormerMode
15
17
 
16
18
 
17
19
  class TableStructureModel:
18
20
  def __init__(self, config):
19
21
  self.config = config
20
22
  self.do_cell_matching = config["do_cell_matching"]
23
+ self.mode = config["mode"]
21
24
 
22
25
  self.enabled = config["enabled"]
23
26
  if self.enabled:
24
- artifacts_path = config["artifacts_path"]
27
+ artifacts_path: Path = config["artifacts_path"]
28
+
29
+ if self.mode == TableFormerMode.ACCURATE:
30
+ artifacts_path = artifacts_path / "fat"
31
+
25
32
  # Third Party
26
33
  import docling_ibm_models.tableformer.common as c
27
34
 
@@ -44,7 +51,16 @@ class TableStructureModel:
44
51
 
45
52
  for tc in table_element.table_cells:
46
53
  x0, y0, x1, y1 = tc.bbox.as_tuple()
47
- draw.rectangle([(x0, y0), (x1, y1)], outline="blue")
54
+ if tc.column_header:
55
+ width = 3
56
+ else:
57
+ width = 1
58
+ draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
59
+ draw.text(
60
+ (x0 + 3, y0 + 3),
61
+ text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
62
+ fill="black",
63
+ )
48
64
 
49
65
  image.show()
50
66
 
@@ -0,0 +1,167 @@
1
+ import io
2
+ import logging
3
+ import tempfile
4
+ from subprocess import PIPE, Popen
5
+ from typing import Iterable, Tuple
6
+
7
+ import pandas as pd
8
+
9
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
10
+ from docling.datamodel.pipeline_options import TesseractCliOcrOptions
11
+ from docling.models.base_ocr_model import BaseOcrModel
12
+
13
+ _log = logging.getLogger(__name__)
14
+
15
+
16
class TesseractOcrCliModel(BaseOcrModel):
    """OCR model that shells out to the Tesseract command-line binary.

    Every non-empty OCR rectangle of a page is rendered to a temporary PNG,
    handed to ``<tesseract_cmd> <file> stdout tsv`` and the TSV rows that
    carry text are converted into ``OcrCell`` objects in page coordinates.
    """

    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
        """Store the options and, when enabled, verify that Tesseract runs.

        Raises:
            RuntimeError: if the model is enabled and querying the Tesseract
                version fails for any reason (e.g. binary not found).
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractCliOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        # Cached result of `tesseract --version`, filled on first query.
        self._name = None
        self._version = None

        if self.enabled:
            try:
                self._get_name_and_version()

            except Exception as exc:
                # Chain the original error so the root cause stays visible
                # in the traceback.
                raise RuntimeError(
                    f"Tesseract is not available, aborting: {exc} "
                    "Install tesseract on your system and the tesseract binary is discoverable. "
                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                ) from exc

    def _get_name_and_version(self) -> Tuple[str, str]:
        """Return ``(name, version)`` parsed from ``<tesseract_cmd> --version``.

        The result is cached on the instance after the first successful call.
        """
        if self._name is not None and self._version is not None:
            return self._name, self._version

        cmd = [self.options.tesseract_cmd, "--version"]

        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        # communicate() already waits for the process to terminate.
        stdout, stderr = proc.communicate()

        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
        # to stderr, so check both.
        version_line = (
            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
            .split("\n")[0]
            .strip()
        )

        # If everything else fails...
        if not version_line:
            version_line = "tesseract XXX"

        # Split on the first whitespace run only, so a banner with extra
        # tokens (or a single token) no longer breaks the tuple unpacking.
        parts = version_line.split(maxsplit=1)
        name = parts[0]
        version = parts[1] if len(parts) > 1 else "XXX"

        self._name = name
        self._version = version

        return name, version

    def _run_tesseract(self, ifilename: str) -> pd.DataFrame:
        """Run Tesseract on an image file and return the rows that carry text.

        Args:
            ifilename: path of the image to recognise.

        Returns:
            The Tesseract TSV output as a DataFrame, restricted to rows whose
            ``text`` column is non-null and not blank.
        """
        import csv  # local import: only needed for QUOTE_NONE below

        cmd = [self.options.tesseract_cmd]

        if self.options.lang:
            cmd += ["-l", "+".join(self.options.lang)]
        if self.options.path is not None:
            cmd += ["--tessdata-dir", self.options.path]

        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: %s", " ".join(cmd))

        proc = Popen(cmd, stdout=PIPE)
        output, _ = proc.communicate()

        # Decode the byte string to a regular string
        decoded_data = output.decode("utf-8")

        # Read the TSV emitted by Tesseract.
        # - QUOTE_NONE: recognised text may contain a bare `"`, which must not
        #   be interpreted as a CSV quote character.
        # - dtype str for "text": otherwise a page consisting only of numbers
        #   gets a numeric column and the `.str` accessor below raises.
        df = pd.read_csv(
            io.StringIO(decoded_data),
            sep="\t",
            quoting=csv.QUOTE_NONE,
            dtype={"text": str},
        )

        # Filter rows that contain actual text (ignore header or empty rows)
        has_text = df["text"].notnull() & (df["text"].str.strip() != "")

        return df[has_text]

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR on every page of the batch and yield the pages.

        When the model is disabled the pages are passed through untouched.
        Otherwise the OCR cells that survive ``filter_ocr_cells`` are appended
        to ``page.cells`` before the page is yielded.
        """
        import os  # local import: only needed to build the temp file path

        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )

                # Write the crop into a private temporary directory instead of
                # an open NamedTemporaryFile: on Windows a file that is still
                # open cannot be opened a second time by name, so neither the
                # image writer nor Tesseract could access it.
                with tempfile.TemporaryDirectory() as tmp_dir:
                    fname = os.path.join(tmp_dir, "ocr_rect.png")
                    high_res_image.save(fname)

                    df = self._run_tesseract(fname)

                for ix, row in df.iterrows():
                    text = row["text"]
                    conf = row["conf"]

                    # Tesseract reports boxes in pixels of the scaled crop,
                    # as left/top/width/height.
                    left = float(row["left"])
                    top = float(row["top"])
                    right = left + float(row["width"])
                    bottom = top + float(row["height"])

                    # Undo the render scale and shift by the crop's position
                    # to get page coordinates.
                    cell = OcrCell(
                        id=ix,
                        text=text,
                        confidence=conf / 100.0,
                        bbox=BoundingBox.from_tuple(
                            coord=(
                                (left / self.scale) + ocr_rect.l,
                                (top / self.scale) + ocr_rect.t,
                                (right / self.scale) + ocr_rect.l,
                                (bottom / self.scale) + ocr_rect.t,
                            ),
                            origin=CoordOrigin.TOPLEFT,
                        ),
                    )
                    all_ocr_cells.append(cell)

            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page
@@ -0,0 +1,122 @@
1
+ import logging
2
+ from typing import Iterable
3
+
4
+ import numpy
5
+
6
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
+ from docling.datamodel.pipeline_options import TesseractCliOcrOptions
8
+ from docling.models.base_ocr_model import BaseOcrModel
9
+
10
+ _log = logging.getLogger(__name__)
11
+
12
+
13
class TesseractOcrModel(BaseOcrModel):
    """OCR model backed by the ``tesserocr`` Python bindings.

    Each non-empty OCR rectangle of a page is rendered at ``self.scale`` and
    split into text lines; every line becomes an ``OcrCell`` in page
    coordinates.
    """

    # NOTE(review): the pipeline constructs this model when the options are a
    # TesseractOcrOptions instance, yet the annotation below names
    # TesseractCliOcrOptions (the only options class this module imports).
    # Left unchanged here; confirm and switch the import/annotation together.
    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
        """Store the options and, when enabled, initialise the Tesseract API.

        Raises:
            ImportError: if the model is enabled and ``tesserocr`` cannot be
                imported or cannot report its Tesseract version.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractCliOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        self.reader = None
        self.reader_RIL = None

        if self.enabled:
            setup_errmsg = (
                "tesserocr is not correctly installed. "
                "Please install it via `pip install tesserocr` to use this OCR engine. "
                "Note that tesserocr might have to be manually compiled for working with "
                "your Tesseract installation. The Docling documentation provides examples for it. "
                "Alternatively, Docling has support for other OCR engines. See the documentation."
            )
            try:
                import tesserocr
            except ImportError as exc:
                raise ImportError(setup_errmsg) from exc

            try:
                tesseract_version = tesserocr.tesseract_version()
                _log.debug("Initializing TesserOCR: %s", tesseract_version)
            except Exception as exc:
                # `Exception` rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not turned into an ImportError.
                raise ImportError(setup_errmsg) from exc

            # Initialize the tesseractAPI; `path` is only passed when set.
            api_kwargs = {
                "lang": "+".join(self.options.lang),
                "psm": tesserocr.PSM.AUTO,
                "init": True,
                "oem": tesserocr.OEM.DEFAULT,
            }
            if self.options.path is not None:
                api_kwargs["path"] = self.options.path

            self.reader = tesserocr.PyTessBaseAPI(**api_kwargs)
            self.reader_RIL = tesserocr.RIL

    def __del__(self):
        # `reader` may be missing when __init__ failed before assigning it.
        reader = getattr(self, "reader", None)
        if reader is not None:
            # Finalize the tesseractAPI
            reader.End()

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR on every page of the batch and yield the pages.

        When the model is disabled the pages are passed through untouched.
        Otherwise the OCR cells that survive ``filter_ocr_cells`` are appended
        to ``page.cells`` before the page is yielded.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )

                # Retrieve text snippets with their bounding boxes
                self.reader.SetImage(high_res_image)
                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)

                cells = []
                for ix, (_im, box, _, _) in enumerate(boxes):
                    # Restrict recognition to this text line. The box is in
                    # pixels of the scaled crop (x, y, w, h).
                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])

                    # Extract text within the bounding box
                    text = self.reader.GetUTF8Text().strip()
                    # MeanTextConf is on a 0-100 scale; normalise to 0-1 so it
                    # matches the confidence produced by the CLI-based model.
                    confidence = self.reader.MeanTextConf() / 100.0

                    # Undo the render scale and shift by the crop's position
                    # to get page coordinates, as the CLI-based model does.
                    # With a top-left origin, `y` is the top edge and
                    # `y + h` the bottom edge.
                    left = box["x"] / self.scale + ocr_rect.l
                    top = box["y"] / self.scale + ocr_rect.t
                    right = (box["x"] + box["w"]) / self.scale + ocr_rect.l
                    bottom = (box["y"] + box["h"]) / self.scale + ocr_rect.t

                    cells.append(
                        OcrCell(
                            id=ix,
                            text=text,
                            confidence=confidence,
                            bbox=BoundingBox.from_tuple(
                                coord=(left, top, right, bottom),
                                origin=CoordOrigin.TOPLEFT,
                            ),
                        )
                    )

                # del high_res_image
                all_ocr_cells.extend(cells)

            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page
@@ -1,12 +1,13 @@
1
1
  from pathlib import Path
2
- from typing import Iterable
2
+ from typing import Callable, Iterable, List
3
3
 
4
- from docling.datamodel.base_models import Page, PipelineOptions
4
+ from docling.datamodel.base_models import Page
5
+ from docling.datamodel.pipeline_options import PipelineOptions
5
6
 
6
7
 
7
8
  class BaseModelPipeline:
8
9
  def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
9
- self.model_pipe = []
10
+ self.model_pipe: List[Callable] = []
10
11
  self.artifacts_path = artifacts_path
11
12
  self.pipeline_options = pipeline_options
12
13
 
@@ -1,37 +1,65 @@
1
1
  from pathlib import Path
2
2
 
3
- from docling.datamodel.base_models import PipelineOptions
3
+ from docling.datamodel.pipeline_options import (
4
+ EasyOcrOptions,
5
+ PipelineOptions,
6
+ TesseractCliOcrOptions,
7
+ TesseractOcrOptions,
8
+ )
9
+ from docling.models.base_ocr_model import BaseOcrModel
4
10
  from docling.models.easyocr_model import EasyOcrModel
5
11
  from docling.models.layout_model import LayoutModel
6
12
  from docling.models.table_structure_model import TableStructureModel
13
+ from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
14
+ from docling.models.tesseract_ocr_model import TesseractOcrModel
7
15
  from docling.pipeline.base_model_pipeline import BaseModelPipeline
8
16
 
9
17
 
10
18
  class StandardModelPipeline(BaseModelPipeline):
11
- _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
19
+ _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
12
20
  _table_model_path = "model_artifacts/tableformer"
13
21
 
14
22
  def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
15
23
  super().__init__(artifacts_path, pipeline_options)
16
24
 
25
+ ocr_model: BaseOcrModel
26
+ if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
27
+ ocr_model = EasyOcrModel(
28
+ enabled=pipeline_options.do_ocr,
29
+ options=pipeline_options.ocr_options,
30
+ )
31
+ elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
32
+ ocr_model = TesseractOcrCliModel(
33
+ enabled=pipeline_options.do_ocr,
34
+ options=pipeline_options.ocr_options,
35
+ )
36
+ elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
37
+ ocr_model = TesseractOcrModel(
38
+ enabled=pipeline_options.do_ocr,
39
+ options=pipeline_options.ocr_options,
40
+ )
41
+ else:
42
+ raise RuntimeError(
43
+ f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
44
+ )
45
+
17
46
  self.model_pipe = [
18
- EasyOcrModel(
19
- config={
20
- "lang": ["fr", "de", "es", "en"],
21
- "enabled": pipeline_options.do_ocr,
22
- }
23
- ),
47
+ # OCR
48
+ ocr_model,
49
+ # Layout
24
50
  LayoutModel(
25
51
  config={
26
52
  "artifacts_path": artifacts_path
27
53
  / StandardModelPipeline._layout_model_path
28
54
  }
29
55
  ),
56
+ # Table structure
30
57
  TableStructureModel(
31
58
  config={
32
59
  "artifacts_path": artifacts_path
33
60
  / StandardModelPipeline._table_model_path,
34
61
  "enabled": pipeline_options.do_table_structure,
62
+ "mode": pipeline_options.table_structure_options.mode,
35
63
  "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
36
64
  }
37
65
  ),
@@ -0,0 +1,145 @@
1
+ import logging
2
+ from typing import Any, Dict, Iterable, List, Tuple, Union
3
+
4
+ from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
5
+
6
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
7
+ from docling.datamodel.document import ConversionResult, Page
8
+
9
+ _log = logging.getLogger(__name__)
10
+
11
+
12
def generate_multimodal_pages(
    doc_result: ConversionResult,
) -> Iterable[
    Tuple[str, str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]
]:
    """Yield one multimodal record per page of a converted document.

    The items of ``doc_result.output.main_text`` are walked in order and
    grouped by the page of their first provenance entry. Whenever an item on
    a later page is encountered, the record of the page collected so far is
    yielded; the last page is yielded after the loop.

    Yields:
        A 6-tuple ``(content_text, content_md, content_dt, page_cells,
        page_segments, page)``:

        - ``content_text``: the non-empty item texts, each followed by a space.
        - ``content_md``: markdown export of the page's main-text range.
        - ``content_dt``: document-token export of the same range
          (presumably a string, like the markdown export).
        - ``page_cells``: one dict per page cell with text, normalized
          top-left bbox and OCR information.
        - ``page_segments``: one dict per item whose type has a label
          mapping, with normalized top-left bbox.
        - ``page``: the ``Page`` object at index ``page_no - 1``.

        Nothing is yielded when ``main_text`` is ``None`` or no item has
        provenance information.
    """
    label_to_doclaynet = {
        "title": "title",
        "table-of-contents": "document_index",
        "subtitle-level-1": "section_header",
        "checkbox-selected": "checkbox_selected",
        "checkbox-unselected": "checkbox_unselected",
        "caption": "caption",
        "page-header": "page_header",
        "page-footer": "page_footer",
        "footnote": "footnote",
        "table": "table",
        "formula": "formula",
        "list-item": "list_item",
        "code": "code",
        "figure": "picture",
        "picture": "picture",
        "reference": "text",
        "paragraph": "text",
        "text": "text",
    }

    # Accumulators for the page currently being collected. The nested helper
    # `_process_page` reads them through the closure at the time it is called.
    content_text = ""
    page_no = 0
    start_ix = 0
    end_ix = 0
    doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []

    doc = doc_result.output

    def _process_page_segments(doc_items: List[Tuple[int, BaseCell]], page: Page):
        """Build the segment dicts for the items collected for one page."""
        segments = []

        for ix, item in doc_items:
            label = label_to_doclaynet.get(item.obj_type, None)

            # Items without a label mapping or provenance cannot be placed.
            if label is None or item.prov is None or page.size is None:
                continue

            bbox = BoundingBox.from_tuple(
                tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
            )
            new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
                page_size=page.size
            )

            new_segment = {
                "index_in_doc": ix,
                "label": label,
                "text": item.text if item.text is not None else "",
                "bbox": new_bbox.as_tuple(),
                "data": [],
            }

            if isinstance(item, Table):
                new_segment["data"].append(
                    {
                        "html_seq": item.export_to_html(),
                        "otsl_seq": "",
                    }
                )

            segments.append(new_segment)

        return segments

    def _process_page_cells(page: Page):
        """Build the cell dicts for one page (empty when the size is unknown)."""
        cells: List[dict] = []
        if page.size is None:
            return cells
        for cell in page.cells:
            new_bbox = cell.bbox.to_top_left_origin(
                page_height=page.size.height
            ).normalized(page_size=page.size)
            is_ocr = isinstance(cell, OcrCell)
            # Non-OCR cells are reported with full confidence.
            ocr_confidence = cell.confidence if is_ocr else 1.0
            cells.append(
                {
                    "text": cell.text,
                    "bbox": new_bbox.as_tuple(),
                    "ocr": is_ocr,
                    "ocr_confidence": ocr_confidence,
                }
            )
        return cells

    def _process_page():
        """Assemble the record for the page described by the accumulators."""
        page = doc_result.pages[page_no - 1]

        page_cells = _process_page_cells(page=page)
        page_segments = _process_page_segments(doc_items=doc_items, page=page)
        content_md = doc.export_to_markdown(
            main_text_start=start_ix, main_text_stop=end_ix
        )
        # No page-tagging since we only do 1 page at the time
        content_dt = doc.export_to_document_tokens(
            main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
        )

        return content_text, content_md, content_dt, page_cells, page_segments, page

    if doc.main_text is None:
        return

    for ix, orig_item in enumerate(doc.main_text):
        item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
        if item is None or item.prov is None or len(item.prov) == 0:
            _log.debug("Skipping item %s", orig_item)
            continue

        item_page = item.prov[0].page

        # Page is complete
        if page_no > 0 and item_page > page_no:
            yield _process_page()

            # Start collecting the next page.
            start_ix = ix
            doc_items = []
            content_text = ""

        page_no = item_page
        end_ix = ix
        doc_items.append((ix, item))
        if item.text is not None and item.text != "":
            content_text += item.text + " "

    # Flush the last page.
    if len(doc_items) > 0:
        yield _process_page()
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) [year] [fullname]
3
+ Copyright (c) 2024 International Business Machines
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal