magic-pdf 0.10.5__py3-none-any.whl → 0.10.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,3 +51,8 @@ class MODEL_NAME:
51
51
  UniMerNet_v2_Small = 'unimernet_small'
52
52
 
53
53
  RAPID_TABLE = 'rapid_table'
54
+
55
+
56
+ PARSE_TYPE_TXT = 'txt'
57
+ PARSE_TYPE_OCR = 'ocr'
58
+
@@ -48,4 +48,16 @@ class DataWriter(ABC):
48
48
  path (str): the target file where to write
49
49
  data (str): the data want to write
50
50
  """
51
- self.write(path, data.encode())
51
+
52
+ def safe_encode(data: str, method: str):
53
+ try:
54
+ bit_data = data.encode(encoding=method, errors='replace')
55
+ return bit_data, True
56
+ except: # noqa
57
+ return None, False
58
+
59
+ for method in ['utf-8', 'ascii']:
60
+ bit_data, flag = safe_encode(data, method)
61
+ if flag:
62
+ self.write(path, bit_data)
63
+ break
magic_pdf/data/dataset.py CHANGED
@@ -1,11 +1,13 @@
1
+ import os
1
2
  from abc import ABC, abstractmethod
2
- from typing import Iterator
3
+ from typing import Callable, Iterator
3
4
 
4
5
  import fitz
5
6
 
6
7
  from magic_pdf.config.enums import SupportedPdfParseMethod
7
8
  from magic_pdf.data.schemas import PageInfo
8
9
  from magic_pdf.data.utils import fitz_doc_to_image
10
+ from magic_pdf.filter import classify
9
11
 
10
12
 
11
13
  class PageableData(ABC):
@@ -28,6 +30,32 @@ class PageableData(ABC):
28
30
  """
29
31
  pass
30
32
 
33
@abstractmethod
def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
    """Draw a rectangle on the page.

    Args:
        rect_coords (list[float]): top-left and bottom-right corners of the rectangle, [x0, y0, x1, y1]
        color (list[float] | None): RGB triple of the border line, None means no border line is drawn
        fill (list[float] | None): RGB triple used to fill the interior, None means the rectangle is not filled
        fill_opacity (float): opacity of the fill, in the range [0, 1]
        width (float): width of the border line
        overlay (bool): whether the rectangle is drawn in the foreground or the background.
            NOTE(review): the ``Doc`` implementation forwards this flag to PyMuPDF, where True
            means foreground (drawn on top of the existing content) - confirm that any other
            implementation follows the same convention.
    """
    pass
46
+
47
@abstractmethod
def insert_text(self, coord, content, fontsize, color):
    """Insert a piece of text on the page.

    Args:
        coord (list[float]): position where the text starts.
            NOTE(review): the ``Doc`` implementation forwards this value to PyMuPDF's
            ``Page.insert_text``, which expects a point (x, y) rather than a rectangle - confirm
            against the callers.
        content (str): the text content
        fontsize (int): font size of the text
        color (list[float] | None): RGB triple of the text color, None will use the default font color
    """
    pass
58
+
31
59
 
32
60
  class Dataset(ABC):
33
61
  @abstractmethod
@@ -66,6 +94,43 @@ class Dataset(ABC):
66
94
  """
67
95
  pass
68
96
 
97
@abstractmethod
def dump_to_file(self, file_path: str):
    """Write this dataset to a file.

    Concrete datasets decide what exactly gets written; this declaration
    only fixes the signature.

    Args:
        file_path (str): location of the file to write
    """
105
+
106
@abstractmethod
def apply(self, proc: Callable, *args, **kwargs):
    """Run a callable against this dataset.

    Args:
        proc (Callable): the callable to run; implementations are expected
            to invoke it as ``proc(self, *args, **kwargs)``
        *args: extra positional arguments handed to ``proc``
        **kwargs: extra keyword arguments handed to ``proc``

    Returns:
        Any: whatever ``proc`` returns
    """
118
+
119
@abstractmethod
def classify(self) -> SupportedPdfParseMethod:
    """Classify the dataset by the parse method it should go through.

    Returns:
        SupportedPdfParseMethod: the parse method chosen for this dataset
    """
127
+
128
@abstractmethod
def clone(self):
    """Create a copy of this dataset.

    The concrete datasets in this module rebuild a new instance from the
    raw bytes they were constructed with.
    """
133
+
69
134
 
70
135
  class PymuDocDataset(Dataset):
71
136
  def __init__(self, bits: bytes):
@@ -74,7 +139,8 @@ class PymuDocDataset(Dataset):
74
139
  Args:
75
140
  bits (bytes): the bytes of the pdf
76
141
  """
77
- self._records = [Doc(v) for v in fitz.open('pdf', bits)]
142
+ self._raw_fitz = fitz.open('pdf', bits)
143
+ self._records = [Doc(v) for v in self._raw_fitz]
78
144
  self._data_bits = bits
79
145
  self._raw_data = bits
80
146
 
@@ -109,6 +175,43 @@ class PymuDocDataset(Dataset):
109
175
  """
110
176
  return self._records[page_id]
111
177
 
178
def dump_to_file(self, file_path: str):
    """Save the underlying PyMuPDF document to ``file_path``.

    The parent directory is created first when the path names one; a bare
    file name, or a parent of '.' or '..', is used as is.

    Args:
        file_path (str): location of the file to write
    """
    parent_dir = os.path.dirname(file_path)
    must_create_parent = parent_dir not in ('', '.', '..')
    if must_create_parent:
        os.makedirs(parent_dir, exist_ok=True)

    document = self._raw_fitz
    document.save(file_path)
189
+
190
def apply(self, proc: Callable, *args, **kwargs):
    """Run a callable against this dataset and hand back its result.

    Args:
        proc (Callable): invoked as ``proc(dataset, *args, **kwargs)``,
            where ``dataset`` is this object
        *args: extra positional arguments handed to ``proc``
        **kwargs: extra keyword arguments handed to ``proc``

    Returns:
        Any: whatever ``proc`` returns
    """
    outcome = proc(self, *args, **kwargs)
    return outcome
201
+
202
def classify(self) -> SupportedPdfParseMethod:
    """Classify the dataset from its PDF bytes.

    The decision is delegated to the module-level ``classify`` imported
    from ``magic_pdf.filter``.

    Returns:
        SupportedPdfParseMethod: the parse method chosen for this PDF
    """
    pdf_bits = self._data_bits
    return classify(pdf_bits)
209
+
210
def clone(self):
    """Build a new ``PymuDocDataset`` from the raw bytes this one holds.

    Returns:
        PymuDocDataset: a fresh dataset constructed from ``self._raw_data``
    """
    raw_bits = self._raw_data
    return PymuDocDataset(raw_bits)
214
+
112
215
 
113
216
  class ImageDataset(Dataset):
114
217
  def __init__(self, bits: bytes):
@@ -118,7 +221,8 @@ class ImageDataset(Dataset):
118
221
  bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
119
222
  """
120
223
  pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
121
- self._records = [Doc(v) for v in fitz.open('pdf', pdf_bytes)]
224
+ self._raw_fitz = fitz.open('pdf', pdf_bytes)
225
+ self._records = [Doc(v) for v in self._raw_fitz]
122
226
  self._raw_data = bits
123
227
  self._data_bits = pdf_bytes
124
228
 
@@ -153,14 +257,50 @@ class ImageDataset(Dataset):
153
257
  """
154
258
  return self._records[page_id]
155
259
 
260
def dump_to_file(self, file_path: str):
    """Save the PDF document held by this dataset to ``file_path``.

    The parent directory is created first when the path names one; a bare
    file name, or a parent of '.' or '..', is used as is.

    Args:
        file_path (str): location of the file to write
    """
    parent_dir = os.path.dirname(file_path)
    must_create_parent = parent_dir not in ('', '.', '..')
    if must_create_parent:
        os.makedirs(parent_dir, exist_ok=True)

    document = self._raw_fitz
    document.save(file_path)
270
+
271
def apply(self, proc: Callable, *args, **kwargs):
    """Run a callable against this dataset and hand back its result.

    Args:
        proc (Callable): invoked as ``proc(dataset, *args, **kwargs)``,
            where ``dataset`` is this object
        *args: extra positional arguments handed to ``proc``
        **kwargs: extra keyword arguments handed to ``proc``

    Returns:
        Any: whatever ``proc`` returns
    """
    outcome = proc(self, *args, **kwargs)
    return outcome
282
+
283
def classify(self) -> SupportedPdfParseMethod:
    """Classify the dataset; an image dataset is always reported as OCR.

    Returns:
        SupportedPdfParseMethod: always ``SupportedPdfParseMethod.OCR``
    """
    parse_method = SupportedPdfParseMethod.OCR
    return parse_method
290
+
291
def clone(self):
    """Build a new ``ImageDataset`` from the raw bytes this one holds.

    Returns:
        ImageDataset: a fresh dataset constructed from ``self._raw_data``
    """
    raw_bits = self._raw_data
    return ImageDataset(raw_bits)
156
295
 
157
296
  class Doc(PageableData):
158
297
  """Initialized with pymudoc object."""
298
+
159
299
  def __init__(self, doc: fitz.Page):
160
300
  self._doc = doc
161
301
 
162
302
  def get_image(self):
163
- """Return the imge info.
303
+ """Return the image info.
164
304
 
165
305
  Returns:
166
306
  dict: {
@@ -192,3 +332,34 @@ class Doc(PageableData):
192
332
  def __getattr__(self, name):
193
333
  if hasattr(self._doc, name):
194
334
  return getattr(self._doc, name)
335
+
336
def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
    """Draw a rectangle on the wrapped page.

    Every argument is forwarded unchanged to ``draw_rect`` of the wrapped
    page object.

    Args:
        rect_coords (list[float]): top-left and bottom-right corners of the rectangle, [x0, y0, x1, y1]
        color (list[float] | None): RGB triple of the border line, None means no border line is drawn
        fill (list[float] | None): RGB triple used to fill the interior, None means the rectangle is not filled
        fill_opacity (float): opacity of the fill, in the range [0, 1]
        width (float): width of the border line
        overlay (bool): True draws in the foreground (on top of the existing page content), False
            draws in the background; this is the PyMuPDF meaning of the flag
    """
    self._doc.draw_rect(
        rect_coords,
        color=color,
        fill=fill,
        fill_opacity=fill_opacity,
        width=width,
        overlay=overlay,
    )
355
+
356
def insert_text(self, coord, content, fontsize, color):
    """Insert a piece of text on the wrapped page.

    Every argument is forwarded unchanged to ``insert_text`` of the wrapped
    page object.

    Args:
        coord (list[float]): position where the text starts. PyMuPDF's ``Page.insert_text`` takes
            a point (x, y) here, not a four-element rectangle.
        content (str): the text content
        fontsize (int): font size of the text
        color (list[float] | None): RGB triple of the text color, None will use the default font color
    """
    self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
@@ -165,8 +165,8 @@ def merge_para_with_text(para_block):
165
165
  if content:
166
166
  langs = ['zh', 'ja', 'ko']
167
167
  # logger.info(f'block_lang: {block_lang}, content: {content}')
168
- if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
169
- if j == len(line['spans']) - 1:
168
+ if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
169
+ if j == len(line['spans']) - 1 and span_type not in [ContentType.InlineEquation]:
170
170
  para_text += content
171
171
  else:
172
172
  para_text += f'{content} '
@@ -0,0 +1,32 @@
1
+
2
+ from magic_pdf.config.drop_reason import DropReason
3
+ from magic_pdf.config.enums import SupportedPdfParseMethod
4
+ from magic_pdf.filter.pdf_classify_by_type import classify as do_classify
5
+ from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
6
+
7
+
8
def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
    """Decide from the PDF metadata whether this is a text PDF or an OCR PDF.

    Args:
        pdf_bytes (bytes): the raw bytes of the PDF

    Returns:
        SupportedPdfParseMethod: ``TXT`` when the classifier reports a text
            PDF, ``OCR`` otherwise

    Raises:
        Exception: if the meta scan flags the PDF as one to drop, or the
            PDF is encrypted or needs a password
    """
    meta = pdf_meta_scan(pdf_bytes)

    # The scan itself asked for this document to be dropped.
    if meta.get('_need_drop', False):
        raise Exception(f"pdf meta_scan need_drop,reason is {meta['_drop_reason']}")

    # Encrypted or password-protected documents are not processed.
    encrypted = meta['is_encrypted']
    needs_password = meta['is_needs_password']
    if encrypted or needs_password:
        raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')

    # The second element of the classifier result is not used here.
    is_text_pdf, _details = do_classify(
        meta['total_page'],
        meta['page_width_pts'],
        meta['page_height_pts'],
        meta['image_info_per_page'],
        meta['text_len_per_page'],
        meta['imgs_per_page'],
        meta['text_layout_per_page'],
        meta['invalid_chars'],
    )
    return SupportedPdfParseMethod.TXT if is_text_pdf else SupportedPdfParseMethod.OCR
@@ -8,7 +8,7 @@ from loguru import logger
8
8
  from magic_pdf.config.drop_reason import DropReason
9
9
  from magic_pdf.libs.commons import get_top_percent_list, mymax
10
10
  from magic_pdf.libs.language import detect_lang
11
- from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
11
+ from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf, detect_invalid_chars
12
12
 
13
13
  scan_max_page = 50
14
14
  junk_limit_min = 10
@@ -323,7 +323,8 @@ def get_language(doc: fitz.Document):
323
323
 
324
324
  def check_invalid_chars(pdf_bytes):
325
325
  """乱码检测."""
326
- return detect_invalid_chars_by_pymupdf(pdf_bytes)
326
+ # return detect_invalid_chars_by_pymupdf(pdf_bytes)
327
+ return detect_invalid_chars(pdf_bytes)
327
328
 
328
329
 
329
330
  def pdf_meta_scan(pdf_bytes: bytes):
@@ -1,7 +1,8 @@
1
1
  import fitz
2
2
  from magic_pdf.config.constants import CROSS_PAGE
3
- from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
4
- from magic_pdf.data.dataset import PymuDocDataset
3
+ from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
4
+ ContentType)
5
+ from magic_pdf.data.dataset import Dataset
5
6
  from magic_pdf.model.magic_model import MagicModel
6
7
 
7
8
 
@@ -194,7 +195,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
194
195
  )
195
196
 
196
197
  # Save the PDF
197
- pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
198
+ pdf_docs.save(f'{out_path}/{filename}')
198
199
 
199
200
 
200
201
  def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
@@ -282,18 +283,17 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
282
283
  draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
283
284
 
284
285
  # Save the PDF
285
- pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
286
+ pdf_docs.save(f'{out_path}/{filename}')
286
287
 
287
288
 
288
- def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
289
+ def draw_model_bbox(model_list, dataset: Dataset, out_path, filename):
289
290
  dropped_bbox_list = []
290
291
  tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
291
292
  imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
292
293
  titles_list = []
293
294
  texts_list = []
294
295
  interequations_list = []
295
- pdf_docs = fitz.open('pdf', pdf_bytes)
296
- magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
296
+ magic_model = MagicModel(model_list, dataset)
297
297
  for i in range(len(model_list)):
298
298
  page_dropped_list = []
299
299
  tables_body, tables_caption, tables_footnote = [], [], []
@@ -337,7 +337,8 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
337
337
  dropped_bbox_list.append(page_dropped_list)
338
338
  imgs_footnote_list.append(imgs_footnote)
339
339
 
340
- for i, page in enumerate(pdf_docs):
340
+ for i in range(len(dataset)):
341
+ page = dataset.get_page(i)
341
342
  draw_bbox_with_number(
342
343
  i, dropped_bbox_list, page, [158, 158, 158], True
343
344
  ) # color !
@@ -352,7 +353,7 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
352
353
  draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
353
354
 
354
355
  # Save the PDF
355
- pdf_docs.save(f'{out_path}/{filename}_model.pdf')
356
+ dataset.dump_to_file(f'{out_path}/{filename}')
356
357
 
357
358
 
358
359
  def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
@@ -390,7 +391,7 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
390
391
  for i, page in enumerate(pdf_docs):
391
392
  draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
392
393
 
393
- pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
394
+ pdf_docs.save(f'{out_path}/{filename}')
394
395
 
395
396
 
396
397
  def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
@@ -1,9 +1,9 @@
1
1
  import fitz
2
2
  import numpy as np
3
3
  from loguru import logger
4
- # import re
5
- # from io import BytesIO
6
- # from pdfminer.high_level import extract_text
4
+ import re
5
+ from io import BytesIO
6
+ from pdfminer.high_level import extract_text
7
7
 
8
8
 
9
9
  def calculate_sample_count(total_page: int):
@@ -33,33 +33,33 @@ def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
33
33
  return sample_docs
34
34
 
35
35
 
36
- # def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
37
- # """"
38
- # 检测PDF中是否包含非法字符
39
- # """
40
- # '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
41
- # sample_docs = extract_pages(src_pdf_bytes)
42
- # sample_pdf_bytes = sample_docs.tobytes()
43
- # sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
44
- # text = extract_text(sample_pdf_file_like_object)
45
- # text = text.replace("\n", "")
46
- # # logger.info(text)
47
- # '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
48
- # cid_pattern = re.compile(r'\(cid:\d+\)')
49
- # matches = cid_pattern.findall(text)
50
- # cid_count = len(matches)
51
- # cid_len = sum(len(match) for match in matches)
52
- # text_len = len(text)
53
- # if text_len == 0:
54
- # cid_chars_radio = 0
55
- # else:
56
- # cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
57
- # logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
58
- # '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
59
- # if cid_chars_radio > 0.05:
60
- # return False # 乱码文档
61
- # else:
62
- # return True # 正常文档
36
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
    """Check whether the text of a PDF is mostly free of garbled characters.

    Text is extracted with pdfminer from the pages returned by
    ``extract_pages`` and the ``(cid:N)`` placeholders in it are counted.

    Args:
        src_pdf_bytes (bytes): the raw bytes of the PDF

    Returns:
        bool: False when more than 5% of the characters are ``(cid:N)``
            placeholders (garbled document), True otherwise (normal document)
    """
    # pdfminer is slow, so only a sample of pages is parsed (the original
    # note says roughly ten randomly chosen pages).
    sampled_doc = extract_pages(src_pdf_bytes)
    pdf_stream = BytesIO(sampled_doc.tobytes())
    text = extract_text(pdf_stream).replace("\n", "")

    # Garbled text comes out of pdfminer as "(cid:xxx)".
    placeholders = re.findall(r'\(cid:\d+\)', text)
    cid_count = len(placeholders)
    cid_len = sum(map(len, placeholders))
    text_len = len(text)

    # Each placeholder counts as one character: the denominator is the
    # number of placeholders plus the characters outside any placeholder.
    cid_chars_radio = cid_count / (cid_count + text_len - cid_len) if text_len != 0 else 0
    logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")

    # A document with more than 5% garbled text is treated as garbled.
    is_garbled = cid_chars_radio > 0.05
    return not is_garbled
63
63
 
64
64
 
65
65
  def count_replacement_characters(text: str) -> int:
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.10.5"
1
+ __version__ = "0.10.6"
@@ -1,2 +1,126 @@
1
+ from typing import Callable
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ from magic_pdf.data.data_reader_writer import DataWriter
6
+ from magic_pdf.data.dataset import Dataset
7
+ from magic_pdf.pipe.operators import PipeResult
8
+
9
+
1
10
  __use_inside_model__ = True
2
11
  __model_mode__ = "full"
12
+
13
+
14
class InferenceResultBase(ABC):
    """Abstract interface of a model inference result bound to its dataset."""

    @abstractmethod
    def __init__(self, inference_results: list, dataset: Dataset):
        """Store the inference result together with its dataset.

        Args:
            inference_results (list): the inference result generated by model
            dataset (Dataset): the dataset the inference result belongs to
        """
        self._infer_res = inference_results
        self._dataset = dataset

    @abstractmethod
    def draw_model(self, file_path: str) -> None:
        """Draw the model inference result.

        Args:
            file_path (str): path of the output file
        """

    @abstractmethod
    def dump_model(self, writer: DataWriter, file_path: str):
        """Dump the model inference result to a file.

        Args:
            writer (DataWriter): the writer handle
            file_path (str): location of the target file
        """

    @abstractmethod
    def get_infer_res(self):
        """Return the inference result.

        Returns:
            list: the inference result generated by model
        """

    @abstractmethod
    def apply(self, proc: Callable, *args, **kwargs):
        """Run a callable against this inference result.

        Args:
            proc (Callable): implementations are expected to invoke it as
                ``proc(inference_result, *args, **kwargs)``
            *args: extra positional arguments handed to ``proc``
            **kwargs: extra keyword arguments handed to ``proc``

        Returns:
            Any: whatever ``proc`` returns
        """

    @abstractmethod
    def pipe_auto_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Post-process the model inference result, choosing the mode automatically.

        Step 1: classify the dataset type.
        Step 2: based on the result of step 1, use `pipe_txt_mode` or `pipe_ocr_mode`.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. First page the user wants to process
            end_page_id (int, optional): Defaults to the last page index of the dataset. Last page the user wants to process
            debug_mode (bool, optional): Defaults to False. More logs are dumped when enabled
            lang (str, optional): Defaults to None.

        Returns:
            PipeResult: the result
        """

    @abstractmethod
    def pipe_txt_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Post-process the model inference result, extracting the text with a
        third-party library such as `pymupdf`.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. First page the user wants to process
            end_page_id (int, optional): Defaults to the last page index of the dataset. Last page the user wants to process
            debug_mode (bool, optional): Defaults to False. More logs are dumped when enabled
            lang (str, optional): Defaults to None.

        Returns:
            PipeResult: the result
        """

    @abstractmethod
    def pipe_ocr_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Post-process the model inference result in OCR mode.

        NOTE(review): presumably the OCR counterpart of `pipe_txt_mode`; the
        parameters mirror that method - confirm against the implementation.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0.
            end_page_id (int, optional): Defaults to None.
            debug_mode (bool, optional): Defaults to False.
            lang (str, optional): Defaults to None.

        Returns:
            PipeResult: the result
        """