magic-pdf 0.10.6__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. magic_pdf/config/constants.py +2 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  4. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  5. magic_pdf/data/dataset.py +13 -1
  6. magic_pdf/data/read_api.py +59 -12
  7. magic_pdf/data/utils.py +35 -0
  8. magic_pdf/dict2md/ocr_mkcontent.py +14 -13
  9. magic_pdf/libs/clean_memory.py +11 -4
  10. magic_pdf/libs/config_reader.py +9 -0
  11. magic_pdf/libs/draw_bbox.py +8 -12
  12. magic_pdf/libs/language.py +3 -0
  13. magic_pdf/libs/version.py +1 -1
  14. magic_pdf/model/__init__.py +1 -125
  15. magic_pdf/model/batch_analyze.py +275 -0
  16. magic_pdf/model/doc_analyze_by_custom_model.py +4 -51
  17. magic_pdf/model/magic_model.py +4 -435
  18. magic_pdf/model/model_list.py +1 -0
  19. magic_pdf/model/pdf_extract_kit.py +33 -22
  20. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  21. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  22. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  23. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  24. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  25. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  26. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  27. magic_pdf/model/sub_modules/model_init.py +30 -4
  28. magic_pdf/model/sub_modules/model_utils.py +8 -2
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  31. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  32. magic_pdf/operators/__init__.py +94 -0
  33. magic_pdf/{model/operators.py → operators/models.py} +2 -38
  34. magic_pdf/{pipe/operators.py → operators/pipes.py} +70 -17
  35. magic_pdf/pdf_parse_union_core_v2.py +71 -17
  36. magic_pdf/post_proc/__init__.py +1 -0
  37. magic_pdf/post_proc/llm_aided.py +133 -0
  38. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  39. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  40. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  41. magic_pdf/tools/cli.py +36 -11
  42. magic_pdf/tools/common.py +28 -18
  43. magic_pdf/utils/office_to_pdf.py +29 -0
  44. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/METADATA +73 -23
  45. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/RECORD +50 -53
  46. magic_pdf/para/__init__.py +0 -0
  47. magic_pdf/pdf_parse_by_ocr.py +0 -22
  48. magic_pdf/pdf_parse_by_txt.py +0 -23
  49. magic_pdf/pipe/AbsPipe.py +0 -99
  50. magic_pdf/pipe/OCRPipe.py +0 -80
  51. magic_pdf/pipe/TXTPipe.py +0 -42
  52. magic_pdf/pipe/UNIPipe.py +0 -150
  53. magic_pdf/pipe/__init__.py +0 -0
  54. magic_pdf/rw/AbsReaderWriter.py +0 -17
  55. magic_pdf/rw/DiskReaderWriter.py +0 -74
  56. magic_pdf/rw/S3ReaderWriter.py +0 -142
  57. magic_pdf/rw/__init__.py +0 -0
  58. magic_pdf/user_api.py +0 -144
  59. /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  60. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/LICENSE.md +0 -0
  61. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/WHEEL +0 -0
  62. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/entry_points.txt +0 -0
  63. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/top_level.txt +0 -0
magic_pdf/config/constants.py CHANGED
@@ -52,6 +52,8 @@ class MODEL_NAME:
 
     RAPID_TABLE = 'rapid_table'
 
+    YOLO_V11_LangDetect = 'yolo_v11n_langdetect'
+
 
 PARSE_TYPE_TXT = 'txt'
 PARSE_TYPE_OCR = 'ocr'
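
A quick check of the new constant; the import path simply mirrors the diffed file (magic_pdf/config/constants.py):

    from magic_pdf.config.constants import MODEL_NAME

    print(MODEL_NAME.YOLO_V11_LangDetect)  # -> yolo_v11n_langdetect
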
magic_pdf/config/exceptions.py CHANGED
@@ -30,3 +30,10 @@ class EmptyData(Exception):
 
     def __str__(self):
         return f'Empty data: {self.msg}'
+
+class CUDA_NOT_AVAILABLE(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __str__(self):
+        return f'CUDA not available: {self.msg}'
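
The new exception gives callers a typed error to catch when GPU-only code paths run on a CPU host. A minimal sketch of how it might be used; the require_cuda helper is hypothetical, only the exception class comes from the package:

    import torch

    from magic_pdf.config.exceptions import CUDA_NOT_AVAILABLE

    def require_cuda():
        # hypothetical guard, not part of magic-pdf
        if not torch.cuda.is_available():
            raise CUDA_NOT_AVAILABLE('a CUDA-capable GPU is required for this model')

    try:
        require_cuda()
    except CUDA_NOT_AVAILABLE as e:
        print(e)  # CUDA not available: a CUDA-capable GPU is required for this model
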
magic_pdf/data/data_reader_writer/filebase.py CHANGED
@@ -55,7 +55,7 @@ class FileBasedDataWriter(DataWriter):
         if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
             fn_path = os.path.join(self._parent_dir, path)
 
-        if not os.path.exists(os.path.dirname(fn_path)):
+        if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "":
             os.makedirs(os.path.dirname(fn_path), exist_ok=True)
 
         with open(fn_path, 'wb') as f:
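
The extra dirname check matters when a caller writes to a bare filename, where os.path.dirname() returns '' and os.makedirs('') would raise. A minimal sketch, assuming FileBasedDataWriter is exported from magic_pdf.data.data_reader_writer and defaults to an empty parent directory:

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter

    writer = FileBasedDataWriter()          # assumed default: no parent directory configured
    writer.write('output.md', b'# hello')   # dirname is '', so makedirs is now skipped
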
magic_pdf/data/data_reader_writer/multi_bucket_s3.py CHANGED
@@ -1,4 +1,4 @@
-import os
+
 from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
 from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
 from magic_pdf.data.io.s3 import S3Reader, S3Writer
@@ -22,10 +22,10 @@ class MultiS3Mixin:
         """
         if len(default_prefix) == 0:
             raise InvalidConfig('default_prefix must be provided')
-
-        arr = default_prefix.strip("/").split("/")
+
+        arr = default_prefix.strip('/').split('/')
         self.default_bucket = arr[0]
-        self.default_prefix = "/".join(arr[1:])
+        self.default_prefix = '/'.join(arr[1:])
 
         found_default_bucket_config = False
         for conf in s3_configs:
@@ -103,7 +103,8 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
             s3_reader = self.__get_s3_client(bucket_name)
         else:
             s3_reader = self.__get_s3_client(self.default_bucket)
-            path = os.path.join(self.default_prefix, path)
+            if self.default_prefix:
+                path = self.default_prefix + '/' + path
         return s3_reader.read_at(path, offset, limit)
 
 
@@ -139,5 +140,6 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
             s3_writer = self.__get_s3_client(bucket_name)
         else:
             s3_writer = self.__get_s3_client(self.default_bucket)
-            path = os.path.join(self.default_prefix, path)
+            if self.default_prefix:
+                path = self.default_prefix + '/' + path
         return s3_writer.write(path, data)
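
Net effect of the S3 changes, shown as a standalone sketch that mirrors the diff (bucket and key names are made up): the first segment of default_prefix becomes the bucket, the remainder is re-joined with '/', and the prefix is only prepended when it is non-empty:

    default_prefix = 'my-bucket/datasets/pdfs'
    arr = default_prefix.strip('/').split('/')
    default_bucket = arr[0]             # 'my-bucket'
    default_prefix = '/'.join(arr[1:])  # 'datasets/pdfs'

    path = 'sample.pdf'
    if default_prefix:
        path = default_prefix + '/' + path
    print(default_bucket, path)  # my-bucket datasets/pdfs/sample.pdf
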
magic_pdf/data/dataset.py CHANGED
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 from typing import Callable, Iterator
 
 import fitz
+from loguru import logger
 
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.schemas import PageInfo
@@ -133,7 +134,7 @@ class Dataset(ABC):
 
 
 class PymuDocDataset(Dataset):
-    def __init__(self, bits: bytes):
+    def __init__(self, bits: bytes, lang=None):
         """Initialize the dataset, which wraps the pymudoc documents.
 
         Args:
@@ -144,6 +145,15 @@ class PymuDocDataset(Dataset):
         self._data_bits = bits
         self._raw_data = bits
 
+        if lang == '':
+            self._lang = None
+        elif lang == 'auto':
+            from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang
+            self._lang = auto_detect_lang(bits)
+            logger.info(f"lang: {lang}, detect_lang: {self._lang}")
+        else:
+            self._lang = lang
+            logger.info(f"lang: {lang}")
     def __len__(self) -> int:
         """The page number of the pdf."""
         return len(self._records)
@@ -197,6 +207,8 @@ class PymuDocDataset(Dataset):
         Returns:
             Any: return the result generated by proc
         """
+        if 'lang' in kwargs and self._lang is not None:
+            kwargs['lang'] = self._lang
         return proc(self, *args, **kwargs)
 
     def classify(self) -> SupportedPdfParseMethod:
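
Usage sketch for the new lang parameter (the file name is hypothetical); 'auto' routes through the new YOLOv11-based language detector, while an empty string behaves like no hint:

    from magic_pdf.data.dataset import PymuDocDataset

    with open('example.pdf', 'rb') as f:
        pdf_bytes = f.read()

    ds = PymuDocDataset(pdf_bytes, lang='auto')  # detect the document language
    # ds = PymuDocDataset(pdf_bytes, lang='en')  # or pass a fixed language hint
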
magic_pdf/data/read_api.py CHANGED
@@ -1,12 +1,14 @@
 import json
 import os
+import tempfile
+import shutil
 from pathlib import Path
 
 from magic_pdf.config.exceptions import EmptyData, InvalidParams
 from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                                MultiBucketS3DataReader)
 from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
-
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
 
 def read_jsonl(
     s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
@@ -58,23 +60,68 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
         list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
     """
     if os.path.isdir(path):
-        reader = FileBasedDataReader(path)
-        return [
-            PymuDocDataset(reader.read(doc_path.name))
-            for doc_path in Path(path).glob('*.pdf')
-        ]
+        reader = FileBasedDataReader()
+        ret = []
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = file.split('.')
+                if suffix[-1] == 'pdf':
+                    ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
+        return ret
     else:
         reader = FileBasedDataReader()
         bits = reader.read(path)
         return [PymuDocDataset(bits)]
 
+
+def read_local_office(path: str) -> list[PymuDocDataset]:
+    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
 
-def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
+    Args:
+        path (str): ms-office file or directory that contains ms-office files
+
+    Returns:
+        list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset
+
+    Raises:
+        ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
+        FileNotFoundError: File not Found
+        Exception: Unknown Exception raised
+    """
+    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
+    fns = []
+    ret = []
+    if os.path.isdir(path):
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = Path(file).suffix
+                if suffix in suffixes:
+                    fns.append((os.path.join(root, file)))
+    else:
+        fns.append(path)
+
+    reader = FileBasedDataReader()
+    temp_dir = tempfile.mkdtemp()
+    for fn in fns:
+        try:
+            convert_file_to_pdf(fn, temp_dir)
+        except ConvertToPdfError as e:
+            raise e
+        except FileNotFoundError as e:
+            raise e
+        except Exception as e:
+            raise e
+        fn_path = Path(fn)
+        pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
+        ret.append(PymuDocDataset(reader.read(pdf_fn)))
+    shutil.rmtree(temp_dir)
+    return ret
+
+
+def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
     """Read images from path or directory.
 
     Args:
         path (str): image file path or directory that contains image files
-        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
+        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
 
     Returns:
         list[ImageDataset]: each image file will converted to a ImageDataset
@@ -82,12 +129,12 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
     if os.path.isdir(path):
         imgs_bits = []
         s_suffixes = set(suffixes)
-        reader = FileBasedDataReader(path)
+        reader = FileBasedDataReader()
         for root, _, files in os.walk(path):
             for file in files:
-                suffix = file.split('.')
-                if suffix[-1] in s_suffixes:
-                    imgs_bits.append(reader.read(file))
+                suffix = Path(file).suffix
+                if suffix in s_suffixes:
+                    imgs_bits.append(reader.read(os.path.join(root, file)))
         return [ImageDataset(bits) for bits in imgs_bits]
     else:
         reader = FileBasedDataReader()
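
Usage sketch for the new readers (paths are hypothetical). read_local_office converts each office file to PDF in a temporary directory via LibreOffice before wrapping it, and read_local_images now defaults to '.png'/'.jpg' and walks directories recursively:

    from magic_pdf.data.read_api import read_local_images, read_local_office

    office_datasets = read_local_office('reports/')   # .ppt/.pptx/.doc/.docx -> PymuDocDataset
    image_datasets = read_local_images('scans/', suffixes=['.png', '.jpg'])
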
magic_pdf/data/utils.py CHANGED
@@ -1,6 +1,7 @@
 
 import fitz
 import numpy as np
+from loguru import logger
 
 from magic_pdf.utils.annotations import ImportPIL
 
@@ -30,3 +31,37 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
     img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
 
     return img_dict
+
+@ImportPIL
+def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
+    from PIL import Image
+    images = []
+    with fitz.open('pdf', pdf_bytes) as doc:
+        pdf_page_num = doc.page_count
+        end_page_id = (
+            end_page_id
+            if end_page_id is not None and end_page_id >= 0
+            else pdf_page_num - 1
+        )
+        if end_page_id > pdf_page_num - 1:
+            logger.warning('end_page_id is out of range, use images length')
+            end_page_id = pdf_page_num - 1
+
+        for index in range(0, doc.page_count):
+            if start_page_id <= index <= end_page_id:
+                page = doc[index]
+                mat = fitz.Matrix(dpi / 72, dpi / 72)
+                pm = page.get_pixmap(matrix=mat, alpha=False)
+
+                # If the width or height exceeds 4500 after scaling, do not scale further.
+                if pm.width > 4500 or pm.height > 4500:
+                    pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+                img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
+                img = np.array(img)
+                img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
+            else:
+                img_dict = {'img': [], 'width': 0, 'height': 0}
+
+            images.append(img_dict)
+    return images
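
A short sketch of the new helper (the file name is hypothetical): pages outside the requested range come back as empty entries with width/height 0, so the list always has one item per page:

    from magic_pdf.data.utils import load_images_from_pdf

    with open('example.pdf', 'rb') as f:
        pdf_bytes = f.read()

    images = load_images_from_pdf(pdf_bytes, dpi=200, start_page_id=0, end_page_id=2)
    for i, item in enumerate(images):
        print(i, item['width'], item['height'])
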
magic_pdf/dict2md/ocr_mkcontent.py CHANGED
@@ -7,7 +7,7 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
-from magic_pdf.para.para_split_v3 import ListLineTag
+from magic_pdf.post_proc.para_split_v3 import ListLineTag
 
 
 def __is_hyphen_at_line_end(line):
@@ -61,7 +61,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
         if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Title:
-            para_text = f'# {merge_para_with_text(para_block)}'
+            title_level = get_title_level(para_block)
+            para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
         elif para_type == BlockType.InterlineEquation:
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Image:
@@ -125,16 +126,6 @@ def detect_language(text):
         return 'empty'
 
 
-# split ligature characters
-def __replace_ligatures(text: str):
-    text = re.sub(r'ﬁ', 'fi', text)  # replace the fi ligature
-    text = re.sub(r'ﬂ', 'fl', text)  # replace the fl ligature
-    text = re.sub(r'ﬀ', 'ff', text)  # replace the ff ligature
-    text = re.sub(r'ﬃ', 'ffi', text)  # replace the ffi ligature
-    text = re.sub(r'ﬄ', 'ffl', text)  # replace the ffl ligature
-    return text
-
-
 def merge_para_with_text(para_block):
     block_text = ''
     for line in para_block['lines']:
@@ -196,10 +187,11 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
             'text': merge_para_with_text(para_block),
         }
     elif para_type == BlockType.Title:
+        title_level = get_title_level(para_block)
         para_content = {
             'type': 'text',
             'text': merge_para_with_text(para_block),
-            'text_level': 1,
+            'text_level': title_level,
         }
     elif para_type == BlockType.InterlineEquation:
         para_content = {
@@ -299,3 +291,12 @@ def union_make(pdf_info_dict: list,
         return '\n\n'.join(output_content)
     elif make_mode == MakeMode.STANDARD_FORMAT:
         return output_content
+
+
+def get_title_level(block):
+    title_level = block.get('level', 1)
+    if title_level > 4:
+        title_level = 4
+    elif title_level < 1:
+        title_level = 1
+    return title_level
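
Titles now map to markdown headings by the block's 'level', clamped to the range 1-4. A standalone illustration of the clamping (not package code):

    for level in (0, 1, 3, 6):
        clamped = min(max(level, 1), 4)  # same result as get_title_level
        print(f'{"#" * clamped} Heading (level={level})')
    # -> '# Heading (level=0)', '# Heading (level=1)', '### Heading (level=3)', '#### Heading (level=6)'
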
magic_pdf/libs/clean_memory.py CHANGED
@@ -3,8 +3,15 @@ import torch
 import gc
 
 
-def clean_memory():
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        torch.cuda.ipc_collect()
+def clean_memory(device='cuda'):
+    if device == 'cuda':
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+    elif str(device).startswith("npu"):
+        import torch_npu
+        if torch_npu.npu.is_available():
+            torch_npu.npu.empty_cache()
+    elif str(device).startswith("mps"):
+        torch.mps.empty_cache()
     gc.collect()
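
Usage sketch: callers now pass the active device string so the matching cache is cleared (torch_npu is only needed on Ascend hosts):

    from magic_pdf.libs.clean_memory import clean_memory

    clean_memory('cuda')   # default; no-op when CUDA is unavailable
    clean_memory('npu:0')  # Ascend NPU via torch_npu
    clean_memory('mps')    # Apple Silicon
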
magic_pdf/libs/config_reader.py CHANGED
@@ -116,6 +116,15 @@ def get_formula_config():
     else:
         return formula_config
 
+def get_llm_aided_config():
+    config = read_config()
+    llm_aided_config = config.get('llm-aided-config')
+    if llm_aided_config is None:
+        logger.warning(f"'llm-aided-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
+        return None
+    else:
+        return llm_aided_config
+
 
 if __name__ == '__main__':
     ak, sk, endpoint = get_s3_config('llm-raw')
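
Sketch of reading the optional block: the function returns None when the 'llm-aided-config' key is absent from the user config file, so callers can treat LLM-aided post-processing as disabled (the keys inside the block are not shown in this diff and are not assumed here):

    from magic_pdf.libs.config_reader import get_llm_aided_config

    llm_aided_config = get_llm_aided_config()
    if llm_aided_config is None:
        print('LLM-aided post-processing disabled')
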
magic_pdf/libs/draw_bbox.py CHANGED
@@ -394,17 +394,13 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
     pdf_docs.save(f'{out_path}/{filename}')
 
 
-def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
-    layout_bbox_list = []
-
-    for page in pdf_info:
-        page_block_list = []
-        for block in page['para_blocks']:
-            bbox = block['bbox']
-            page_block_list.append(bbox)
-        layout_bbox_list.append(page_block_list)
+def draw_char_bbox(pdf_bytes, out_path, filename):
     pdf_docs = fitz.open('pdf', pdf_bytes)
     for i, page in enumerate(pdf_docs):
-        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
-
-    pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')
+        for block in page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']:
+            for line in block['lines']:
+                for span in line['spans']:
+                    for char in span['chars']:
+                        char_bbox = char['bbox']
+                        page.draw_rect(char_bbox, color=[1, 0, 0], fill=None, fill_opacity=1, width=0.3, overlay=True,)
+    pdf_docs.save(f'{out_path}/{filename}')
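
Sketch of the new debugging helper (paths are hypothetical; the output directory is assumed to exist). It draws a thin red rectangle around every character reported by PyMuPDF's rawdict extraction:

    from magic_pdf.libs.draw_bbox import draw_char_bbox

    with open('example.pdf', 'rb') as f:
        pdf_bytes = f.read()

    draw_char_bbox(pdf_bytes, './output', 'example_char_bbox.pdf')
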
magic_pdf/libs/language.py CHANGED
@@ -16,11 +16,14 @@ def detect_lang(text: str) -> str:
 
     if len(text) == 0:
         return ""
+
+    text = text.replace("\n", "")
     try:
         lang_upper = detect_language(text)
     except:
         html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
         lang_upper = detect_language(html_no_ctrl_chars)
+
     try:
         lang = lang_upper.lower()
     except:
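
Stripping newlines before detection keeps multi-line snippets from skewing the result. A small sketch (the detected code shown is only an example):

    from magic_pdf.libs.language import detect_lang

    print(detect_lang('The quick brown\nfox jumps over the lazy dog'))  # e.g. 'en'
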
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.10.6"
+__version__ = "1.0.1"
magic_pdf/model/__init__.py CHANGED
@@ -1,126 +1,2 @@
-from typing import Callable
-
-from abc import ABC, abstractmethod
-
-from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.data.dataset import Dataset
-from magic_pdf.pipe.operators import PipeResult
-
-
 __use_inside_model__ = True
-__model_mode__ = "full"
-
-
-class InferenceResultBase(ABC):
-
-    @abstractmethod
-    def __init__(self, inference_results: list, dataset: Dataset):
-        """Initialized method.
-
-        Args:
-            inference_results (list): the inference result generated by model
-            dataset (Dataset): the dataset related with model inference result
-        """
-        self._infer_res = inference_results
-        self._dataset = dataset
-
-    @abstractmethod
-    def draw_model(self, file_path: str) -> None:
-        """Draw model inference result.
-
-        Args:
-            file_path (str): the output file path
-        """
-        pass
-
-    @abstractmethod
-    def dump_model(self, writer: DataWriter, file_path: str):
-        """Dump model inference result to file.
-
-        Args:
-            writer (DataWriter): writer handle
-            file_path (str): the location of target file
-        """
-        pass
-
-    @abstractmethod
-    def get_infer_res(self):
-        """Get the inference result.
-
-        Returns:
-            list: the inference result generated by model
-        """
-        pass
-
-    @abstractmethod
-    def apply(self, proc: Callable, *args, **kwargs):
-        """Apply callable method which.
-
-        Args:
-            proc (Callable): invoke proc as follows:
-                proc(inference_result, *args, **kwargs)
-
-        Returns:
-            Any: return the result generated by proc
-        """
-        pass
-
-    @abstractmethod
-    def pipe_auto_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        """Post-proc the model inference result.
-            step1: classify the dataset type
-            step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
-
-        Args:
-            imageWriter (DataWriter): the image writer handle
-            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
-            end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
-            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
-            lang (str, optional): Defaults to None.
-
-        Returns:
-            PipeResult: the result
-        """
-        pass
-
-    @abstractmethod
-    def pipe_txt_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        """Post-proc the model inference result, Extract the text using the
-        third library, such as `pymupdf`
-
-        Args:
-            imageWriter (DataWriter): the image writer handle
-            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
-            end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
-            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
-            lang (str, optional): Defaults to None.
-
-        Returns:
-            PipeResult: the result
-        """
-        pass
-
-    @abstractmethod
-    def pipe_ocr_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        pass
+__model_mode__ = 'full'
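
With the abstract InferenceResultBase removed, the concrete result/pipe classes now live under the new magic_pdf.operators package (entries 32-34 in the file list). A hedged sketch of the updated imports, assuming the class names carried over unchanged from the renamed modules:

    from magic_pdf.operators.models import InferenceResult  # formerly magic_pdf/model/operators.py
    from magic_pdf.operators.pipes import PipeResult        # formerly magic_pdf/pipe/operators.py
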