magic-pdf 0.10.6__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +2 -0
- magic_pdf/config/exceptions.py +7 -0
- magic_pdf/data/data_reader_writer/filebase.py +1 -1
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
- magic_pdf/data/dataset.py +13 -1
- magic_pdf/data/read_api.py +59 -12
- magic_pdf/data/utils.py +35 -0
- magic_pdf/dict2md/ocr_mkcontent.py +14 -13
- magic_pdf/libs/clean_memory.py +11 -4
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/draw_bbox.py +8 -12
- magic_pdf/libs/language.py +3 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -125
- magic_pdf/model/batch_analyze.py +275 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +4 -51
- magic_pdf/model/magic_model.py +4 -435
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +33 -22
- magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
- magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
- magic_pdf/model/sub_modules/model_init.py +30 -4
- magic_pdf/model/sub_modules/model_utils.py +8 -2
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
- magic_pdf/operators/__init__.py +94 -0
- magic_pdf/{model/operators.py → operators/models.py} +2 -38
- magic_pdf/{pipe/operators.py → operators/pipes.py} +70 -17
- magic_pdf/pdf_parse_union_core_v2.py +68 -17
- magic_pdf/post_proc/__init__.py +1 -0
- magic_pdf/post_proc/llm_aided.py +133 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
- magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
- magic_pdf/tools/cli.py +36 -11
- magic_pdf/tools/common.py +28 -18
- magic_pdf/utils/office_to_pdf.py +29 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +73 -23
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +50 -53
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/pdf_parse_by_ocr.py +0 -22
- magic_pdf/pdf_parse_by_txt.py +0 -23
- magic_pdf/pipe/AbsPipe.py +0 -99
- magic_pdf/pipe/OCRPipe.py +0 -80
- magic_pdf/pipe/TXTPipe.py +0 -42
- magic_pdf/pipe/UNIPipe.py +0 -150
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/rw/AbsReaderWriter.py +0 -17
- magic_pdf/rw/DiskReaderWriter.py +0 -74
- magic_pdf/rw/S3ReaderWriter.py +0 -142
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/user_api.py +0 -144
- /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
magic_pdf/config/constants.py
CHANGED
magic_pdf/config/exceptions.py
CHANGED
@@ -55,7 +55,7 @@ class FileBasedDataWriter(DataWriter):
|
|
55
55
|
if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
|
56
56
|
fn_path = os.path.join(self._parent_dir, path)
|
57
57
|
|
58
|
-
if not os.path.exists(os.path.dirname(fn_path)):
|
58
|
+
if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "":
|
59
59
|
os.makedirs(os.path.dirname(fn_path), exist_ok=True)
|
60
60
|
|
61
61
|
with open(fn_path, 'wb') as f:
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
|
2
2
|
from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
|
3
3
|
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
|
4
4
|
from magic_pdf.data.io.s3 import S3Reader, S3Writer
|
@@ -22,10 +22,10 @@ class MultiS3Mixin:
|
|
22
22
|
"""
|
23
23
|
if len(default_prefix) == 0:
|
24
24
|
raise InvalidConfig('default_prefix must be provided')
|
25
|
-
|
26
|
-
arr = default_prefix.strip(
|
25
|
+
|
26
|
+
arr = default_prefix.strip('/').split('/')
|
27
27
|
self.default_bucket = arr[0]
|
28
|
-
self.default_prefix =
|
28
|
+
self.default_prefix = '/'.join(arr[1:])
|
29
29
|
|
30
30
|
found_default_bucket_config = False
|
31
31
|
for conf in s3_configs:
|
@@ -103,7 +103,8 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
|
|
103
103
|
s3_reader = self.__get_s3_client(bucket_name)
|
104
104
|
else:
|
105
105
|
s3_reader = self.__get_s3_client(self.default_bucket)
|
106
|
-
|
106
|
+
if self.default_prefix:
|
107
|
+
path = self.default_prefix + '/' + path
|
107
108
|
return s3_reader.read_at(path, offset, limit)
|
108
109
|
|
109
110
|
|
@@ -139,5 +140,6 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
|
|
139
140
|
s3_writer = self.__get_s3_client(bucket_name)
|
140
141
|
else:
|
141
142
|
s3_writer = self.__get_s3_client(self.default_bucket)
|
142
|
-
|
143
|
+
if self.default_prefix:
|
144
|
+
path = self.default_prefix + '/' + path
|
143
145
|
return s3_writer.write(path, data)
|
magic_pdf/data/dataset.py
CHANGED
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
|
|
3
3
|
from typing import Callable, Iterator
|
4
4
|
|
5
5
|
import fitz
|
6
|
+
from loguru import logger
|
6
7
|
|
7
8
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
8
9
|
from magic_pdf.data.schemas import PageInfo
|
@@ -133,7 +134,7 @@ class Dataset(ABC):
|
|
133
134
|
|
134
135
|
|
135
136
|
class PymuDocDataset(Dataset):
|
136
|
-
def __init__(self, bits: bytes):
|
137
|
+
def __init__(self, bits: bytes, lang=None):
|
137
138
|
"""Initialize the dataset, which wraps the pymudoc documents.
|
138
139
|
|
139
140
|
Args:
|
@@ -144,6 +145,15 @@ class PymuDocDataset(Dataset):
|
|
144
145
|
self._data_bits = bits
|
145
146
|
self._raw_data = bits
|
146
147
|
|
148
|
+
if lang == '':
|
149
|
+
self._lang = None
|
150
|
+
elif lang == 'auto':
|
151
|
+
from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang
|
152
|
+
self._lang = auto_detect_lang(bits)
|
153
|
+
logger.info(f"lang: {lang}, detect_lang: {self._lang}")
|
154
|
+
else:
|
155
|
+
self._lang = lang
|
156
|
+
logger.info(f"lang: {lang}")
|
147
157
|
def __len__(self) -> int:
|
148
158
|
"""The page number of the pdf."""
|
149
159
|
return len(self._records)
|
@@ -197,6 +207,8 @@ class PymuDocDataset(Dataset):
|
|
197
207
|
Returns:
|
198
208
|
Any: return the result generated by proc
|
199
209
|
"""
|
210
|
+
if 'lang' in kwargs and self._lang is not None:
|
211
|
+
kwargs['lang'] = self._lang
|
200
212
|
return proc(self, *args, **kwargs)
|
201
213
|
|
202
214
|
def classify(self) -> SupportedPdfParseMethod:
|
magic_pdf/data/read_api.py
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
import json
|
2
2
|
import os
|
3
|
+
import tempfile
|
4
|
+
import shutil
|
3
5
|
from pathlib import Path
|
4
6
|
|
5
7
|
from magic_pdf.config.exceptions import EmptyData, InvalidParams
|
6
8
|
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
|
7
9
|
MultiBucketS3DataReader)
|
8
10
|
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
|
9
|
-
|
11
|
+
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
|
10
12
|
|
11
13
|
def read_jsonl(
|
12
14
|
s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
|
@@ -58,23 +60,68 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
|
|
58
60
|
list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
|
59
61
|
"""
|
60
62
|
if os.path.isdir(path):
|
61
|
-
reader = FileBasedDataReader(
|
62
|
-
|
63
|
-
|
64
|
-
for
|
65
|
-
|
63
|
+
reader = FileBasedDataReader()
|
64
|
+
ret = []
|
65
|
+
for root, _, files in os.walk(path):
|
66
|
+
for file in files:
|
67
|
+
suffix = file.split('.')
|
68
|
+
if suffix[-1] == 'pdf':
|
69
|
+
ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
|
70
|
+
return ret
|
66
71
|
else:
|
67
72
|
reader = FileBasedDataReader()
|
68
73
|
bits = reader.read(path)
|
69
74
|
return [PymuDocDataset(bits)]
|
70
75
|
|
76
|
+
def read_local_office(path: str) -> list[PymuDocDataset]:
|
77
|
+
"""Read ms-office file (ppt, pptx, doc, docx) from path or directory.
|
71
78
|
|
72
|
-
|
79
|
+
Args:
|
80
|
+
path (str): ms-office file or directory that contains ms-office files
|
81
|
+
|
82
|
+
Returns:
|
83
|
+
list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset
|
84
|
+
|
85
|
+
Raises:
|
86
|
+
ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
|
87
|
+
FileNotFoundError: File not Found
|
88
|
+
Exception: Unknown Exception raised
|
89
|
+
"""
|
90
|
+
suffixes = ['.ppt', '.pptx', '.doc', '.docx']
|
91
|
+
fns = []
|
92
|
+
ret = []
|
93
|
+
if os.path.isdir(path):
|
94
|
+
for root, _, files in os.walk(path):
|
95
|
+
for file in files:
|
96
|
+
suffix = Path(file).suffix
|
97
|
+
if suffix in suffixes:
|
98
|
+
fns.append((os.path.join(root, file)))
|
99
|
+
else:
|
100
|
+
fns.append(path)
|
101
|
+
|
102
|
+
reader = FileBasedDataReader()
|
103
|
+
temp_dir = tempfile.mkdtemp()
|
104
|
+
for fn in fns:
|
105
|
+
try:
|
106
|
+
convert_file_to_pdf(fn, temp_dir)
|
107
|
+
except ConvertToPdfError as e:
|
108
|
+
raise e
|
109
|
+
except FileNotFoundError as e:
|
110
|
+
raise e
|
111
|
+
except Exception as e:
|
112
|
+
raise e
|
113
|
+
fn_path = Path(fn)
|
114
|
+
pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
|
115
|
+
ret.append(PymuDocDataset(reader.read(pdf_fn)))
|
116
|
+
shutil.rmtree(temp_dir)
|
117
|
+
return ret
|
118
|
+
|
119
|
+
def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
|
73
120
|
"""Read images from path or directory.
|
74
121
|
|
75
122
|
Args:
|
76
123
|
path (str): image file path or directory that contains image files
|
77
|
-
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
|
124
|
+
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
|
78
125
|
|
79
126
|
Returns:
|
80
127
|
list[ImageDataset]: each image file will converted to a ImageDataset
|
@@ -82,12 +129,12 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
|
|
82
129
|
if os.path.isdir(path):
|
83
130
|
imgs_bits = []
|
84
131
|
s_suffixes = set(suffixes)
|
85
|
-
reader = FileBasedDataReader(
|
132
|
+
reader = FileBasedDataReader()
|
86
133
|
for root, _, files in os.walk(path):
|
87
134
|
for file in files:
|
88
|
-
suffix = file.
|
89
|
-
if suffix
|
90
|
-
imgs_bits.append(reader.read(file))
|
135
|
+
suffix = Path(file).suffix
|
136
|
+
if suffix in s_suffixes:
|
137
|
+
imgs_bits.append(reader.read(os.path.join(root, file)))
|
91
138
|
return [ImageDataset(bits) for bits in imgs_bits]
|
92
139
|
else:
|
93
140
|
reader = FileBasedDataReader()
|
magic_pdf/data/utils.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
|
2
2
|
import fitz
|
3
3
|
import numpy as np
|
4
|
+
from loguru import logger
|
4
5
|
|
5
6
|
from magic_pdf.utils.annotations import ImportPIL
|
6
7
|
|
@@ -30,3 +31,37 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
|
|
30
31
|
img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
|
31
32
|
|
32
33
|
return img_dict
|
34
|
+
|
35
|
+
@ImportPIL
|
36
|
+
def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
|
37
|
+
from PIL import Image
|
38
|
+
images = []
|
39
|
+
with fitz.open('pdf', pdf_bytes) as doc:
|
40
|
+
pdf_page_num = doc.page_count
|
41
|
+
end_page_id = (
|
42
|
+
end_page_id
|
43
|
+
if end_page_id is not None and end_page_id >= 0
|
44
|
+
else pdf_page_num - 1
|
45
|
+
)
|
46
|
+
if end_page_id > pdf_page_num - 1:
|
47
|
+
logger.warning('end_page_id is out of range, use images length')
|
48
|
+
end_page_id = pdf_page_num - 1
|
49
|
+
|
50
|
+
for index in range(0, doc.page_count):
|
51
|
+
if start_page_id <= index <= end_page_id:
|
52
|
+
page = doc[index]
|
53
|
+
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
54
|
+
pm = page.get_pixmap(matrix=mat, alpha=False)
|
55
|
+
|
56
|
+
# If the width or height exceeds 4500 after scaling, do not scale further.
|
57
|
+
if pm.width > 4500 or pm.height > 4500:
|
58
|
+
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
59
|
+
|
60
|
+
img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
|
61
|
+
img = np.array(img)
|
62
|
+
img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
|
63
|
+
else:
|
64
|
+
img_dict = {'img': [], 'width': 0, 'height': 0}
|
65
|
+
|
66
|
+
images.append(img_dict)
|
67
|
+
return images
|
@@ -7,7 +7,7 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
|
7
7
|
from magic_pdf.libs.commons import join_path
|
8
8
|
from magic_pdf.libs.language import detect_lang
|
9
9
|
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
10
|
-
from magic_pdf.
|
10
|
+
from magic_pdf.post_proc.para_split_v3 import ListLineTag
|
11
11
|
|
12
12
|
|
13
13
|
def __is_hyphen_at_line_end(line):
|
@@ -61,7 +61,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
|
61
61
|
if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
|
62
62
|
para_text = merge_para_with_text(para_block)
|
63
63
|
elif para_type == BlockType.Title:
|
64
|
-
|
64
|
+
title_level = get_title_level(para_block)
|
65
|
+
para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
|
65
66
|
elif para_type == BlockType.InterlineEquation:
|
66
67
|
para_text = merge_para_with_text(para_block)
|
67
68
|
elif para_type == BlockType.Image:
|
@@ -125,16 +126,6 @@ def detect_language(text):
|
|
125
126
|
return 'empty'
|
126
127
|
|
127
128
|
|
128
|
-
# 连写字符拆分
|
129
|
-
def __replace_ligatures(text: str):
|
130
|
-
text = re.sub(r'fi', 'fi', text) # 替换 fi 连写符
|
131
|
-
text = re.sub(r'fl', 'fl', text) # 替换 fl 连写符
|
132
|
-
text = re.sub(r'ff', 'ff', text) # 替换 ff 连写符
|
133
|
-
text = re.sub(r'ffi', 'ffi', text) # 替换 ffi 连写符
|
134
|
-
text = re.sub(r'ffl', 'ffl', text) # 替换 ffl 连写符
|
135
|
-
return text
|
136
|
-
|
137
|
-
|
138
129
|
def merge_para_with_text(para_block):
|
139
130
|
block_text = ''
|
140
131
|
for line in para_block['lines']:
|
@@ -196,10 +187,11 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
|
|
196
187
|
'text': merge_para_with_text(para_block),
|
197
188
|
}
|
198
189
|
elif para_type == BlockType.Title:
|
190
|
+
title_level = get_title_level(para_block)
|
199
191
|
para_content = {
|
200
192
|
'type': 'text',
|
201
193
|
'text': merge_para_with_text(para_block),
|
202
|
-
'text_level':
|
194
|
+
'text_level': title_level,
|
203
195
|
}
|
204
196
|
elif para_type == BlockType.InterlineEquation:
|
205
197
|
para_content = {
|
@@ -299,3 +291,12 @@ def union_make(pdf_info_dict: list,
|
|
299
291
|
return '\n\n'.join(output_content)
|
300
292
|
elif make_mode == MakeMode.STANDARD_FORMAT:
|
301
293
|
return output_content
|
294
|
+
|
295
|
+
|
296
|
+
def get_title_level(block):
|
297
|
+
title_level = block.get('level', 1)
|
298
|
+
if title_level > 4:
|
299
|
+
title_level = 4
|
300
|
+
elif title_level < 1:
|
301
|
+
title_level = 1
|
302
|
+
return title_level
|
magic_pdf/libs/clean_memory.py
CHANGED
@@ -3,8 +3,15 @@ import torch
|
|
3
3
|
import gc
|
4
4
|
|
5
5
|
|
6
|
-
def clean_memory():
|
7
|
-
if
|
8
|
-
torch.cuda.
|
9
|
-
|
6
|
+
def clean_memory(device='cuda'):
|
7
|
+
if device == 'cuda':
|
8
|
+
if torch.cuda.is_available():
|
9
|
+
torch.cuda.empty_cache()
|
10
|
+
torch.cuda.ipc_collect()
|
11
|
+
elif str(device).startswith("npu"):
|
12
|
+
import torch_npu
|
13
|
+
if torch_npu.npu.is_available():
|
14
|
+
torch_npu.npu.empty_cache()
|
15
|
+
elif str(device).startswith("mps"):
|
16
|
+
torch.mps.empty_cache()
|
10
17
|
gc.collect()
|
magic_pdf/libs/config_reader.py
CHANGED
@@ -116,6 +116,15 @@ def get_formula_config():
|
|
116
116
|
else:
|
117
117
|
return formula_config
|
118
118
|
|
119
|
+
def get_llm_aided_config():
|
120
|
+
config = read_config()
|
121
|
+
llm_aided_config = config.get('llm-aided-config')
|
122
|
+
if llm_aided_config is None:
|
123
|
+
logger.warning(f"'llm-aided-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
|
124
|
+
return None
|
125
|
+
else:
|
126
|
+
return llm_aided_config
|
127
|
+
|
119
128
|
|
120
129
|
if __name__ == '__main__':
|
121
130
|
ak, sk, endpoint = get_s3_config('llm-raw')
|
magic_pdf/libs/draw_bbox.py
CHANGED
@@ -394,17 +394,13 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
394
394
|
pdf_docs.save(f'{out_path}/{filename}')
|
395
395
|
|
396
396
|
|
397
|
-
def
|
398
|
-
layout_bbox_list = []
|
399
|
-
|
400
|
-
for page in pdf_info:
|
401
|
-
page_block_list = []
|
402
|
-
for block in page['para_blocks']:
|
403
|
-
bbox = block['bbox']
|
404
|
-
page_block_list.append(bbox)
|
405
|
-
layout_bbox_list.append(page_block_list)
|
397
|
+
def draw_char_bbox(pdf_bytes, out_path, filename):
|
406
398
|
pdf_docs = fitz.open('pdf', pdf_bytes)
|
407
399
|
for i, page in enumerate(pdf_docs):
|
408
|
-
|
409
|
-
|
410
|
-
|
400
|
+
for block in page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']:
|
401
|
+
for line in block['lines']:
|
402
|
+
for span in line['spans']:
|
403
|
+
for char in span['chars']:
|
404
|
+
char_bbox = char['bbox']
|
405
|
+
page.draw_rect(char_bbox, color=[1, 0, 0], fill=None, fill_opacity=1, width=0.3, overlay=True,)
|
406
|
+
pdf_docs.save(f'{out_path}/{filename}')
|
magic_pdf/libs/language.py
CHANGED
@@ -16,11 +16,14 @@ def detect_lang(text: str) -> str:
|
|
16
16
|
|
17
17
|
if len(text) == 0:
|
18
18
|
return ""
|
19
|
+
|
20
|
+
text = text.replace("\n", "")
|
19
21
|
try:
|
20
22
|
lang_upper = detect_language(text)
|
21
23
|
except:
|
22
24
|
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
|
23
25
|
lang_upper = detect_language(html_no_ctrl_chars)
|
26
|
+
|
24
27
|
try:
|
25
28
|
lang = lang_upper.lower()
|
26
29
|
except:
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.
|
1
|
+
__version__ = "1.0.0"
|
magic_pdf/model/__init__.py
CHANGED
@@ -1,126 +1,2 @@
|
|
1
|
-
from typing import Callable
|
2
|
-
|
3
|
-
from abc import ABC, abstractmethod
|
4
|
-
|
5
|
-
from magic_pdf.data.data_reader_writer import DataWriter
|
6
|
-
from magic_pdf.data.dataset import Dataset
|
7
|
-
from magic_pdf.pipe.operators import PipeResult
|
8
|
-
|
9
|
-
|
10
1
|
__use_inside_model__ = True
|
11
|
-
__model_mode__ =
|
12
|
-
|
13
|
-
|
14
|
-
class InferenceResultBase(ABC):
|
15
|
-
|
16
|
-
@abstractmethod
|
17
|
-
def __init__(self, inference_results: list, dataset: Dataset):
|
18
|
-
"""Initialized method.
|
19
|
-
|
20
|
-
Args:
|
21
|
-
inference_results (list): the inference result generated by model
|
22
|
-
dataset (Dataset): the dataset related with model inference result
|
23
|
-
"""
|
24
|
-
self._infer_res = inference_results
|
25
|
-
self._dataset = dataset
|
26
|
-
|
27
|
-
@abstractmethod
|
28
|
-
def draw_model(self, file_path: str) -> None:
|
29
|
-
"""Draw model inference result.
|
30
|
-
|
31
|
-
Args:
|
32
|
-
file_path (str): the output file path
|
33
|
-
"""
|
34
|
-
pass
|
35
|
-
|
36
|
-
@abstractmethod
|
37
|
-
def dump_model(self, writer: DataWriter, file_path: str):
|
38
|
-
"""Dump model inference result to file.
|
39
|
-
|
40
|
-
Args:
|
41
|
-
writer (DataWriter): writer handle
|
42
|
-
file_path (str): the location of target file
|
43
|
-
"""
|
44
|
-
pass
|
45
|
-
|
46
|
-
@abstractmethod
|
47
|
-
def get_infer_res(self):
|
48
|
-
"""Get the inference result.
|
49
|
-
|
50
|
-
Returns:
|
51
|
-
list: the inference result generated by model
|
52
|
-
"""
|
53
|
-
pass
|
54
|
-
|
55
|
-
@abstractmethod
|
56
|
-
def apply(self, proc: Callable, *args, **kwargs):
|
57
|
-
"""Apply callable method which.
|
58
|
-
|
59
|
-
Args:
|
60
|
-
proc (Callable): invoke proc as follows:
|
61
|
-
proc(inference_result, *args, **kwargs)
|
62
|
-
|
63
|
-
Returns:
|
64
|
-
Any: return the result generated by proc
|
65
|
-
"""
|
66
|
-
pass
|
67
|
-
|
68
|
-
@abstractmethod
|
69
|
-
def pipe_auto_mode(
|
70
|
-
self,
|
71
|
-
imageWriter: DataWriter,
|
72
|
-
start_page_id=0,
|
73
|
-
end_page_id=None,
|
74
|
-
debug_mode=False,
|
75
|
-
lang=None,
|
76
|
-
) -> PipeResult:
|
77
|
-
"""Post-proc the model inference result.
|
78
|
-
step1: classify the dataset type
|
79
|
-
step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
|
80
|
-
|
81
|
-
Args:
|
82
|
-
imageWriter (DataWriter): the image writer handle
|
83
|
-
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
|
84
|
-
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
|
85
|
-
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
|
86
|
-
lang (str, optional): Defaults to None.
|
87
|
-
|
88
|
-
Returns:
|
89
|
-
PipeResult: the result
|
90
|
-
"""
|
91
|
-
pass
|
92
|
-
|
93
|
-
@abstractmethod
|
94
|
-
def pipe_txt_mode(
|
95
|
-
self,
|
96
|
-
imageWriter: DataWriter,
|
97
|
-
start_page_id=0,
|
98
|
-
end_page_id=None,
|
99
|
-
debug_mode=False,
|
100
|
-
lang=None,
|
101
|
-
) -> PipeResult:
|
102
|
-
"""Post-proc the model inference result, Extract the text using the
|
103
|
-
third library, such as `pymupdf`
|
104
|
-
|
105
|
-
Args:
|
106
|
-
imageWriter (DataWriter): the image writer handle
|
107
|
-
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
|
108
|
-
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
|
109
|
-
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
|
110
|
-
lang (str, optional): Defaults to None.
|
111
|
-
|
112
|
-
Returns:
|
113
|
-
PipeResult: the result
|
114
|
-
"""
|
115
|
-
pass
|
116
|
-
|
117
|
-
@abstractmethod
|
118
|
-
def pipe_ocr_mode(
|
119
|
-
self,
|
120
|
-
imageWriter: DataWriter,
|
121
|
-
start_page_id=0,
|
122
|
-
end_page_id=None,
|
123
|
-
debug_mode=False,
|
124
|
-
lang=None,
|
125
|
-
) -> PipeResult:
|
126
|
-
pass
|
2
|
+
__model_mode__ = 'full'
|