magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. magic_pdf/config/constants.py +7 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/base.py +13 -1
  4. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  5. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  6. magic_pdf/data/dataset.py +188 -5
  7. magic_pdf/data/read_api.py +59 -12
  8. magic_pdf/data/utils.py +35 -0
  9. magic_pdf/dict2md/ocr_mkcontent.py +16 -15
  10. magic_pdf/filter/__init__.py +32 -0
  11. magic_pdf/filter/pdf_meta_scan.py +3 -2
  12. magic_pdf/libs/clean_memory.py +11 -4
  13. magic_pdf/libs/config_reader.py +9 -0
  14. magic_pdf/libs/draw_bbox.py +19 -22
  15. magic_pdf/libs/language.py +3 -0
  16. magic_pdf/libs/pdf_check.py +30 -30
  17. magic_pdf/libs/version.py +1 -1
  18. magic_pdf/model/__init__.py +1 -1
  19. magic_pdf/model/batch_analyze.py +275 -0
  20. magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
  21. magic_pdf/model/magic_model.py +4 -435
  22. magic_pdf/model/model_list.py +1 -0
  23. magic_pdf/model/pdf_extract_kit.py +35 -5
  24. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  25. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  26. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  27. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  29. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  30. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  31. magic_pdf/model/sub_modules/model_init.py +43 -7
  32. magic_pdf/model/sub_modules/model_utils.py +17 -5
  33. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  34. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  35. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  36. magic_pdf/operators/__init__.py +94 -0
  37. magic_pdf/operators/models.py +154 -0
  38. magic_pdf/operators/pipes.py +191 -0
  39. magic_pdf/pdf_parse_union_core_v2.py +77 -27
  40. magic_pdf/post_proc/__init__.py +1 -0
  41. magic_pdf/post_proc/llm_aided.py +133 -0
  42. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  43. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  44. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  45. magic_pdf/tools/cli.py +36 -11
  46. magic_pdf/tools/common.py +120 -61
  47. magic_pdf/utils/office_to_pdf.py +29 -0
  48. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
  49. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
  50. magic_pdf/para/__init__.py +0 -0
  51. magic_pdf/pdf_parse_by_ocr.py +0 -23
  52. magic_pdf/pdf_parse_by_txt.py +0 -24
  53. magic_pdf/pipe/AbsPipe.py +0 -98
  54. magic_pdf/pipe/OCRPipe.py +0 -41
  55. magic_pdf/pipe/TXTPipe.py +0 -41
  56. magic_pdf/pipe/UNIPipe.py +0 -98
  57. magic_pdf/pipe/__init__.py +0 -0
  58. magic_pdf/rw/AbsReaderWriter.py +0 -17
  59. magic_pdf/rw/DiskReaderWriter.py +0 -74
  60. magic_pdf/rw/S3ReaderWriter.py +0 -142
  61. magic_pdf/rw/__init__.py +0 -0
  62. magic_pdf/user_api.py +0 -121
  63. magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  64. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
  65. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
  66. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
  67. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
@@ -51,3 +51,10 @@ class MODEL_NAME:
51
51
  UniMerNet_v2_Small = 'unimernet_small'
52
52
 
53
53
  RAPID_TABLE = 'rapid_table'
54
+
55
+ YOLO_V11_LangDetect = 'yolo_v11n_langdetect'
56
+
57
+
58
+ PARSE_TYPE_TXT = 'txt'
59
+ PARSE_TYPE_OCR = 'ocr'
60
+
@@ -30,3 +30,10 @@ class EmptyData(Exception):
30
30
 
31
31
  def __str__(self):
32
32
  return f'Empty data: {self.msg}'
33
+
34
+ class CUDA_NOT_AVAILABLE(Exception):
35
+ def __init__(self, msg):
36
+ self.msg = msg
37
+
38
+ def __str__(self):
39
+ return f'CUDA not available: {self.msg}'
@@ -48,4 +48,16 @@ class DataWriter(ABC):
48
48
  path (str): the target file where to write
49
49
  data (str): the data want to write
50
50
  """
51
- self.write(path, data.encode())
51
+
52
+ def safe_encode(data: str, method: str):
53
+ try:
54
+ bit_data = data.encode(encoding=method, errors='replace')
55
+ return bit_data, True
56
+ except: # noqa
57
+ return None, False
58
+
59
+ for method in ['utf-8', 'ascii']:
60
+ bit_data, flag = safe_encode(data, method)
61
+ if flag:
62
+ self.write(path, bit_data)
63
+ break
@@ -55,7 +55,7 @@ class FileBasedDataWriter(DataWriter):
55
55
  if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
56
56
  fn_path = os.path.join(self._parent_dir, path)
57
57
 
58
- if not os.path.exists(os.path.dirname(fn_path)):
58
+ if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "":
59
59
  os.makedirs(os.path.dirname(fn_path), exist_ok=True)
60
60
 
61
61
  with open(fn_path, 'wb') as f:
@@ -1,4 +1,4 @@
1
- import os
1
+
2
2
  from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
3
3
  from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
4
4
  from magic_pdf.data.io.s3 import S3Reader, S3Writer
@@ -22,10 +22,10 @@ class MultiS3Mixin:
22
22
  """
23
23
  if len(default_prefix) == 0:
24
24
  raise InvalidConfig('default_prefix must be provided')
25
-
26
- arr = default_prefix.strip("/").split("/")
25
+
26
+ arr = default_prefix.strip('/').split('/')
27
27
  self.default_bucket = arr[0]
28
- self.default_prefix = "/".join(arr[1:])
28
+ self.default_prefix = '/'.join(arr[1:])
29
29
 
30
30
  found_default_bucket_config = False
31
31
  for conf in s3_configs:
@@ -103,7 +103,8 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
103
103
  s3_reader = self.__get_s3_client(bucket_name)
104
104
  else:
105
105
  s3_reader = self.__get_s3_client(self.default_bucket)
106
- path = os.path.join(self.default_prefix, path)
106
+ if self.default_prefix:
107
+ path = self.default_prefix + '/' + path
107
108
  return s3_reader.read_at(path, offset, limit)
108
109
 
109
110
 
@@ -139,5 +140,6 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
139
140
  s3_writer = self.__get_s3_client(bucket_name)
140
141
  else:
141
142
  s3_writer = self.__get_s3_client(self.default_bucket)
142
- path = os.path.join(self.default_prefix, path)
143
+ if self.default_prefix:
144
+ path = self.default_prefix + '/' + path
143
145
  return s3_writer.write(path, data)
magic_pdf/data/dataset.py CHANGED
@@ -1,11 +1,14 @@
1
+ import os
1
2
  from abc import ABC, abstractmethod
2
- from typing import Iterator
3
+ from typing import Callable, Iterator
3
4
 
4
5
  import fitz
6
+ from loguru import logger
5
7
 
6
8
  from magic_pdf.config.enums import SupportedPdfParseMethod
7
9
  from magic_pdf.data.schemas import PageInfo
8
10
  from magic_pdf.data.utils import fitz_doc_to_image
11
+ from magic_pdf.filter import classify
9
12
 
10
13
 
11
14
  class PageableData(ABC):
@@ -28,6 +31,32 @@ class PageableData(ABC):
28
31
  """
29
32
  pass
30
33
 
34
+ @abstractmethod
35
+ def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
36
+ """draw rectangle.
37
+
38
+ Args:
39
+ rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
40
+ color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
41
+ fill (list[float] | None): fill the board with RGB, None means will not fill with color
42
+ fill_opacity (float): opacity of the fill, range from [0, 1]
43
+ width (float): the width of board
44
+ overlay (bool): fill the color in foreground or background. True means fill in background.
45
+ """
46
+ pass
47
+
48
+ @abstractmethod
49
+ def insert_text(self, coord, content, fontsize, color):
50
+ """insert text.
51
+
52
+ Args:
53
+ coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
54
+ content (str): the text content
55
+ fontsize (int): font size of the text
56
+ color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color!
57
+ """
58
+ pass
59
+
31
60
 
32
61
  class Dataset(ABC):
33
62
  @abstractmethod
@@ -66,18 +95,65 @@ class Dataset(ABC):
66
95
  """
67
96
  pass
68
97
 
98
+ @abstractmethod
99
+ def dump_to_file(self, file_path: str):
100
+ """Dump the file
101
+
102
+ Args:
103
+ file_path (str): the file path
104
+ """
105
+ pass
106
+
107
+ @abstractmethod
108
+ def apply(self, proc: Callable, *args, **kwargs):
109
+ """Apply callable method which.
110
+
111
+ Args:
112
+ proc (Callable): invoke proc as follows:
113
+ proc(self, *args, **kwargs)
114
+
115
+ Returns:
116
+ Any: return the result generated by proc
117
+ """
118
+ pass
119
+
120
+ @abstractmethod
121
+ def classify(self) -> SupportedPdfParseMethod:
122
+ """classify the dataset
123
+
124
+ Returns:
125
+ SupportedPdfParseMethod: _description_
126
+ """
127
+ pass
128
+
129
+ @abstractmethod
130
+ def clone(self):
131
+ """clone this dataset
132
+ """
133
+ pass
134
+
69
135
 
70
136
  class PymuDocDataset(Dataset):
71
- def __init__(self, bits: bytes):
137
+ def __init__(self, bits: bytes, lang=None):
72
138
  """Initialize the dataset, which wraps the pymudoc documents.
73
139
 
74
140
  Args:
75
141
  bits (bytes): the bytes of the pdf
76
142
  """
77
- self._records = [Doc(v) for v in fitz.open('pdf', bits)]
143
+ self._raw_fitz = fitz.open('pdf', bits)
144
+ self._records = [Doc(v) for v in self._raw_fitz]
78
145
  self._data_bits = bits
79
146
  self._raw_data = bits
80
147
 
148
+ if lang == '':
149
+ self._lang = None
150
+ elif lang == 'auto':
151
+ from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang
152
+ self._lang = auto_detect_lang(bits)
153
+ logger.info(f"lang: {lang}, detect_lang: {self._lang}")
154
+ else:
155
+ self._lang = lang
156
+ logger.info(f"lang: {lang}")
81
157
  def __len__(self) -> int:
82
158
  """The page number of the pdf."""
83
159
  return len(self._records)
@@ -109,6 +185,45 @@ class PymuDocDataset(Dataset):
109
185
  """
110
186
  return self._records[page_id]
111
187
 
188
+ def dump_to_file(self, file_path: str):
189
+ """Dump the file
190
+
191
+ Args:
192
+ file_path (str): the file path
193
+ """
194
+
195
+ dir_name = os.path.dirname(file_path)
196
+ if dir_name not in ('', '.', '..'):
197
+ os.makedirs(dir_name, exist_ok=True)
198
+ self._raw_fitz.save(file_path)
199
+
200
+ def apply(self, proc: Callable, *args, **kwargs):
201
+ """Apply callable method which.
202
+
203
+ Args:
204
+ proc (Callable): invoke proc as follows:
205
+ proc(dataset, *args, **kwargs)
206
+
207
+ Returns:
208
+ Any: return the result generated by proc
209
+ """
210
+ if 'lang' in kwargs and self._lang is not None:
211
+ kwargs['lang'] = self._lang
212
+ return proc(self, *args, **kwargs)
213
+
214
+ def classify(self) -> SupportedPdfParseMethod:
215
+ """classify the dataset
216
+
217
+ Returns:
218
+ SupportedPdfParseMethod: _description_
219
+ """
220
+ return classify(self._data_bits)
221
+
222
+ def clone(self):
223
+ """clone this dataset
224
+ """
225
+ return PymuDocDataset(self._raw_data)
226
+
112
227
 
113
228
  class ImageDataset(Dataset):
114
229
  def __init__(self, bits: bytes):
@@ -118,7 +233,8 @@ class ImageDataset(Dataset):
118
233
  bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
119
234
  """
120
235
  pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
121
- self._records = [Doc(v) for v in fitz.open('pdf', pdf_bytes)]
236
+ self._raw_fitz = fitz.open('pdf', pdf_bytes)
237
+ self._records = [Doc(v) for v in self._raw_fitz]
122
238
  self._raw_data = bits
123
239
  self._data_bits = pdf_bytes
124
240
 
@@ -153,14 +269,50 @@ class ImageDataset(Dataset):
153
269
  """
154
270
  return self._records[page_id]
155
271
 
272
+ def dump_to_file(self, file_path: str):
273
+ """Dump the file
274
+
275
+ Args:
276
+ file_path (str): the file path
277
+ """
278
+ dir_name = os.path.dirname(file_path)
279
+ if dir_name not in ('', '.', '..'):
280
+ os.makedirs(dir_name, exist_ok=True)
281
+ self._raw_fitz.save(file_path)
282
+
283
+ def apply(self, proc: Callable, *args, **kwargs):
284
+ """Apply callable method which.
285
+
286
+ Args:
287
+ proc (Callable): invoke proc as follows:
288
+ proc(dataset, *args, **kwargs)
289
+
290
+ Returns:
291
+ Any: return the result generated by proc
292
+ """
293
+ return proc(self, *args, **kwargs)
294
+
295
+ def classify(self) -> SupportedPdfParseMethod:
296
+ """classify the dataset
297
+
298
+ Returns:
299
+ SupportedPdfParseMethod: _description_
300
+ """
301
+ return SupportedPdfParseMethod.OCR
302
+
303
+ def clone(self):
304
+ """clone this dataset
305
+ """
306
+ return ImageDataset(self._raw_data)
156
307
 
157
308
  class Doc(PageableData):
158
309
  """Initialized with pymudoc object."""
310
+
159
311
  def __init__(self, doc: fitz.Page):
160
312
  self._doc = doc
161
313
 
162
314
  def get_image(self):
163
- """Return the imge info.
315
+ """Return the image info.
164
316
 
165
317
  Returns:
166
318
  dict: {
@@ -192,3 +344,34 @@ class Doc(PageableData):
192
344
  def __getattr__(self, name):
193
345
  if hasattr(self._doc, name):
194
346
  return getattr(self._doc, name)
347
+
348
+ def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
349
+ """draw rectangle.
350
+
351
+ Args:
352
+ rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
353
+ color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
354
+ fill (list[float] | None): fill the board with RGB, None means will not fill with color
355
+ fill_opacity (float): opacity of the fill, range from [0, 1]
356
+ width (float): the width of board
357
+ overlay (bool): fill the color in foreground or background. True means fill in background.
358
+ """
359
+ self._doc.draw_rect(
360
+ rect_coords,
361
+ color=color,
362
+ fill=fill,
363
+ fill_opacity=fill_opacity,
364
+ width=width,
365
+ overlay=overlay,
366
+ )
367
+
368
+ def insert_text(self, coord, content, fontsize, color):
369
+ """insert text.
370
+
371
+ Args:
372
+ coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
373
+ content (str): the text content
374
+ fontsize (int): font size of the text
375
+ color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color!
376
+ """
377
+ self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
@@ -1,12 +1,14 @@
1
1
  import json
2
2
  import os
3
+ import tempfile
4
+ import shutil
3
5
  from pathlib import Path
4
6
 
5
7
  from magic_pdf.config.exceptions import EmptyData, InvalidParams
6
8
  from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
7
9
  MultiBucketS3DataReader)
8
10
  from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
9
-
11
+ from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
10
12
 
11
13
  def read_jsonl(
12
14
  s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
@@ -58,23 +60,68 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
58
60
  list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
59
61
  """
60
62
  if os.path.isdir(path):
61
- reader = FileBasedDataReader(path)
62
- return [
63
- PymuDocDataset(reader.read(doc_path.name))
64
- for doc_path in Path(path).glob('*.pdf')
65
- ]
63
+ reader = FileBasedDataReader()
64
+ ret = []
65
+ for root, _, files in os.walk(path):
66
+ for file in files:
67
+ suffix = file.split('.')
68
+ if suffix[-1] == 'pdf':
69
+ ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
70
+ return ret
66
71
  else:
67
72
  reader = FileBasedDataReader()
68
73
  bits = reader.read(path)
69
74
  return [PymuDocDataset(bits)]
70
75
 
76
+ def read_local_office(path: str) -> list[PymuDocDataset]:
77
+ """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
71
78
 
72
- def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
79
+ Args:
80
+ path (str): ms-office file or directory that contains ms-office files
81
+
82
+ Returns:
83
+ list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset
84
+
85
+ Raises:
86
+ ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
87
+ FileNotFoundError: File not Found
88
+ Exception: Unknown Exception raised
89
+ """
90
+ suffixes = ['.ppt', '.pptx', '.doc', '.docx']
91
+ fns = []
92
+ ret = []
93
+ if os.path.isdir(path):
94
+ for root, _, files in os.walk(path):
95
+ for file in files:
96
+ suffix = Path(file).suffix
97
+ if suffix in suffixes:
98
+ fns.append((os.path.join(root, file)))
99
+ else:
100
+ fns.append(path)
101
+
102
+ reader = FileBasedDataReader()
103
+ temp_dir = tempfile.mkdtemp()
104
+ for fn in fns:
105
+ try:
106
+ convert_file_to_pdf(fn, temp_dir)
107
+ except ConvertToPdfError as e:
108
+ raise e
109
+ except FileNotFoundError as e:
110
+ raise e
111
+ except Exception as e:
112
+ raise e
113
+ fn_path = Path(fn)
114
+ pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
115
+ ret.append(PymuDocDataset(reader.read(pdf_fn)))
116
+ shutil.rmtree(temp_dir)
117
+ return ret
118
+
119
+ def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
73
120
  """Read images from path or directory.
74
121
 
75
122
  Args:
76
123
  path (str): image file path or directory that contains image files
77
- suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
124
+ suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
78
125
 
79
126
  Returns:
80
127
  list[ImageDataset]: each image file will converted to a ImageDataset
@@ -82,12 +129,12 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
82
129
  if os.path.isdir(path):
83
130
  imgs_bits = []
84
131
  s_suffixes = set(suffixes)
85
- reader = FileBasedDataReader(path)
132
+ reader = FileBasedDataReader()
86
133
  for root, _, files in os.walk(path):
87
134
  for file in files:
88
- suffix = file.split('.')
89
- if suffix[-1] in s_suffixes:
90
- imgs_bits.append(reader.read(file))
135
+ suffix = Path(file).suffix
136
+ if suffix in s_suffixes:
137
+ imgs_bits.append(reader.read(os.path.join(root, file)))
91
138
  return [ImageDataset(bits) for bits in imgs_bits]
92
139
  else:
93
140
  reader = FileBasedDataReader()
magic_pdf/data/utils.py CHANGED
@@ -1,6 +1,7 @@
1
1
 
2
2
  import fitz
3
3
  import numpy as np
4
+ from loguru import logger
4
5
 
5
6
  from magic_pdf.utils.annotations import ImportPIL
6
7
 
@@ -30,3 +31,37 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
30
31
  img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
31
32
 
32
33
  return img_dict
34
+
35
+ @ImportPIL
36
+ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
37
+ from PIL import Image
38
+ images = []
39
+ with fitz.open('pdf', pdf_bytes) as doc:
40
+ pdf_page_num = doc.page_count
41
+ end_page_id = (
42
+ end_page_id
43
+ if end_page_id is not None and end_page_id >= 0
44
+ else pdf_page_num - 1
45
+ )
46
+ if end_page_id > pdf_page_num - 1:
47
+ logger.warning('end_page_id is out of range, use images length')
48
+ end_page_id = pdf_page_num - 1
49
+
50
+ for index in range(0, doc.page_count):
51
+ if start_page_id <= index <= end_page_id:
52
+ page = doc[index]
53
+ mat = fitz.Matrix(dpi / 72, dpi / 72)
54
+ pm = page.get_pixmap(matrix=mat, alpha=False)
55
+
56
+ # If the width or height exceeds 4500 after scaling, do not scale further.
57
+ if pm.width > 4500 or pm.height > 4500:
58
+ pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
59
+
60
+ img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
61
+ img = np.array(img)
62
+ img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
63
+ else:
64
+ img_dict = {'img': [], 'width': 0, 'height': 0}
65
+
66
+ images.append(img_dict)
67
+ return images
@@ -7,7 +7,7 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
7
7
  from magic_pdf.libs.commons import join_path
8
8
  from magic_pdf.libs.language import detect_lang
9
9
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
10
- from magic_pdf.para.para_split_v3 import ListLineTag
10
+ from magic_pdf.post_proc.para_split_v3 import ListLineTag
11
11
 
12
12
 
13
13
  def __is_hyphen_at_line_end(line):
@@ -61,7 +61,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
61
61
  if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
62
62
  para_text = merge_para_with_text(para_block)
63
63
  elif para_type == BlockType.Title:
64
- para_text = f'# {merge_para_with_text(para_block)}'
64
+ title_level = get_title_level(para_block)
65
+ para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
65
66
  elif para_type == BlockType.InterlineEquation:
66
67
  para_text = merge_para_with_text(para_block)
67
68
  elif para_type == BlockType.Image:
@@ -125,16 +126,6 @@ def detect_language(text):
125
126
  return 'empty'
126
127
 
127
128
 
128
- # 连写字符拆分
129
- def __replace_ligatures(text: str):
130
- text = re.sub(r'ﬁ', 'fi', text) # 替换 ﬁ 连写符
131
- text = re.sub(r'ﬂ', 'fl', text) # 替换 ﬂ 连写符
132
- text = re.sub(r'ﬀ', 'ff', text) # 替换 ﬀ 连写符
133
- text = re.sub(r'ﬃ', 'ffi', text) # 替换 ﬃ 连写符
134
- text = re.sub(r'ﬄ', 'ffl', text) # 替换 ﬄ 连写符
135
- return text
136
-
137
-
138
129
  def merge_para_with_text(para_block):
139
130
  block_text = ''
140
131
  for line in para_block['lines']:
@@ -165,8 +156,8 @@ def merge_para_with_text(para_block):
165
156
  if content:
166
157
  langs = ['zh', 'ja', 'ko']
167
158
  # logger.info(f'block_lang: {block_lang}, content: {content}')
168
- if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
169
- if j == len(line['spans']) - 1:
159
+ if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
160
+ if j == len(line['spans']) - 1 and span_type not in [ContentType.InlineEquation]:
170
161
  para_text += content
171
162
  else:
172
163
  para_text += f'{content} '
@@ -196,10 +187,11 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
196
187
  'text': merge_para_with_text(para_block),
197
188
  }
198
189
  elif para_type == BlockType.Title:
190
+ title_level = get_title_level(para_block)
199
191
  para_content = {
200
192
  'type': 'text',
201
193
  'text': merge_para_with_text(para_block),
202
- 'text_level': 1,
194
+ 'text_level': title_level,
203
195
  }
204
196
  elif para_type == BlockType.InterlineEquation:
205
197
  para_content = {
@@ -299,3 +291,12 @@ def union_make(pdf_info_dict: list,
299
291
  return '\n\n'.join(output_content)
300
292
  elif make_mode == MakeMode.STANDARD_FORMAT:
301
293
  return output_content
294
+
295
+
296
+ def get_title_level(block):
297
+ title_level = block.get('level', 1)
298
+ if title_level > 4:
299
+ title_level = 4
300
+ elif title_level < 1:
301
+ title_level = 1
302
+ return title_level
@@ -0,0 +1,32 @@
1
+
2
+ from magic_pdf.config.drop_reason import DropReason
3
+ from magic_pdf.config.enums import SupportedPdfParseMethod
4
+ from magic_pdf.filter.pdf_classify_by_type import classify as do_classify
5
+ from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
6
+
7
+
8
+ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
9
+ """根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
10
+ pdf_meta = pdf_meta_scan(pdf_bytes)
11
+ if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
12
+ raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
13
+ else:
14
+ is_encrypted = pdf_meta['is_encrypted']
15
+ is_needs_password = pdf_meta['is_needs_password']
16
+ if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
17
+ raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
18
+ else:
19
+ is_text_pdf, results = do_classify(
20
+ pdf_meta['total_page'],
21
+ pdf_meta['page_width_pts'],
22
+ pdf_meta['page_height_pts'],
23
+ pdf_meta['image_info_per_page'],
24
+ pdf_meta['text_len_per_page'],
25
+ pdf_meta['imgs_per_page'],
26
+ pdf_meta['text_layout_per_page'],
27
+ pdf_meta['invalid_chars'],
28
+ )
29
+ if is_text_pdf:
30
+ return SupportedPdfParseMethod.TXT
31
+ else:
32
+ return SupportedPdfParseMethod.OCR
@@ -8,7 +8,7 @@ from loguru import logger
8
8
  from magic_pdf.config.drop_reason import DropReason
9
9
  from magic_pdf.libs.commons import get_top_percent_list, mymax
10
10
  from magic_pdf.libs.language import detect_lang
11
- from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
11
+ from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf, detect_invalid_chars
12
12
 
13
13
  scan_max_page = 50
14
14
  junk_limit_min = 10
@@ -323,7 +323,8 @@ def get_language(doc: fitz.Document):
323
323
 
324
324
  def check_invalid_chars(pdf_bytes):
325
325
  """乱码检测."""
326
- return detect_invalid_chars_by_pymupdf(pdf_bytes)
326
+ # return detect_invalid_chars_by_pymupdf(pdf_bytes)
327
+ return detect_invalid_chars(pdf_bytes)
327
328
 
328
329
 
329
330
  def pdf_meta_scan(pdf_bytes: bytes):
@@ -3,8 +3,15 @@ import torch
3
3
  import gc
4
4
 
5
5
 
6
- def clean_memory():
7
- if torch.cuda.is_available():
8
- torch.cuda.empty_cache()
9
- torch.cuda.ipc_collect()
6
+ def clean_memory(device='cuda'):
7
+ if device == 'cuda':
8
+ if torch.cuda.is_available():
9
+ torch.cuda.empty_cache()
10
+ torch.cuda.ipc_collect()
11
+ elif str(device).startswith("npu"):
12
+ import torch_npu
13
+ if torch_npu.npu.is_available():
14
+ torch_npu.npu.empty_cache()
15
+ elif str(device).startswith("mps"):
16
+ torch.mps.empty_cache()
10
17
  gc.collect()
@@ -116,6 +116,15 @@ def get_formula_config():
116
116
  else:
117
117
  return formula_config
118
118
 
119
+ def get_llm_aided_config():
120
+ config = read_config()
121
+ llm_aided_config = config.get('llm-aided-config')
122
+ if llm_aided_config is None:
123
+ logger.warning(f"'llm-aided-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
124
+ return None
125
+ else:
126
+ return llm_aided_config
127
+
119
128
 
120
129
  if __name__ == '__main__':
121
130
  ak, sk, endpoint = get_s3_config('llm-raw')