tretool 0.2.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tretool/transform/pdf.py CHANGED
@@ -1,22 +1,70 @@
1
1
  import os
2
+ import sys
3
+ import logging
4
+ import time
5
+ import json
2
6
  from abc import ABC, abstractmethod
3
7
  from pathlib import Path
4
- from typing import Union, List, Optional
5
- from pdfminer.high_level import extract_text
6
- from pdf2image import convert_from_path
7
- from docx import Document
8
- import pandas as pd
9
- from tabula import read_pdf
10
- from PIL import Image
11
- import json
8
+ from typing import Union, List, Optional, Dict, Callable
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ from functools import wraps, lru_cache
11
+ import argparse
12
+ from datetime import datetime
12
13
 
14
+ # 第三方库导入
15
+ try:
16
+ from pdfminer.high_level import extract_text
17
+ from pdf2image import convert_from_path
18
+ from docx import Document
19
+ import pandas as pd
20
+ from tabula import read_pdf
21
+ from PIL import Image
22
+ from PyPDF2 import PdfReader
23
+ except ImportError as e:
24
+ print(f"缺少依赖库: {e}")
25
+ sys.exit(1)
13
26
 
27
+ # 日志配置
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
31
+ handlers=[
32
+ logging.StreamHandler(),
33
+ logging.FileHandler('pdf_converter.log')
34
+ ]
35
+ )
36
+
37
+ # 自定义异常
14
38
class ConversionError(Exception):
    """Base class for the conversion errors defined in this module."""
41
+
42
class PDFPermissionError(ConversionError):
    """Conversion error signalling a PDF permission problem."""
45
+
46
class PDFCorruptedError(ConversionError):
    """Conversion error signalling a corrupted PDF file."""
17
49
 
50
class UnsupportedFormatError(ConversionError):
    """Conversion error signalling an unsupported target format."""
18
53
 
19
54
  class PDFConverter(ABC):
55
+ """
56
+ PDF文件转换器抽象基类
57
+
58
+ 提供PDF文件转换的基础功能,包括:
59
+ - 文件路径验证
60
+ - 输出路径处理
61
+ - 基本错误处理
62
+
63
+ 子类应实现:
64
+ - convert() 方法: 执行实际转换逻辑
65
+ - supported_formats() 类方法: 返回支持的格式列表
66
+ """
67
+
20
68
  def __init__(self, pdf_path: Union[str, Path]):
21
69
  """
22
70
  初始化PDF转换器
@@ -25,19 +73,28 @@ class PDFConverter(ABC):
25
73
  pdf_path: PDF文件路径
26
74
  """
27
75
  self.pdf_path = Path(pdf_path)
76
+ self.logger = logging.getLogger(self.__class__.__name__)
77
+ self._progress_callback = None
78
+
79
+ # 验证文件
28
80
  if not self.pdf_path.exists():
29
81
  raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
30
82
  if self.pdf_path.suffix.lower() != '.pdf':
31
83
  raise ValueError("输入文件必须是PDF格式")
84
+
85
+ self.logger.info(f"初始化转换器,处理文件: {pdf_path}")
32
86
 
33
87
@abstractmethod
def convert(self, output_path: Union[str, Path], **kwargs) -> Union[str, List[str]]:
    """
    Convert the PDF into the target format.

    Abstract hook: this base implementation performs no work and returns
    ``None``.

    Args:
        output_path: Output file path.
        **kwargs: Conversion options.

    Returns:
        The path of the converted file, or a list of paths.
    """
    return None
43
100
 
@@ -47,6 +104,31 @@ class PDFConverter(ABC):
47
104
  """返回支持的格式列表"""
48
105
  return []
49
106
 
107
def set_progress_callback(self, callback: Callable[[int, int], None]):
    """Register the callable that receives progress updates.

    Args:
        callback: Callable taking ``(current, total)``. It is stored on the
            instance and replaces any previously registered callback.
    """
    self._progress_callback = callback
110
+
111
+ def _update_progress(self, current: int, total: int):
112
+ """更新进度"""
113
+ if self._progress_callback:
114
+ self._progress_callback(current, total)
115
+
116
def get_metadata(self) -> Dict[str, Union[str, int]]:
    """Read basic document metadata from the PDF at ``self.pdf_path``.

    This is a best-effort helper: any failure while opening or parsing the
    file is logged as a warning and an empty dict is returned instead of
    raising.

    Returns:
        A dict with the keys ``'title'``, ``'author'``, ``'created'`` and
        ``'modified'`` (taken from the document information entries, ``''``
        when an entry is absent) plus ``'pages'`` (the page count as an
        int). An empty dict when the metadata could not be read.
    """
    try:
        with open(self.pdf_path, 'rb') as f:
            reader = PdfReader(f)
            # PyPDF2 exposes ``metadata`` as None when the file carries no
            # document-information dictionary. Fall back to an empty
            # mapping so the page count is still reported instead of the
            # whole call degrading to {} through an AttributeError.
            info = reader.metadata or {}
            return {
                'title': info.get('/Title', ''),
                'author': info.get('/Author', ''),
                'pages': len(reader.pages),
                'created': info.get('/CreationDate', ''),
                'modified': info.get('/ModDate', '')
            }
    except Exception as e:
        # Deliberately broad: metadata is optional information and must
        # never abort a conversion.
        self.logger.warning(f"获取元数据失败: {str(e)}")
        return {}
131
+
50
132
  def _prepare_output_path(self, output_path: Union[str, Path],
51
133
  default_extension: str) -> Path:
52
134
  """
@@ -73,9 +155,27 @@ class PDFConverter(ABC):
73
155
 
74
156
  return output_path
75
157
 
158
+ # 重试装饰器
159
def retry(max_attempts=3, delay=1, exceptions=(Exception,)):
    """Build a decorator that re-invokes a function when it raises.

    Args:
        max_attempts: Total number of calls to make before giving up
            (must be at least 1).
        delay: Seconds to sleep between two consecutive attempts.
        exceptions: Exception type(s) that trigger a retry; anything else
            propagates immediately.

    Returns:
        A decorator. The decorated function returns the first successful
        result, or re-raises the exception of the final failed attempt.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
    """
    if max_attempts < 1:
        # With zero attempts the loop below would never call the function
        # and the wrapper would silently do nothing useful; reject the
        # configuration up front instead.
        raise ValueError("max_attempts必须大于等于1")

    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return f(*args, **kwargs)
                except exceptions:
                    if attempt == max_attempts:
                        # Out of attempts: propagate the last failure with
                        # its original traceback intact.
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator
76
174
 
175
+ # 具体转换器实现
77
176
  class PDFToDocxConverter(PDFConverter):
78
- def convert(self, output_path: Union[str, Path], **kwargs):
177
+ @retry(max_attempts=3, delay=0.5)
178
+ def convert(self, output_path: Union[str, Path], **kwargs) -> str:
79
179
  """
80
180
  将PDF转换为Word文档(.docx)
81
181
 
@@ -85,6 +185,9 @@ class PDFToDocxConverter(PDFConverter):
85
185
  - start_page: 开始页(从1开始)
86
186
  - end_page: 结束页
87
187
  - preserve_formatting: 是否保留格式(True/False)
188
+
189
+ 返回:
190
+ 转换后的文件路径
88
191
  """
89
192
  try:
90
193
  output_path = self._prepare_output_path(output_path, 'docx')
@@ -93,27 +196,35 @@ class PDFToDocxConverter(PDFConverter):
93
196
  end_page = kwargs.get('end_page', None)
94
197
  preserve = kwargs.get('preserve_formatting', False)
95
198
 
96
- doc = Document()
97
- text = extract_text(
98
- str(self.pdf_path),
99
- page_numbers=range(start_page-1, end_page) if end_page else None
100
- )
101
-
102
- # 简单的段落处理
103
- for paragraph in text.split('\n\n'):
104
- if paragraph.strip():
105
- para = doc.add_paragraph()
106
- if preserve:
107
- # 这里可以添加更复杂的格式保留逻辑
108
- runs = paragraph.split('\n')
109
- for run in runs:
110
- if run.strip():
111
- para.add_run(run.strip() + ' ')
112
- else:
113
- para.add_run(paragraph.strip())
114
-
115
- doc.save(output_path)
116
- return str(output_path)
199
+ # 尝试使用pdf2docx库(如果安装)
200
+ try:
201
+ from pdf2docx import Converter
202
+ cv = Converter(str(self.pdf_path))
203
+ cv.convert(str(output_path), start=start_page, end=end_page)
204
+ cv.close()
205
+ return str(output_path)
206
+ except ImportError:
207
+ self.logger.warning("pdf2docx未安装,使用基本文本转换")
208
+ # 回退到基本实现
209
+ doc = Document()
210
+ text = extract_text(
211
+ str(self.pdf_path),
212
+ page_numbers=range(start_page-1, end_page) if end_page else None
213
+ )
214
+
215
+ for paragraph in text.split('\n\n'):
216
+ if paragraph.strip():
217
+ para = doc.add_paragraph()
218
+ if preserve:
219
+ runs = paragraph.split('\n')
220
+ for run in runs:
221
+ if run.strip():
222
+ para.add_run(run.strip() + ' ')
223
+ else:
224
+ para.add_run(paragraph.strip())
225
+
226
+ doc.save(output_path)
227
+ return str(output_path)
117
228
  except Exception as e:
118
229
  raise ConversionError(f"转换为DOCX失败: {str(e)}")
119
230
 
@@ -121,9 +232,9 @@ class PDFToDocxConverter(PDFConverter):
121
232
  def supported_formats(cls) -> List[str]:
122
233
  return ['docx']
123
234
 
124
-
125
235
  class PDFToImageConverter(PDFConverter):
126
- def convert(self, output_path: Union[str, Path], **kwargs):
236
+ @retry(max_attempts=3, delay=0.5)
237
+ def convert(self, output_path: Union[str, Path], **kwargs) -> Union[str, List[str]]:
127
238
  """
128
239
  将PDF转换为图像
129
240
 
@@ -134,6 +245,9 @@ class PDFToImageConverter(PDFConverter):
134
245
  - fmt: 图像格式(png/jpg/tiff)
135
246
  - merge: 是否合并所有页为一张长图(True/False)
136
247
  - quality: 图像质量(1-100)
248
+
249
+ 返回:
250
+ 单个文件路径或多个文件路径列表
137
251
  """
138
252
  try:
139
253
  dpi = kwargs.get('dpi', 200)
@@ -150,6 +264,8 @@ class PDFToImageConverter(PDFConverter):
150
264
  fmt=fmt if fmt != 'jpg' else 'jpeg'
151
265
  )
152
266
 
267
+ self._update_progress(0, len(images))
268
+
153
269
  if merge:
154
270
  # 合并所有页为一张长图
155
271
  output_path = self._prepare_output_path(output_path, fmt)
@@ -158,9 +274,10 @@ class PDFToImageConverter(PDFConverter):
158
274
 
159
275
  merged_image = Image.new('RGB', (max_width, total_height))
160
276
  y_offset = 0
161
- for img in images:
277
+ for i, img in enumerate(images):
162
278
  merged_image.paste(img, (0, y_offset))
163
279
  y_offset += img.height
280
+ self._update_progress(i+1, len(images))
164
281
 
165
282
  merged_image.save(output_path, quality=quality)
166
283
  return str(output_path)
@@ -170,25 +287,43 @@ class PDFToImageConverter(PDFConverter):
170
287
  if len(images) == 1:
171
288
  output_path = self._prepare_output_path(output_path, fmt)
172
289
  images[0].save(output_path, quality=quality)
290
+ self._update_progress(1, 1)
173
291
  return str(output_path)
174
292
  else:
175
293
  output_path.mkdir(parents=True, exist_ok=True)
176
294
  output_files = []
177
- for i, image in enumerate(images):
178
- page_path = output_path / f"page_{i+1}.{fmt}"
179
- image.save(page_path, quality=quality)
180
- output_files.append(str(page_path))
295
+
296
+ # 多页并行处理
297
+ if len(images) > 5:
298
+ with ThreadPoolExecutor() as executor:
299
+ results = list(executor.map(
300
+ lambda x: self._save_image(x[1], output_path/f"page_{x[0]+1}.{fmt}", quality),
301
+ enumerate(images)
302
+ ))
303
+ output_files.extend(results)
304
+ else:
305
+ for i, image in enumerate(images):
306
+ page_path = output_path / f"page_{i+1}.{fmt}"
307
+ image.save(page_path, quality=quality)
308
+ output_files.append(str(page_path))
309
+ self._update_progress(i+1, len(images))
310
+
181
311
  return output_files
182
312
  except Exception as e:
183
313
  raise ConversionError(f"转换为图像失败: {str(e)}")
184
314
 
315
def _save_image(self, image: Image.Image, path: Path, quality: int) -> str:
    """Write a single image to ``path`` and return that path as a string.

    Args:
        image: Image object to save.
        path: Destination file path.
        quality: Value forwarded to ``image.save`` as ``quality``.

    Returns:
        ``str(path)``.
    """
    image.save(path, quality=quality)
    saved_to = str(path)
    return saved_to
319
+
185
320
  @classmethod
186
321
  def supported_formats(cls) -> List[str]:
187
322
  return ['png', 'jpg', 'jpeg', 'tiff']
188
323
 
189
-
190
324
  class PDFToTextConverter(PDFConverter):
191
- def convert(self, output_path: Union[str, Path], **kwargs):
325
+ @retry(max_attempts=3, delay=0.5)
326
+ def convert(self, output_path: Union[str, Path], **kwargs) -> str:
192
327
  """
193
328
  将PDF转换为纯文本
194
329
 
@@ -198,6 +333,9 @@ class PDFToTextConverter(PDFConverter):
198
333
  - start_page: 开始页(从1开始)
199
334
  - end_page: 结束页
200
335
  - encoding: 文本编码(默认utf-8)
336
+
337
+ 返回:
338
+ 转换后的文件路径
201
339
  """
202
340
  try:
203
341
  output_path = self._prepare_output_path(output_path, 'txt')
@@ -222,9 +360,9 @@ class PDFToTextConverter(PDFConverter):
222
360
  def supported_formats(cls) -> List[str]:
223
361
  return ['txt']
224
362
 
225
-
226
363
  class PDFToCSVConverter(PDFConverter):
227
- def convert(self, output_path: Union[str, Path], **kwargs):
364
+ @retry(max_attempts=3, delay=0.5)
365
+ def convert(self, output_path: Union[str, Path], **kwargs) -> Union[str, List[str]]:
228
366
  """
229
367
  提取PDF中的表格为CSV
230
368
 
@@ -234,6 +372,9 @@ class PDFToCSVConverter(PDFConverter):
234
372
  - pages: 要提取的页码('all'或数字或列表)
235
373
  - multiple_tables: 如何处理多个表格(separate/merge)
236
374
  - encoding: CSV文件编码(默认utf-8)
375
+
376
+ 返回:
377
+ 单个CSV文件路径或多个CSV文件路径列表
237
378
  """
238
379
  try:
239
380
  pages = kwargs.get('pages', 'all')
@@ -245,11 +386,14 @@ class PDFToCSVConverter(PDFConverter):
245
386
  if not dfs:
246
387
  raise ConversionError("未找到表格数据")
247
388
 
389
+ self._update_progress(0, len(dfs))
390
+
248
391
  if multiple_tables == 'merge':
249
392
  # 合并所有表格
250
393
  output_path = self._prepare_output_path(output_path, 'csv')
251
394
  merged_df = pd.concat(dfs, ignore_index=True)
252
395
  merged_df.to_csv(output_path, index=False, encoding=encoding)
396
+ self._update_progress(1, 1)
253
397
  return str(output_path)
254
398
  else:
255
399
  # 每个表格保存为单独CSV
@@ -257,6 +401,7 @@ class PDFToCSVConverter(PDFConverter):
257
401
  if len(dfs) == 1:
258
402
  output_path = self._prepare_output_path(output_path, 'csv')
259
403
  dfs[0].to_csv(output_path, index=False, encoding=encoding)
404
+ self._update_progress(1, 1)
260
405
  return str(output_path)
261
406
  else:
262
407
  output_path.mkdir(parents=True, exist_ok=True)
@@ -265,6 +410,7 @@ class PDFToCSVConverter(PDFConverter):
265
410
  table_path = output_path / f"table_{i+1}.csv"
266
411
  df.to_csv(table_path, index=False, encoding=encoding)
267
412
  output_files.append(str(table_path))
413
+ self._update_progress(i+1, len(dfs))
268
414
  return output_files
269
415
  except Exception as e:
270
416
  raise ConversionError(f"提取表格失败: {str(e)}")
@@ -273,9 +419,9 @@ class PDFToCSVConverter(PDFConverter):
273
419
  def supported_formats(cls) -> List[str]:
274
420
  return ['csv']
275
421
 
276
-
277
422
  class PDFToHTMLConverter(PDFConverter):
278
- def convert(self, output_path: Union[str, Path], **kwargs):
423
+ @retry(max_attempts=3, delay=0.5)
424
+ def convert(self, output_path: Union[str, Path], **kwargs) -> str:
279
425
  """
280
426
  将PDF转换为HTML
281
427
 
@@ -284,6 +430,9 @@ class PDFToHTMLConverter(PDFConverter):
284
430
  **kwargs:
285
431
  - css: 自定义CSS样式
286
432
  - images: 是否嵌入图像(True/False)
433
+
434
+ 返回:
435
+ 转换后的HTML文件路径
287
436
  """
288
437
  try:
289
438
  output_path = self._prepare_output_path(output_path, 'html')
@@ -298,7 +447,7 @@ class PDFToHTMLConverter(PDFConverter):
298
447
  <meta charset="UTF-8">
299
448
  <title>{self.pdf_path.stem}</title>
300
449
  <style>
301
- {kwargs.get('css', 'body { font-family: Arial; margin: 20px; }')}
450
+ {kwargs.get('css', 'body {{ font-family: Arial; margin: 20px; }}')}
302
451
  </style>
303
452
  </head>
304
453
  <body>
@@ -320,9 +469,21 @@ class PDFToHTMLConverter(PDFConverter):
320
469
  def supported_formats(cls) -> List[str]:
321
470
  return ['html']
322
471
 
323
-
472
+ # 工厂类
324
473
  class PDFConverterFactory:
474
+ _format_map = {
475
+ 'docx': (PDFToDocxConverter, 'Microsoft Word文档'),
476
+ 'txt': (PDFToTextConverter, '纯文本文件'),
477
+ 'png': (PDFToImageConverter, 'PNG图像'),
478
+ 'jpg': (PDFToImageConverter, 'JPEG图像'),
479
+ 'jpeg': (PDFToImageConverter, 'JPEG图像'),
480
+ 'tiff': (PDFToImageConverter, 'TIFF图像'),
481
+ 'csv': (PDFToCSVConverter, 'CSV表格数据'),
482
+ 'html': (PDFToHTMLConverter, 'HTML网页'),
483
+ }
484
+
325
485
@staticmethod
def get_converter(target_format: str, pdf_path: str) -> PDFConverter:
    """
    Create a converter for the requested target format.

    A new converter is built on every call. Instances are intentionally not
    cached: converters carry per-use state (such as the progress callback),
    and constructing one is what validates the input file, so handing out a
    cached instance would share state between callers and skip that
    validation.

    Args:
        target_format: Target format name; matched case-insensitively
            against the keys of ``_format_map``.
        pdf_path: Path of the PDF file, passed to the converter constructor.

    Returns:
        A PDFConverter instance for the requested format.

    Raises:
        UnsupportedFormatError: If the format is not in ``_format_map``.
    """
    target_format = target_format.lower()
    if target_format not in PDFConverterFactory._format_map:
        raise UnsupportedFormatError(f"不支持的格式: {target_format}")

    # Each map entry is a (converter class, description) pair.
    converter_cls = PDFConverterFactory._format_map[target_format][0]
    return converter_cls(pdf_path)
353
506
 
354
507
@staticmethod
def get_supported_formats() -> Dict[str, str]:
    """Return every supported format name mapped to its description."""
    descriptions = {}
    for fmt, entry in PDFConverterFactory._format_map.items():
        # Entries are (converter class, description) pairs; only the
        # description is exposed here.
        _, desc = entry
        descriptions[fmt] = desc
    return descriptions
367
511
 
512
+ # 命令行接口
513
def parse_args(argv: Optional[List[str]] = None):
    """Parse the command-line arguments of the PDF conversion tool.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour of a plain
            ``parse_args()`` call; passing a list allows programmatic use.

    Returns:
        The ``argparse.Namespace`` with ``input``, ``output``, ``format``,
        ``dpi``, ``start_page``, ``end_page``, ``preserve_formatting``,
        ``merge`` and ``quality``.
    """
    parser = argparse.ArgumentParser(
        description='PDF转换工具 - 支持多种格式转换',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('input', help='输入PDF文件路径')
    parser.add_argument('output', help='输出文件路径或目录')
    parser.add_argument('-f', '--format', required=True,
                        # Materialise the keys so argparse works with a
                        # plain list rather than a live dict view.
                        choices=list(PDFConverterFactory.get_supported_formats()),
                        help='目标格式')
    parser.add_argument('--dpi', type=int, default=200,
                        help='图像DPI(仅图像转换)')
    parser.add_argument('--start-page', type=int, default=1,
                        help='起始页码(从1开始)')
    parser.add_argument('--end-page', type=int,
                        help='结束页码(默认为最后一页)')
    parser.add_argument('--preserve-formatting', action='store_true',
                        help='保留格式(仅DOCX转换)')
    parser.add_argument('--merge', action='store_true',
                        help='合并多页为单个文件(图像/表格)')
    parser.add_argument('--quality', type=int, default=90,
                        help='输出质量(1-100, 仅图像)')
    return parser.parse_args(argv)
368
536
 
369
- # 使用示例
370
- if __name__ == "__main__":
537
def main():
    """Command-line entry point: convert one PDF as described by the CLI args.

    Parses the arguments, obtains the converter for the requested format,
    installs a console progress callback, runs the conversion and prints the
    resulting path(s). On any failure the error is printed to stderr and the
    process exits with status 1.
    """
    args = parse_args()

    try:
        converter = PDFConverterFactory.get_converter(args.format, args.input)

        def progress_callback(current, total):
            # Converters announce ``total`` up front (e.g. the number of
            # rendered pages), which can be 0; guard the division so the
            # progress display itself can never raise ZeroDivisionError.
            ratio = current / total if total else 0.0
            print(f"\r进度: {current}/{total} ({ratio:.1%})", end='', flush=True)
        converter.set_progress_callback(progress_callback)

        print(f"开始转换: {args.input} -> {args.output} ({args.format})")
        # Every option is forwarded; each converter picks the ones it knows.
        result = converter.convert(
            args.output,
            dpi=args.dpi,
            start_page=args.start_page,
            end_page=args.end_page,
            preserve_formatting=args.preserve_formatting,
            merge=args.merge,
            quality=args.quality
        )

        print("\n转换成功!")
        if isinstance(result, list):
            print(f"生成文件 {len(result)} 个:")
            for file in result:
                print(f"  - {file}")
        else:
            print(f"输出文件: {result}")

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"\n转换失败: {e}", file=sys.stderr)
        sys.exit(1)
572
+
573
+ if __name__ == "__main__":
574
+ main()