tretool 0.2.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tretool/__init__.py +39 -12
- tretool/config.py +406 -170
- tretool/decoratorlib.py +423 -0
- tretool/encoding.py +404 -75
- tretool/httplib.py +730 -0
- tretool/jsonlib.py +619 -151
- tretool/logger.py +712 -0
- tretool/mathlib.py +0 -33
- tretool/path.py +19 -0
- tretool/platformlib.py +469 -314
- tretool/plugin.py +437 -237
- tretool/smartCache.py +569 -0
- tretool/tasklib.py +730 -0
- tretool/transform/docx.py +544 -0
- tretool/transform/pdf.py +273 -95
- tretool/ziplib.py +664 -0
- {tretool-0.2.1.dist-info → tretool-1.0.0.dist-info}/METADATA +11 -5
- tretool-1.0.0.dist-info/RECORD +24 -0
- tretool/markfunc.py +0 -152
- tretool/memorizeTools.py +0 -24
- tretool/writeLog.py +0 -69
- tretool-0.2.1.dist-info/RECORD +0 -20
- {tretool-0.2.1.dist-info → tretool-1.0.0.dist-info}/WHEEL +0 -0
- {tretool-0.2.1.dist-info → tretool-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {tretool-0.2.1.dist-info → tretool-1.0.0.dist-info}/top_level.txt +0 -0
tretool/transform/pdf.py
CHANGED
@@ -1,22 +1,70 @@
|
|
1
1
|
import os
|
2
|
+
import sys
|
3
|
+
import logging
|
4
|
+
import time
|
5
|
+
import json
|
2
6
|
from abc import ABC, abstractmethod
|
3
7
|
from pathlib import Path
|
4
|
-
from typing import Union, List, Optional
|
5
|
-
from
|
6
|
-
from
|
7
|
-
|
8
|
-
|
9
|
-
from tabula import read_pdf
|
10
|
-
from PIL import Image
|
11
|
-
import json
|
8
|
+
from typing import Union, List, Optional, Dict, Callable
|
9
|
+
from concurrent.futures import ThreadPoolExecutor
|
10
|
+
from functools import wraps, lru_cache
|
11
|
+
import argparse
|
12
|
+
from datetime import datetime
|
12
13
|
|
14
|
+
# 第三方库导入
|
15
|
+
try:
|
16
|
+
from pdfminer.high_level import extract_text
|
17
|
+
from pdf2image import convert_from_path
|
18
|
+
from docx import Document
|
19
|
+
import pandas as pd
|
20
|
+
from tabula import read_pdf
|
21
|
+
from PIL import Image
|
22
|
+
from PyPDF2 import PdfReader
|
23
|
+
except ImportError as e:
|
24
|
+
print(f"缺少依赖库: {e}")
|
25
|
+
sys.exit(1)
|
13
26
|
|
27
|
+
# 日志配置
|
28
|
+
logging.basicConfig(
|
29
|
+
level=logging.INFO,
|
30
|
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
31
|
+
handlers=[
|
32
|
+
logging.StreamHandler(),
|
33
|
+
logging.FileHandler('pdf_converter.log')
|
34
|
+
]
|
35
|
+
)
|
36
|
+
|
37
|
+
# 自定义异常
|
14
38
|
class ConversionError(Exception):
    """Base exception for conversion failures."""


class PDFPermissionError(ConversionError):
    """PDF permission error."""


class PDFCorruptedError(ConversionError):
    """The PDF file is corrupted."""


class UnsupportedFormatError(ConversionError):
    """The requested format is not supported."""
|
18
53
|
|
19
54
|
class PDFConverter(ABC):
|
55
|
+
"""
|
56
|
+
PDF文件转换器抽象基类
|
57
|
+
|
58
|
+
提供PDF文件转换的基础功能,包括:
|
59
|
+
- 文件路径验证
|
60
|
+
- 输出路径处理
|
61
|
+
- 基本错误处理
|
62
|
+
|
63
|
+
子类应实现:
|
64
|
+
- convert() 方法: 执行实际转换逻辑
|
65
|
+
- supported_formats() 类方法: 返回支持的格式列表
|
66
|
+
"""
|
67
|
+
|
20
68
|
def __init__(self, pdf_path: Union[str, Path]):
|
21
69
|
"""
|
22
70
|
初始化PDF转换器
|
@@ -25,19 +73,28 @@ class PDFConverter(ABC):
|
|
25
73
|
pdf_path: PDF文件路径
|
26
74
|
"""
|
27
75
|
self.pdf_path = Path(pdf_path)
|
76
|
+
self.logger = logging.getLogger(self.__class__.__name__)
|
77
|
+
self._progress_callback = None
|
78
|
+
|
79
|
+
# 验证文件
|
28
80
|
if not self.pdf_path.exists():
|
29
81
|
raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
|
30
82
|
if self.pdf_path.suffix.lower() != '.pdf':
|
31
83
|
raise ValueError("输入文件必须是PDF格式")
|
84
|
+
|
85
|
+
self.logger.info(f"初始化转换器,处理文件: {pdf_path}")
|
32
86
|
|
33
87
|
@abstractmethod
|
34
|
-
def convert(self, output_path: Union[str, Path], **kwargs):
|
88
|
+
def convert(self, output_path: Union[str, Path], **kwargs) -> Union[str, List[str]]:
|
35
89
|
"""
|
36
90
|
将PDF转换为目标格式
|
37
91
|
|
38
92
|
参数:
|
39
93
|
output_path: 输出文件路径
|
40
94
|
**kwargs: 转换选项
|
95
|
+
|
96
|
+
返回:
|
97
|
+
转换后的文件路径或路径列表
|
41
98
|
"""
|
42
99
|
pass
|
43
100
|
|
@@ -47,6 +104,31 @@ class PDFConverter(ABC):
|
|
47
104
|
"""返回支持的格式列表"""
|
48
105
|
return []
|
49
106
|
|
107
|
+
def set_progress_callback(self, callback: Callable[[int, int], None]) -> None:
    """Register the function that receives progress updates.

    Args:
        callback: Callable taking ``(current, total)``. It is stored on the
            instance as ``_progress_callback``.
    """
    self._progress_callback = callback
|
110
|
+
|
111
|
+
def _update_progress(self, current: int, total: int):
|
112
|
+
"""更新进度"""
|
113
|
+
if self._progress_callback:
|
114
|
+
self._progress_callback(current, total)
|
115
|
+
|
116
|
+
def get_metadata(self) -> Dict[str, Union[str, int]]:
    """Read basic document metadata from the PDF.

    Returns:
        A dict with the keys ``title``, ``author``, ``pages``, ``created``
        and ``modified``. ``pages`` is the page count (an int); the other
        values come from the document information dictionary and default
        to ``''`` when absent. An empty dict is returned if the file cannot
        be read at all (the failure is logged as a warning).
    """
    try:
        with open(self.pdf_path, 'rb') as f:
            reader = PdfReader(f)
            # ``metadata`` is None when the PDF has no information
            # dictionary; fall back to an empty mapping so the page count
            # is still reported instead of discarding everything.
            info = reader.metadata or {}
            return {
                'title': info.get('/Title', ''),
                'author': info.get('/Author', ''),
                'pages': len(reader.pages),
                'created': info.get('/CreationDate', ''),
                'modified': info.get('/ModDate', '')
            }
    except Exception as e:
        # Best-effort: metadata is optional, so log and return nothing
        # rather than aborting the caller.
        self.logger.warning(f"获取元数据失败: {str(e)}")
        return {}
|
131
|
+
|
50
132
|
def _prepare_output_path(self, output_path: Union[str, Path],
|
51
133
|
default_extension: str) -> Path:
|
52
134
|
"""
|
@@ -73,9 +155,27 @@ class PDFConverter(ABC):
|
|
73
155
|
|
74
156
|
return output_path
|
75
157
|
|
158
|
+
# 重试装饰器
|
159
|
+
def retry(max_attempts=3, delay=1, exceptions=(Exception,)):
    """Build a decorator that retries a function when it raises.

    Args:
        max_attempts: Total number of calls to make (must be >= 1).
        delay: Seconds to sleep between a failed attempt and the next one.
        exceptions: Exception type(s) that trigger a retry; anything else
            propagates immediately.

    Returns:
        A decorator. The wrapped function returns the first successful
        result, or re-raises the exception from the last attempt.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1. Previously this
            surfaced only at call time as ``raise None`` (a TypeError),
            without the wrapped function ever being called.
    """
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts!r}")

    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return f(*args, **kwargs)
                except exceptions:
                    if attempt == max_attempts:
                        # Out of attempts: re-raise with the original
                        # traceback intact.
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator
|
76
174
|
|
175
|
+
# 具体转换器实现
|
77
176
|
class PDFToDocxConverter(PDFConverter):
|
78
|
-
|
177
|
+
@retry(max_attempts=3, delay=0.5)
|
178
|
+
def convert(self, output_path: Union[str, Path], **kwargs) -> str:
|
79
179
|
"""
|
80
180
|
将PDF转换为Word文档(.docx)
|
81
181
|
|
@@ -85,6 +185,9 @@ class PDFToDocxConverter(PDFConverter):
|
|
85
185
|
- start_page: 开始页(从1开始)
|
86
186
|
- end_page: 结束页
|
87
187
|
- preserve_formatting: 是否保留格式(True/False)
|
188
|
+
|
189
|
+
返回:
|
190
|
+
转换后的文件路径
|
88
191
|
"""
|
89
192
|
try:
|
90
193
|
output_path = self._prepare_output_path(output_path, 'docx')
|
@@ -93,27 +196,35 @@ class PDFToDocxConverter(PDFConverter):
|
|
93
196
|
end_page = kwargs.get('end_page', None)
|
94
197
|
preserve = kwargs.get('preserve_formatting', False)
|
95
198
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
199
|
+
# 尝试使用pdf2docx库(如果安装)
|
200
|
+
try:
|
201
|
+
from pdf2docx import Converter
|
202
|
+
cv = Converter(str(self.pdf_path))
|
203
|
+
cv.convert(str(output_path), start=start_page, end=end_page)
|
204
|
+
cv.close()
|
205
|
+
return str(output_path)
|
206
|
+
except ImportError:
|
207
|
+
self.logger.warning("pdf2docx未安装,使用基本文本转换")
|
208
|
+
# 回退到基本实现
|
209
|
+
doc = Document()
|
210
|
+
text = extract_text(
|
211
|
+
str(self.pdf_path),
|
212
|
+
page_numbers=range(start_page-1, end_page) if end_page else None
|
213
|
+
)
|
214
|
+
|
215
|
+
for paragraph in text.split('\n\n'):
|
216
|
+
if paragraph.strip():
|
217
|
+
para = doc.add_paragraph()
|
218
|
+
if preserve:
|
219
|
+
runs = paragraph.split('\n')
|
220
|
+
for run in runs:
|
221
|
+
if run.strip():
|
222
|
+
para.add_run(run.strip() + ' ')
|
223
|
+
else:
|
224
|
+
para.add_run(paragraph.strip())
|
225
|
+
|
226
|
+
doc.save(output_path)
|
227
|
+
return str(output_path)
|
117
228
|
except Exception as e:
|
118
229
|
raise ConversionError(f"转换为DOCX失败: {str(e)}")
|
119
230
|
|
@@ -121,9 +232,9 @@ class PDFToDocxConverter(PDFConverter):
|
|
121
232
|
def supported_formats(cls) -> List[str]:
|
122
233
|
return ['docx']
|
123
234
|
|
124
|
-
|
125
235
|
class PDFToImageConverter(PDFConverter):
|
126
|
-
|
236
|
+
@retry(max_attempts=3, delay=0.5)
|
237
|
+
def convert(self, output_path: Union[str, Path], **kwargs) -> Union[str, List[str]]:
|
127
238
|
"""
|
128
239
|
将PDF转换为图像
|
129
240
|
|
@@ -134,6 +245,9 @@ class PDFToImageConverter(PDFConverter):
|
|
134
245
|
- fmt: 图像格式(png/jpg/tiff)
|
135
246
|
- merge: 是否合并所有页为一张长图(True/False)
|
136
247
|
- quality: 图像质量(1-100)
|
248
|
+
|
249
|
+
返回:
|
250
|
+
单个文件路径或多个文件路径列表
|
137
251
|
"""
|
138
252
|
try:
|
139
253
|
dpi = kwargs.get('dpi', 200)
|
@@ -150,6 +264,8 @@ class PDFToImageConverter(PDFConverter):
|
|
150
264
|
fmt=fmt if fmt != 'jpg' else 'jpeg'
|
151
265
|
)
|
152
266
|
|
267
|
+
self._update_progress(0, len(images))
|
268
|
+
|
153
269
|
if merge:
|
154
270
|
# 合并所有页为一张长图
|
155
271
|
output_path = self._prepare_output_path(output_path, fmt)
|
@@ -158,9 +274,10 @@ class PDFToImageConverter(PDFConverter):
|
|
158
274
|
|
159
275
|
merged_image = Image.new('RGB', (max_width, total_height))
|
160
276
|
y_offset = 0
|
161
|
-
for img in images:
|
277
|
+
for i, img in enumerate(images):
|
162
278
|
merged_image.paste(img, (0, y_offset))
|
163
279
|
y_offset += img.height
|
280
|
+
self._update_progress(i+1, len(images))
|
164
281
|
|
165
282
|
merged_image.save(output_path, quality=quality)
|
166
283
|
return str(output_path)
|
@@ -170,25 +287,43 @@ class PDFToImageConverter(PDFConverter):
|
|
170
287
|
if len(images) == 1:
|
171
288
|
output_path = self._prepare_output_path(output_path, fmt)
|
172
289
|
images[0].save(output_path, quality=quality)
|
290
|
+
self._update_progress(1, 1)
|
173
291
|
return str(output_path)
|
174
292
|
else:
|
175
293
|
output_path.mkdir(parents=True, exist_ok=True)
|
176
294
|
output_files = []
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
295
|
+
|
296
|
+
# 多页并行处理
|
297
|
+
if len(images) > 5:
|
298
|
+
with ThreadPoolExecutor() as executor:
|
299
|
+
results = list(executor.map(
|
300
|
+
lambda x: self._save_image(x[1], output_path/f"page_{x[0]+1}.{fmt}", quality),
|
301
|
+
enumerate(images)
|
302
|
+
))
|
303
|
+
output_files.extend(results)
|
304
|
+
else:
|
305
|
+
for i, image in enumerate(images):
|
306
|
+
page_path = output_path / f"page_{i+1}.{fmt}"
|
307
|
+
image.save(page_path, quality=quality)
|
308
|
+
output_files.append(str(page_path))
|
309
|
+
self._update_progress(i+1, len(images))
|
310
|
+
|
181
311
|
return output_files
|
182
312
|
except Exception as e:
|
183
313
|
raise ConversionError(f"转换为图像失败: {str(e)}")
|
184
314
|
|
315
|
+
def _save_image(self, image: Image.Image, path: Path, quality: int) -> str:
    """Save a single image and return where it was written.

    Args:
        image: Image to save.
        path: Destination file path.
        quality: Passed to ``image.save`` as the ``quality`` keyword.

    Returns:
        ``path`` converted to a string.
    """
    image.save(path, quality=quality)
    return str(path)
|
319
|
+
|
185
320
|
@classmethod
|
186
321
|
def supported_formats(cls) -> List[str]:
|
187
322
|
return ['png', 'jpg', 'jpeg', 'tiff']
|
188
323
|
|
189
|
-
|
190
324
|
class PDFToTextConverter(PDFConverter):
|
191
|
-
|
325
|
+
@retry(max_attempts=3, delay=0.5)
|
326
|
+
def convert(self, output_path: Union[str, Path], **kwargs) -> str:
|
192
327
|
"""
|
193
328
|
将PDF转换为纯文本
|
194
329
|
|
@@ -198,6 +333,9 @@ class PDFToTextConverter(PDFConverter):
|
|
198
333
|
- start_page: 开始页(从1开始)
|
199
334
|
- end_page: 结束页
|
200
335
|
- encoding: 文本编码(默认utf-8)
|
336
|
+
|
337
|
+
返回:
|
338
|
+
转换后的文件路径
|
201
339
|
"""
|
202
340
|
try:
|
203
341
|
output_path = self._prepare_output_path(output_path, 'txt')
|
@@ -222,9 +360,9 @@ class PDFToTextConverter(PDFConverter):
|
|
222
360
|
def supported_formats(cls) -> List[str]:
|
223
361
|
return ['txt']
|
224
362
|
|
225
|
-
|
226
363
|
class PDFToCSVConverter(PDFConverter):
|
227
|
-
|
364
|
+
@retry(max_attempts=3, delay=0.5)
|
365
|
+
def convert(self, output_path: Union[str, Path], **kwargs) -> Union[str, List[str]]:
|
228
366
|
"""
|
229
367
|
提取PDF中的表格为CSV
|
230
368
|
|
@@ -234,6 +372,9 @@ class PDFToCSVConverter(PDFConverter):
|
|
234
372
|
- pages: 要提取的页码('all'或数字或列表)
|
235
373
|
- multiple_tables: 如何处理多个表格(separate/merge)
|
236
374
|
- encoding: CSV文件编码(默认utf-8)
|
375
|
+
|
376
|
+
返回:
|
377
|
+
单个CSV文件路径或多个CSV文件路径列表
|
237
378
|
"""
|
238
379
|
try:
|
239
380
|
pages = kwargs.get('pages', 'all')
|
@@ -245,11 +386,14 @@ class PDFToCSVConverter(PDFConverter):
|
|
245
386
|
if not dfs:
|
246
387
|
raise ConversionError("未找到表格数据")
|
247
388
|
|
389
|
+
self._update_progress(0, len(dfs))
|
390
|
+
|
248
391
|
if multiple_tables == 'merge':
|
249
392
|
# 合并所有表格
|
250
393
|
output_path = self._prepare_output_path(output_path, 'csv')
|
251
394
|
merged_df = pd.concat(dfs, ignore_index=True)
|
252
395
|
merged_df.to_csv(output_path, index=False, encoding=encoding)
|
396
|
+
self._update_progress(1, 1)
|
253
397
|
return str(output_path)
|
254
398
|
else:
|
255
399
|
# 每个表格保存为单独CSV
|
@@ -257,6 +401,7 @@ class PDFToCSVConverter(PDFConverter):
|
|
257
401
|
if len(dfs) == 1:
|
258
402
|
output_path = self._prepare_output_path(output_path, 'csv')
|
259
403
|
dfs[0].to_csv(output_path, index=False, encoding=encoding)
|
404
|
+
self._update_progress(1, 1)
|
260
405
|
return str(output_path)
|
261
406
|
else:
|
262
407
|
output_path.mkdir(parents=True, exist_ok=True)
|
@@ -265,6 +410,7 @@ class PDFToCSVConverter(PDFConverter):
|
|
265
410
|
table_path = output_path / f"table_{i+1}.csv"
|
266
411
|
df.to_csv(table_path, index=False, encoding=encoding)
|
267
412
|
output_files.append(str(table_path))
|
413
|
+
self._update_progress(i+1, len(dfs))
|
268
414
|
return output_files
|
269
415
|
except Exception as e:
|
270
416
|
raise ConversionError(f"提取表格失败: {str(e)}")
|
@@ -273,9 +419,9 @@ class PDFToCSVConverter(PDFConverter):
|
|
273
419
|
def supported_formats(cls) -> List[str]:
|
274
420
|
return ['csv']
|
275
421
|
|
276
|
-
|
277
422
|
class PDFToHTMLConverter(PDFConverter):
|
278
|
-
|
423
|
+
@retry(max_attempts=3, delay=0.5)
|
424
|
+
def convert(self, output_path: Union[str, Path], **kwargs) -> str:
|
279
425
|
"""
|
280
426
|
将PDF转换为HTML
|
281
427
|
|
@@ -284,6 +430,9 @@ class PDFToHTMLConverter(PDFConverter):
|
|
284
430
|
**kwargs:
|
285
431
|
- css: 自定义CSS样式
|
286
432
|
- images: 是否嵌入图像(True/False)
|
433
|
+
|
434
|
+
返回:
|
435
|
+
转换后的HTML文件路径
|
287
436
|
"""
|
288
437
|
try:
|
289
438
|
output_path = self._prepare_output_path(output_path, 'html')
|
@@ -298,7 +447,7 @@ class PDFToHTMLConverter(PDFConverter):
|
|
298
447
|
<meta charset="UTF-8">
|
299
448
|
<title>{self.pdf_path.stem}</title>
|
300
449
|
<style>
|
301
|
-
{kwargs.get('css', 'body { font-family: Arial; margin: 20px; }')}
|
450
|
+
{kwargs.get('css', 'body {{ font-family: Arial; margin: 20px; }}')}
|
302
451
|
</style>
|
303
452
|
</head>
|
304
453
|
<body>
|
@@ -320,9 +469,21 @@ class PDFToHTMLConverter(PDFConverter):
|
|
320
469
|
def supported_formats(cls) -> List[str]:
|
321
470
|
return ['html']
|
322
471
|
|
323
|
-
|
472
|
+
# 工厂类
|
324
473
|
class PDFConverterFactory:
|
474
|
+
_format_map = {
|
475
|
+
'docx': (PDFToDocxConverter, 'Microsoft Word文档'),
|
476
|
+
'txt': (PDFToTextConverter, '纯文本文件'),
|
477
|
+
'png': (PDFToImageConverter, 'PNG图像'),
|
478
|
+
'jpg': (PDFToImageConverter, 'JPEG图像'),
|
479
|
+
'jpeg': (PDFToImageConverter, 'JPEG图像'),
|
480
|
+
'tiff': (PDFToImageConverter, 'TIFF图像'),
|
481
|
+
'csv': (PDFToCSVConverter, 'CSV表格数据'),
|
482
|
+
'html': (PDFToHTMLConverter, 'HTML网页'),
|
483
|
+
}
|
484
|
+
|
325
485
|
@staticmethod
|
486
|
+
@lru_cache(maxsize=32)
|
326
487
|
def get_converter(target_format: str, pdf_path: str) -> PDFConverter:
|
327
488
|
"""
|
328
489
|
获取指定格式的转换器
|
@@ -333,64 +494,81 @@ class PDFConverterFactory:
|
|
333
494
|
|
334
495
|
返回:
|
335
496
|
PDFConverter实例
|
497
|
+
|
498
|
+
抛出:
|
499
|
+
UnsupportedFormatError: 当格式不支持时
|
336
500
|
"""
|
337
|
-
format_map = {
|
338
|
-
'docx': PDFToDocxConverter,
|
339
|
-
'txt': PDFToTextConverter,
|
340
|
-
'png': PDFToImageConverter,
|
341
|
-
'jpg': PDFToImageConverter,
|
342
|
-
'jpeg': PDFToImageConverter,
|
343
|
-
'tiff': PDFToImageConverter,
|
344
|
-
'csv': PDFToCSVConverter,
|
345
|
-
'html': PDFToHTMLConverter,
|
346
|
-
}
|
347
|
-
|
348
501
|
target_format = target_format.lower()
|
349
|
-
if target_format not in
|
350
|
-
raise
|
502
|
+
if target_format not in PDFConverterFactory._format_map:
|
503
|
+
raise UnsupportedFormatError(f"不支持的格式: {target_format}")
|
351
504
|
|
352
|
-
return
|
505
|
+
return PDFConverterFactory._format_map[target_format][0](pdf_path)
|
353
506
|
|
354
507
|
@staticmethod
|
355
|
-
def get_supported_formats() ->
|
356
|
-
"""
|
357
|
-
return {
|
358
|
-
'docx': 'Microsoft Word文档',
|
359
|
-
'txt': '纯文本文件',
|
360
|
-
'png': 'PNG图像',
|
361
|
-
'jpg': 'JPEG图像',
|
362
|
-
'jpeg': 'JPEG图像',
|
363
|
-
'tiff': 'TIFF图像',
|
364
|
-
'csv': 'CSV表格数据',
|
365
|
-
'html': 'HTML网页',
|
366
|
-
}
|
508
|
+
def get_supported_formats() -> Dict[str, str]:
    """Return every supported target format mapped to its description."""
    descriptions = {}
    for fmt, entry in PDFConverterFactory._format_map.items():
        # Each entry is a (converter class, description) pair.
        _converter_cls, description = entry
        descriptions[fmt] = description
    return descriptions
|
367
511
|
|
512
|
+
# 命令行接口
|
513
|
+
def parse_args(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which matches the previous behaviour.

    Returns:
        An ``argparse.Namespace`` with ``input``, ``output``, ``format``,
        ``dpi``, ``start_page``, ``end_page``, ``preserve_formatting``,
        ``merge`` and ``quality``.

    Invalid values (non-positive DPI or page numbers, quality outside
    1-100, end page before start page) make argparse print an error and
    exit, instead of being passed on to the converters.
    """
    def bounded_int(minimum, maximum=None):
        """Return an argparse ``type`` callable for ints in [minimum, maximum]."""
        def convert(text):
            try:
                value = int(text)
            except ValueError:
                raise argparse.ArgumentTypeError(f"无效的整数: {text!r}") from None
            if value < minimum or (maximum is not None and value > maximum):
                upper = '' if maximum is None else f" 且不大于 {maximum}"
                raise argparse.ArgumentTypeError(
                    f"取值必须不小于 {minimum}{upper}: {value}"
                )
            return value
        return convert

    parser = argparse.ArgumentParser(
        description='PDF转换工具 - 支持多种格式转换',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('input', help='输入PDF文件路径')
    parser.add_argument('output', help='输出文件路径或目录')
    parser.add_argument('-f', '--format', required=True,
                        choices=list(PDFConverterFactory.get_supported_formats()),
                        help='目标格式')
    parser.add_argument('--dpi', type=bounded_int(1), default=200,
                        help='图像DPI(仅图像转换)')
    parser.add_argument('--start-page', type=bounded_int(1), default=1,
                        help='起始页码(从1开始)')
    parser.add_argument('--end-page', type=bounded_int(1),
                        help='结束页码(默认为最后一页)')
    parser.add_argument('--preserve-formatting', action='store_true',
                        help='保留格式(仅DOCX转换)')
    parser.add_argument('--merge', action='store_true',
                        help='合并多页为单个文件(图像/表格)')
    parser.add_argument('--quality', type=bounded_int(1, 100), default=90,
                        help='输出质量(1-100, 仅图像)')

    args = parser.parse_args(argv)
    # A page range that ends before it starts would silently produce an
    # empty conversion; reject it here.
    if args.end_page is not None and args.end_page < args.start_page:
        parser.error('结束页码不能小于起始页码')
    return args
|
368
536
|
|
369
|
-
|
370
|
-
|
537
|
+
def main():
|
538
|
+
args = parse_args()
|
539
|
+
|
371
540
|
try:
|
372
|
-
#
|
373
|
-
|
374
|
-
docx_converter = PDFConverterFactory.get_converter('docx', 'example.pdf')
|
375
|
-
result = docx_converter.convert('output.docx', preserve_formatting=True)
|
376
|
-
print(f"转换成功: {result}")
|
541
|
+
# 获取转换器
|
542
|
+
converter = PDFConverterFactory.get_converter(args.format, args.input)
|
377
543
|
|
378
|
-
#
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
print(f"转换成功: {result if isinstance(result, str) else len(result)}个文件")
|
544
|
+
# 设置进度回调
|
545
|
+
def progress_callback(current, total):
    """Print an in-place progress line ``current/total (percent)``.

    Converters report ``(0, len(items))`` before starting, so ``total``
    can be 0 when nothing was found; that case is shown as 0.0% instead
    of raising ZeroDivisionError.
    """
    ratio = current / total if total else 0.0
    print(f"\r进度: {current}/{total} ({ratio:.1%})", end='', flush=True)
|
547
|
+
converter.set_progress_callback(progress_callback)
|
383
548
|
|
384
|
-
#
|
385
|
-
print("
|
386
|
-
|
387
|
-
|
388
|
-
|
549
|
+
# 执行转换
|
550
|
+
print(f"开始转换: {args.input} -> {args.output} ({args.format})")
|
551
|
+
result = converter.convert(
|
552
|
+
args.output,
|
553
|
+
dpi=args.dpi,
|
554
|
+
start_page=args.start_page,
|
555
|
+
end_page=args.end_page,
|
556
|
+
preserve_formatting=args.preserve_formatting,
|
557
|
+
merge=args.merge,
|
558
|
+
quality=args.quality
|
559
|
+
)
|
389
560
|
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
561
|
+
print("\n转换成功!")
|
562
|
+
if isinstance(result, list):
|
563
|
+
print(f"生成文件 {len(result)} 个:")
|
564
|
+
for file in result:
|
565
|
+
print(f" - {file}")
|
566
|
+
else:
|
567
|
+
print(f"输出文件: {result}")
|
394
568
|
|
395
569
|
except Exception as e:
|
396
|
-
print(f"
|
570
|
+
print(f"\n转换失败: {e}", file=sys.stderr)
|
571
|
+
sys.exit(1)
|
572
|
+
|
573
|
+
if __name__ == "__main__":
|
574
|
+
main()
|