tretool 0.2.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,544 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+ import time
5
+ from abc import ABC, abstractmethod
6
+ from pathlib import Path
7
+ from typing import Union, List, Optional, Dict, Callable
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from functools import wraps, lru_cache
10
+ import argparse
11
+
12
+ # 第三方库导入
13
+ try:
14
+ from docx import Document
15
+ from docx.shared import Pt, RGBColor
16
+ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
17
+ from PIL import Image
18
+ import pandas as pd
19
+ except ImportError as e:
20
+ print(f"缺少依赖库: {e}")
21
+ sys.exit(1)
22
+
23
+ # 日志配置
24
# Root logger setup: records at INFO and above are sent both to the console
# stream and to a "docx_processor.log" file (relative path).
logging.basicConfig(
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('docx_processor.log'),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
32
+
33
+ # 自定义异常
34
class DocxProcessingError(Exception):
    """Base exception for DOCX processing failures."""
37
+
38
class DocxPermissionError(DocxProcessingError):
    """DOCX permission error."""
41
+
42
class DocxCorruptedError(DocxProcessingError):
    """The DOCX file is corrupted."""
45
+
46
class UnsupportedFormatError(DocxProcessingError):
    """The requested format is not supported."""
49
+
50
class DocxProcessor(ABC):
    """
    Abstract base class for DOCX document processors.

    Provides the plumbing shared by every processor:
    - input file validation
    - output path preparation
    - progress callback and metadata helpers

    Subclasses must implement:
    - process(): perform the actual processing
    - supported_operations(): class method returning the supported operations
    """

    def __init__(self, docx_path: Union[str, Path]):
        """
        Validate the input file and initialise the processor.

        Args:
            docx_path: Path of the DOCX file to process.

        Raises:
            FileNotFoundError: If the path is not an existing regular file.
            ValueError: If the file extension is not ``.docx``.
        """
        self.docx_path = Path(docx_path)
        self.logger = logging.getLogger(self.__class__.__name__)
        self._progress_callback = None

        # is_file() is False for missing paths and for directories alike, so
        # a directory that merely carries a ".docx" name is rejected as well.
        if not self.docx_path.is_file():
            raise FileNotFoundError(f"DOCX文件不存在: {docx_path}")
        if self.docx_path.suffix.lower() != '.docx':
            raise ValueError("输入文件必须是DOCX格式")

        self.logger.info("初始化处理器,处理文件: %s", docx_path)

    @abstractmethod
    def process(self, output_path: Union[str, Path], **kwargs) -> Union[str, List[str]]:
        """
        Process the DOCX document.

        Args:
            output_path: Output file path.
            **kwargs: Processor-specific options.

        Returns:
            The path of the produced file, or a list of paths.
        """
        pass

    @classmethod
    @abstractmethod
    def supported_operations(cls) -> List[str]:
        """Return the list of operation names this processor supports."""
        return []

    def set_progress_callback(self, callback: Callable[[int, int], None]):
        """Register a callable that receives ``(current, total)`` updates."""
        self._progress_callback = callback

    def _update_progress(self, current: int, total: int):
        """Forward a progress update to the registered callback, if any."""
        if self._progress_callback:
            self._progress_callback(current, total)

    def get_metadata(self) -> Dict[str, Union[str, int, None]]:
        """
        Read basic metadata from the DOCX file.

        Returns:
            A dict with the core properties ``title`` and ``author``, the
            ``created``/``modified`` values converted with ``str()``, and the
            ``paragraphs``/``tables`` counts. If the document cannot be read,
            a warning is logged and an empty dict is returned.
        """
        try:
            doc = Document(self.docx_path)
            props = doc.core_properties
            return {
                'title': props.title,
                'author': props.author,
                'created': str(props.created),
                'modified': str(props.modified),
                'paragraphs': len(doc.paragraphs),
                'tables': len(doc.tables)
            }
        except Exception as e:
            # Best effort by design: metadata problems must not abort callers.
            self.logger.warning(f"获取元数据失败: {str(e)}")
            return {}

    def _prepare_output_path(self, output_path: Union[str, Path],
                             default_extension: str) -> Path:
        """
        Normalise the output path and make sure its parent directory exists.

        Args:
            output_path: Requested output path (file or existing directory).
            default_extension: Extension (without the dot) used when the path
                is a directory or has no suffix.

        Returns:
            The resolved output file path as a Path object.
        """
        output_path = Path(output_path)

        if output_path.is_dir():
            # Existing directory: derive the file name from the input file.
            output_path = output_path / f"{self.docx_path.stem}.{default_extension}"
        elif not output_path.suffix:
            # No extension given: append the default one.
            output_path = output_path.with_suffix(f".{default_extension}")

        # Create missing parent directories.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        return output_path
153
+
154
+ # 重试装饰器
155
def retry(max_attempts=3, delay=1, exceptions=(Exception,)):
    """
    Build a decorator that retries the wrapped callable when it raises.

    Args:
        max_attempts: Total number of calls to make; must be at least 1.
        delay: Seconds to sleep between two consecutive attempts.
        exceptions: Exception types that trigger a retry; any other exception
            propagates immediately.

    Returns:
        A decorator. The decorated function returns the first successful
        result, or re-raises the exception of the final attempt.

    Raises:
        ValueError: If max_attempts is smaller than 1.
    """
    # With zero attempts the loop would never run and there would be neither
    # a result to return nor an exception to re-raise, so reject it up front.
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return f(*args, **kwargs)
                except exceptions:
                    if attempt == max_attempts:
                        # Out of attempts: surface the last failure unchanged.
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator
170
+
171
+ # 具体处理器实现
172
class DocxToPDFConverter(DocxProcessor):
    """Convert a DOCX document to PDF with docx2pdf, falling back to unoconv."""

    @retry(max_attempts=3, delay=0.5)
    def process(self, output_path: Union[str, Path], **kwargs) -> str:
        """
        Convert the DOCX file to PDF.

        docx2pdf is tried first; when that raises ImportError the conversion
        falls back to running the ``unoconv`` command. The retry decorator
        repeats the whole conversion up to three times on failure.

        Args:
            output_path: Output file path or existing directory.
            **kwargs:
                - pages: page range to convert (e.g. '1-3,5')
                - quality: output quality (high/medium/low)
                NOTE: both options are accepted but are not read by this
                implementation.

        Returns:
            Path of the converted file.

        Raises:
            DocxProcessingError: If the conversion fails.
        """
        try:
            output_path = self._prepare_output_path(output_path, 'pdf')

            # Preferred backend: the docx2pdf library.
            try:
                from docx2pdf import convert
                convert(str(self.docx_path), str(output_path))
                return str(output_path)
            except ImportError:
                self.logger.warning("docx2pdf未安装,尝试使用unoconv")
                # Fallback backend: the unoconv command-line tool.
                try:
                    import subprocess
                    subprocess.run(
                        ['unoconv', '-f', 'pdf', '-o', str(output_path), str(self.docx_path)],
                        check=True,
                    )
                    return str(output_path)
                except Exception as e:
                    raise DocxProcessingError(
                        f"转换为PDF失败,请安装docx2pdf或unoconv: {str(e)}"
                    ) from e
        except DocxProcessingError:
            # Already carries a complete message; wrapping it again would only
            # duplicate the "conversion failed" prefix.
            raise
        except Exception as e:
            raise DocxProcessingError(f"转换为PDF失败: {str(e)}") from e

    @classmethod
    def supported_operations(cls) -> List[str]:
        """Return the operation names handled by this converter."""
        return ['pdf']
210
+
211
class DocxToTextConverter(DocxProcessor):
    """Export the text of a DOCX document to a plain-text file."""

    @retry(max_attempts=3, delay=0.5)
    def process(self, output_path: Union[str, Path], **kwargs) -> str:
        """
        Convert the DOCX file to plain text.

        Paragraph texts are written one per line; when requested, the text of
        every table cell is appended after the paragraphs.

        Args:
            output_path: Output file path.
            **kwargs:
                - include_tables: whether to include table content (True/False)
                - encoding: text encoding (default utf-8)

        Returns:
            Path of the converted file.
        """
        try:
            target = self._prepare_output_path(output_path, 'txt')
            with_tables = kwargs.get('include_tables', False)
            charset = kwargs.get('encoding', 'utf-8')

            document = Document(self.docx_path)
            lines = [paragraph.text for paragraph in document.paragraphs]

            if with_tables:
                lines.extend(
                    cell.text
                    for table in document.tables
                    for row in table.rows
                    for cell in row.cells
                )

            with open(target, 'w', encoding=charset) as handle:
                handle.write('\n'.join(lines))

            return str(target)
        except Exception as exc:
            raise DocxProcessingError(f"转换为文本失败: {str(exc)}")

    @classmethod
    def supported_operations(cls) -> List[str]:
        """Return the operation names handled by this converter."""
        return ['txt']
253
+
254
class DocxToHTMLConverter(DocxProcessor):
    """Convert a DOCX document to HTML via pandoc, with a built-in fallback."""

    @retry(max_attempts=3, delay=0.5)
    def process(self, output_path: Union[str, Path], **kwargs) -> str:
        """
        Convert the DOCX file to HTML.

        pandoc is tried first. If running it fails for any reason, a minimal
        page with one ``<p>`` element per paragraph is generated instead.

        Args:
            output_path: Output file path.
            **kwargs:
                - css: custom CSS. It is handed to pandoc through ``--css``
                  and embedded in a ``<style>`` element by the fallback, so
                  the two backends use the value differently.
                - include_images: whether to include images (True/False).
                  NOTE: this option is not read by this implementation.

        Returns:
            Path of the generated HTML file.

        Raises:
            DocxProcessingError: If the conversion fails.
        """
        try:
            output_path = self._prepare_output_path(output_path, 'html')
            css = kwargs.get('css')

            # Preferred backend: pandoc.
            try:
                import subprocess
                command = ['pandoc', '-s', str(self.docx_path), '-o', str(output_path)]
                if css:
                    # Only pass --css when a stylesheet was actually given;
                    # an empty value would produce a stylesheet link to "".
                    command.extend(['--css', css])
                subprocess.run(command, check=True)
                return str(output_path)
            except Exception:
                self.logger.warning("pandoc不可用,使用基本转换")
                # Basic built-in conversion.
                doc = Document(self.docx_path)
                page = self._render_basic_html(
                    self.docx_path.stem,
                    [para.text for para in doc.paragraphs],
                    css if css is not None else "body { font-family: Arial; }",
                )

                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(page)

                return str(output_path)
        except Exception as e:
            raise DocxProcessingError(f"转换为HTML失败: {str(e)}") from e

    @staticmethod
    def _render_basic_html(title: str, paragraphs: List[str], css: str) -> str:
        """Build a minimal HTML page with one <p> element per paragraph."""
        from html import escape

        parts = [
            '<!DOCTYPE html>',
            '<html>',
            '<head>',
            '<meta charset="UTF-8">',
            # Escape document-derived text so "<", ">" and "&" cannot break
            # the markup.
            f'<title>{escape(title, quote=False)}</title>',
            f'<style>{css}</style>',
            '</head>',
            '<body>'
        ]
        parts.extend(f'<p>{escape(text, quote=False)}</p>' for text in paragraphs)
        parts.extend(['</body>', '</html>'])
        return '\n'.join(parts)

    @classmethod
    def supported_operations(cls) -> List[str]:
        """Return the operation names handled by this converter."""
        return ['html']
311
+
312
class DocxToCSVConverter(DocxProcessor):
    """Extract the tables of a DOCX document into CSV files."""

    @retry(max_attempts=3, delay=0.5)
    def process(self, output_path: Union[str, Path], **kwargs) -> Union[str, List[str]]:
        """
        Extract tables from the DOCX file as CSV.

        When exactly one table is selected, a single CSV file is written.
        Otherwise ``output_path`` is treated as a directory and one
        ``table_<index>.csv`` file is written per selected table.

        Args:
            output_path: Output file path or directory.
            **kwargs:
                - table_indexes: list of table indexes to extract (e.g. [0, 2]);
                  all tables are extracted when omitted
                - encoding: CSV file encoding (default utf-8)

        Returns:
            A single CSV file path, or a list of CSV file paths.

        Raises:
            DocxProcessingError: If the document has no tables, the single
                requested index is out of range, or extraction fails.
        """
        try:
            table_indexes = kwargs.get('table_indexes', None)
            encoding = kwargs.get('encoding', 'utf-8')

            doc = Document(self.docx_path)
            tables = doc.tables

            if not tables:
                raise DocxProcessingError("未找到表格数据")

            table_count = len(tables)
            if table_indexes is None:
                table_indexes = range(table_count)

            output_path = Path(output_path)
            if len(table_indexes) == 1:
                index = table_indexes[0]
                # Report a bad index explicitly instead of surfacing a bare
                # "index out of range" from the list lookup.
                if not -table_count <= index < table_count:
                    raise DocxProcessingError(f"表格索引超出范围: {index}")
                output_path = self._prepare_output_path(output_path, 'csv')
                df = self._table_to_dataframe(tables[index])
                # NOTE(review): to_csv writes the DataFrame's default integer
                # column labels as a header row; confirm that is intended.
                df.to_csv(output_path, index=False, encoding=encoding)
                return str(output_path)

            output_path.mkdir(parents=True, exist_ok=True)
            output_files = []
            for i in table_indexes:
                if not -table_count <= i < table_count:
                    # Out-of-range indexes are skipped rather than failing the
                    # whole extraction.
                    self.logger.warning("表格索引超出范围,已跳过: %s", i)
                    continue
                table_path = output_path / f"table_{i}.csv"
                df = self._table_to_dataframe(tables[i])
                df.to_csv(table_path, index=False, encoding=encoding)
                output_files.append(str(table_path))
            return output_files
        except Exception as e:
            raise DocxProcessingError(f"提取表格失败: {str(e)}") from e

    def _table_to_dataframe(self, table) -> pd.DataFrame:
        """Convert a Word table to a DataFrame holding the text of each cell."""
        return pd.DataFrame([[cell.text for cell in row.cells] for row in table.rows])

    @classmethod
    def supported_operations(cls) -> List[str]:
        """Return the operation names handled by this converter."""
        return ['csv']
372
+
373
class DocxEditor(DocxProcessor):
    """DOCX document editor: text replacement and header watermark text."""

    @retry(max_attempts=3, delay=0.5)
    def process(self, output_path: Union[str, Path], **kwargs) -> str:
        """
        Edit the DOCX document and save the result.

        Args:
            output_path: Output file path.
            **kwargs:
                - replace: replacement dict {old text: new text}, applied to
                  the document's body paragraphs
                - add_watermark: watermark text, added as a centred run to
                  the first header paragraph of every section
                - font_size: watermark font size in points (default 12)
                - font_color: watermark colour as hex, e.g. #FF0000 or #F00
                  (default #C0C0C0)

        Returns:
            Path of the edited file.

        Raises:
            DocxProcessingError: If editing or saving fails, including an
                invalid font_color value.
        """
        try:
            output_path = self._prepare_output_path(output_path, 'docx')

            doc = Document(self.docx_path)

            # Text replacement. An explicit None is treated like "no
            # replacements" instead of failing on None.items().
            # NOTE(review): assigning paragraph.text is documented by
            # python-docx to replace the paragraph content with a single run,
            # so run-level formatting of edited paragraphs is presumably lost.
            replacements = kwargs.get('replace') or {}
            for old_text, new_text in replacements.items():
                for para in doc.paragraphs:
                    if old_text in para.text:
                        para.text = para.text.replace(old_text, new_text)

            # Watermark text in the section headers.
            watermark = kwargs.get('add_watermark')
            if watermark is not None:
                font_size = kwargs.get('font_size', 12)
                # Parse the colour once, before any header is touched, so an
                # invalid value fails early.
                rgb = self._hex_to_rgb(kwargs.get('font_color', '#C0C0C0'))

                for section in doc.sections:
                    header = section.header
                    paragraph = header.paragraphs[0] if header.paragraphs else header.add_paragraph()
                    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

                    run = paragraph.add_run(watermark)
                    run.font.size = Pt(font_size)
                    run.font.color.rgb = RGBColor(*rgb)

            doc.save(output_path)
            return str(output_path)
        except Exception as e:
            raise DocxProcessingError(f"编辑文档失败: {str(e)}") from e

    def _hex_to_rgb(self, hex_color: str) -> tuple:
        """
        Convert a hexadecimal colour string to an ``(r, g, b)`` tuple.

        Accepts six-digit (``#RRGGBB``) and three-digit shorthand (``#RGB``)
        values, with or without the leading ``#``.

        Raises:
            ValueError: If the value is not a valid hexadecimal colour.
        """
        digits = hex_color.lstrip('#')
        if len(digits) == 3:
            # Expand shorthand: "abc" -> "aabbcc".
            digits = ''.join(ch * 2 for ch in digits)
        valid_chars = '0123456789abcdefABCDEF'
        if len(digits) != 6 or any(ch not in valid_chars for ch in digits):
            raise ValueError(f"无效的十六进制颜色: {hex_color}")
        return tuple(int(digits[i:i+2], 16) for i in (0, 2, 4))

    @classmethod
    def supported_operations(cls) -> List[str]:
        """Return the operation names handled by this editor."""
        return ['edit']
431
+
432
+ # 工厂类
433
class DocxProcessorFactory:
    """Factory mapping operation names to DocxProcessor implementations."""

    # operation name -> (processor class, human-readable description)
    _operation_map = {
        'pdf': (DocxToPDFConverter, 'PDF文档'),
        'txt': (DocxToTextConverter, '纯文本文件'),
        'html': (DocxToHTMLConverter, 'HTML网页'),
        'csv': (DocxToCSVConverter, 'CSV表格数据'),
        'edit': (DocxEditor, '文档编辑')
    }

    @staticmethod
    def get_processor(operation: str, docx_path: Union[str, Path]) -> DocxProcessor:
        """
        Create the processor for the given operation.

        A new instance is built on every call. Processors validate the input
        file in their constructor and carry per-use state (the progress
        callback), so instances are deliberately not cached or shared.

        Args:
            operation: Operation type (case-insensitive).
            docx_path: Path of the DOCX file.

        Returns:
            A DocxProcessor instance.

        Raises:
            UnsupportedFormatError: If the operation is not supported.
        """
        operation = operation.lower()
        try:
            processor_cls, _description = DocxProcessorFactory._operation_map[operation]
        except KeyError:
            raise UnsupportedFormatError(f"不支持的操作: {operation}") from None

        return processor_cls(docx_path)

    @staticmethod
    def get_supported_operations() -> Dict[str, str]:
        """Return every supported operation mapped to its description."""
        return {op: desc for op, (_, desc) in DocxProcessorFactory._operation_map.items()}
468
+
469
+ # 命令行接口
470
def parse_args():
    """Define the command-line interface and parse ``sys.argv``.

    Returns:
        The argparse namespace holding the positional input/output paths, the
        required operation and the operation-specific options.
    """
    operations = tuple(DocxProcessorFactory.get_supported_operations())

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='DOCX文档处理工具 - 支持多种格式转换和编辑',
    )

    # Positional arguments and the mandatory operation selector.
    cli.add_argument('input', help='输入DOCX文件路径')
    cli.add_argument('output', help='输出文件路径或目录')
    cli.add_argument(
        '-o', '--operation',
        required=True,
        choices=operations,
        help='操作类型',
    )

    # PDF conversion options.
    cli.add_argument('--pages', help='页码范围(仅PDF转换)')
    cli.add_argument(
        '--quality',
        choices=['high', 'medium', 'low'],
        default='medium',
        help='输出质量(仅PDF转换)',
    )

    # Text / CSV extraction options.
    cli.add_argument(
        '--include-tables',
        action='store_true',
        help='包含表格内容(仅文本转换)',
    )
    cli.add_argument('--table-indexes', help='要提取的表格索引(如0,2,3)')

    # Edit options.
    cli.add_argument(
        '--replace',
        nargs=2,
        action='append',
        metavar=('OLD', 'NEW'),
        help='文本替换(编辑操作)',
    )
    cli.add_argument('--watermark', help='水印文本(编辑操作)')
    cli.add_argument(
        '--font-size',
        type=int,
        default=12,
        help='水印字体大小(编辑操作)',
    )
    cli.add_argument(
        '--font-color',
        default='#C0C0C0',
        help='水印字体颜色(十六进制,编辑操作)',
    )

    return cli.parse_args()
494
+
495
def main():
    """Command-line entry point.

    Parses the arguments, builds the processor for the requested operation,
    runs it and prints the resulting file path(s). Any failure is reported on
    stderr and the process exits with status 1.
    """
    args = parse_args()

    try:
        # Obtain the processor for the requested operation.
        processor = DocxProcessorFactory.get_processor(args.operation, args.input)

        # Collect the options relevant to the selected operation.
        kwargs = {}
        if args.operation == 'pdf':
            kwargs.update({
                'pages': args.pages,
                'quality': args.quality
            })
        elif args.operation == 'txt':
            kwargs['include_tables'] = args.include_tables
        elif args.operation == 'csv':
            if args.table_indexes:
                kwargs['table_indexes'] = [int(i) for i in args.table_indexes.split(',')]
        elif args.operation == 'edit':
            if args.replace:
                kwargs['replace'] = dict(args.replace)
            if args.watermark:
                kwargs['add_watermark'] = args.watermark
                kwargs['font_size'] = args.font_size
                kwargs['font_color'] = args.font_color

        # Progress reporting on a single, continuously rewritten line.
        def progress_callback(current, total):
            # A total of 0 must not crash the report with a division by zero.
            ratio = current / total if total else 0.0
            print(f"\r进度: {current}/{total} ({ratio:.1%})", end='', flush=True)
        processor.set_progress_callback(progress_callback)

        # Run the processing.
        print(f"开始处理: {args.input} -> {args.output} ({args.operation})")
        result = processor.process(args.output, **kwargs)

        print("\n处理成功!")
        if isinstance(result, list):
            print(f"生成文件 {len(result)} 个:")
            for file in result:
                print(f"  - {file}")
        else:
            print(f"输出文件: {result}")

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"\n处理失败: {e}", file=sys.stderr)
        sys.exit(1)
542
+
543
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()