pysfi 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pysfi-0.1.10.dist-info → pysfi-0.1.11.dist-info}/METADATA +7 -7
- pysfi-0.1.11.dist-info/RECORD +60 -0
- {pysfi-0.1.10.dist-info → pysfi-0.1.11.dist-info}/entry_points.txt +12 -2
- sfi/__init__.py +1 -1
- sfi/alarmclock/alarmclock.py +40 -40
- sfi/bumpversion/__init__.py +1 -1
- sfi/cleanbuild/cleanbuild.py +155 -0
- sfi/condasetup/condasetup.py +116 -0
- sfi/docscan/__init__.py +1 -1
- sfi/docscan/docscan_gui.py +1 -1
- sfi/docscan/lang/eng.py +152 -152
- sfi/docscan/lang/zhcn.py +170 -170
- sfi/filedate/filedate.py +185 -112
- sfi/gittool/__init__.py +2 -0
- sfi/gittool/gittool.py +401 -0
- sfi/llmclient/llmclient.py +592 -0
- sfi/llmquantize/llmquantize.py +480 -0
- sfi/llmserver/llmserver.py +335 -0
- sfi/makepython/makepython.py +2 -2
- sfi/pdfsplit/pdfsplit.py +4 -4
- sfi/pyarchive/pyarchive.py +418 -0
- sfi/pyembedinstall/pyembedinstall.py +629 -0
- sfi/pylibpack/pylibpack.py +813 -269
- sfi/pylibpack/rules/numpy.json +22 -0
- sfi/pylibpack/rules/pymupdf.json +10 -0
- sfi/pylibpack/rules/pyqt5.json +19 -0
- sfi/pylibpack/rules/pyside2.json +23 -0
- sfi/pylibpack/rules/scipy.json +23 -0
- sfi/pylibpack/rules/shiboken2.json +24 -0
- sfi/pyloadergen/pyloadergen.py +271 -572
- sfi/pypack/pypack.py +822 -471
- sfi/pyprojectparse/__init__.py +0 -0
- sfi/pyprojectparse/pyprojectparse.py +500 -0
- sfi/pysourcepack/pysourcepack.py +308 -369
- sfi/quizbase/__init__.py +0 -0
- sfi/quizbase/quizbase.py +828 -0
- sfi/quizbase/quizbase_gui.py +987 -0
- sfi/regexvalidate/__init__.py +0 -0
- sfi/regexvalidate/regex_help.html +284 -0
- sfi/regexvalidate/regexvalidate.py +468 -0
- sfi/taskkill/taskkill.py +0 -2
- pysfi-0.1.10.dist-info/RECORD +0 -39
- sfi/embedinstall/embedinstall.py +0 -478
- sfi/projectparse/projectparse.py +0 -152
- {pysfi-0.1.10.dist-info → pysfi-0.1.11.dist-info}/WHEEL +0 -0
- /sfi/{embedinstall → llmquantize}/__init__.py +0 -0
- /sfi/{projectparse → pyembedinstall}/__init__.py +0 -0
sfi/docscan/lang/zhcn.py
CHANGED
|
@@ -1,170 +1,170 @@
|
|
|
1
|
-
"""Chinese translations for docscan_gui."""
|
|
2
|
-
|
|
3
|
-
TRANSLATIONS = {
|
|
4
|
-
# Window and Tab Titles
|
|
5
|
-
"window_title": "文档扫描器",
|
|
6
|
-
"input_config_tab": "输入配置",
|
|
7
|
-
"scan_options_tab": "扫描选项",
|
|
8
|
-
# Input Section
|
|
9
|
-
"input_directory": "输入目录:",
|
|
10
|
-
"browse": "浏览...",
|
|
11
|
-
"rules_file": "规则文件:",
|
|
12
|
-
"file_types": "文件类型:",
|
|
13
|
-
# Options Section
|
|
14
|
-
"use_pdf_ocr": "使用 PDF OCR",
|
|
15
|
-
"use_process_pool": "使用进程池 (CPU 密集型)",
|
|
16
|
-
"threads": "线程数:",
|
|
17
|
-
"batch_size": "批处理大小:",
|
|
18
|
-
# Action Buttons
|
|
19
|
-
"start_scan": "开始扫描",
|
|
20
|
-
"pause": "暂停",
|
|
21
|
-
"resume": "恢复",
|
|
22
|
-
"stop": "停止",
|
|
23
|
-
"save_results": "保存结果",
|
|
24
|
-
"clear_results": "清除结果",
|
|
25
|
-
# Results Section
|
|
26
|
-
"results": "结果",
|
|
27
|
-
"files_scanned": "已扫描文件:",
|
|
28
|
-
"files_with_matches": "包含匹配项的文件:",
|
|
29
|
-
"progress_log": "进度日志:",
|
|
30
|
-
"match_details": "匹配详情:",
|
|
31
|
-
"selected_match_context": "选中匹配项的上下文:",
|
|
32
|
-
# Table Headers
|
|
33
|
-
"file": "文件",
|
|
34
|
-
"type": "类型",
|
|
35
|
-
"matches": "匹配数",
|
|
36
|
-
"time": "时间 (s)",
|
|
37
|
-
# Dialogs and Messages
|
|
38
|
-
"select_input_directory": "选择输入目录",
|
|
39
|
-
"select_rules_file": "选择规则文件",
|
|
40
|
-
"json_files": "JSON 文件 (*.json)",
|
|
41
|
-
"save_results_dialog": "保存结果",
|
|
42
|
-
"default_results_filename": "扫描结果_{datetime}.json",
|
|
43
|
-
# Error Messages
|
|
44
|
-
"error": "错误",
|
|
45
|
-
"warning": "警告",
|
|
46
|
-
"success": "成功",
|
|
47
|
-
"invalid_input_directory": "无效的输入目录",
|
|
48
|
-
"no_valid_rules": "没有找到有效的规则",
|
|
49
|
-
"failed_to_load_rules": "加载规则失败: {error}",
|
|
50
|
-
"no_results_to_save": "没有可保存的结果",
|
|
51
|
-
"failed_to_save_results": "保存结果失败: {error}",
|
|
52
|
-
"scan_failed": "扫描失败: {error}",
|
|
53
|
-
# Progress Messages
|
|
54
|
-
"starting_scan": "开始扫描...",
|
|
55
|
-
"scan_complete": "扫描完成!",
|
|
56
|
-
"pausing_scan": "暂停扫描...",
|
|
57
|
-
"stopping_scan": "停止扫描...",
|
|
58
|
-
"scan_completed": "扫描完成",
|
|
59
|
-
"scan_stopped": "扫描已停止",
|
|
60
|
-
"scan_resumed": "扫描已恢复",
|
|
61
|
-
"found_matches_files": "在 {count} 个文件中找到匹配项",
|
|
62
|
-
# File Info
|
|
63
|
-
"file_info": "文件",
|
|
64
|
-
"type_info": "类型",
|
|
65
|
-
"size": "大小",
|
|
66
|
-
"bytes": "字节",
|
|
67
|
-
# Match Info
|
|
68
|
-
"rule": "规则",
|
|
69
|
-
"description": "描述",
|
|
70
|
-
"line": "行 {line}: {match}",
|
|
71
|
-
"context": "上下文",
|
|
72
|
-
"default_file_types": "pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md,jpg,jpeg,png,gif,bmp,tiff",
|
|
73
|
-
"default_rules_file": "rules.json",
|
|
74
|
-
# Menu Items
|
|
75
|
-
"file_menu": "文件(&F)",
|
|
76
|
-
"settings_menu": "设置(&S)",
|
|
77
|
-
"help_menu": "帮助(&H)",
|
|
78
|
-
"preferences": "首选项(&P)...",
|
|
79
|
-
"exit": "退出(&X)",
|
|
80
|
-
"about": "关于(&A)",
|
|
81
|
-
"about_title": "关于文档扫描器",
|
|
82
|
-
"about_text": "文档扫描器 GUI\n\n版本 1.0",
|
|
83
|
-
"language": "语言",
|
|
84
|
-
# Open Results
|
|
85
|
-
"open_results": "打开结果...",
|
|
86
|
-
"open_results_file": "打开扫描结果",
|
|
87
|
-
"loaded_results_from": "已从以下位置加载结果: {path}",
|
|
88
|
-
"results_loaded_successfully": "结果已成功从以下位置加载:\n{path}",
|
|
89
|
-
"failed_to_load_results": "加载结果失败: {error}",
|
|
90
|
-
# Settings Dialog
|
|
91
|
-
"language_settings": "语言设置",
|
|
92
|
-
"processing_options": "处理选项",
|
|
93
|
-
"performance_settings": "性能设置",
|
|
94
|
-
"language_label": "语言:",
|
|
95
|
-
"ocr_tooltip": "为扫描的 PDF 文件启用 OCR 以从图像中提取文本",
|
|
96
|
-
"process_pool_tooltip": "对 CPU 密集型操作使用多进程(可能会增加内存使用)",
|
|
97
|
-
"threads_tooltip": "工作线程数量(较高的值可能会提高速度但会使用更多 CPU)",
|
|
98
|
-
"batch_size_tooltip": "每批处理的文件数量(较大的批次可能会提高吞吐量)",
|
|
99
|
-
"file_types_tooltip": "要扫描的文件类型(逗号分隔)",
|
|
100
|
-
# Results Saved Message
|
|
101
|
-
"results_saved_to": "结果已保存至:\t{path}",
|
|
102
|
-
# Default Values
|
|
103
|
-
"files_scanned_zero": "已扫描文件: 0",
|
|
104
|
-
"files_with_matches_zero": "包含匹配项的文件: 0",
|
|
105
|
-
# Apply Button
|
|
106
|
-
"apply": "应用",
|
|
107
|
-
# Command-line specific translations
|
|
108
|
-
"document_scanner_description": "扫描文档并使用特定规则提取文本、图像和元数据。",
|
|
109
|
-
"input_directory_help": "包含待扫描文档的输入目录",
|
|
110
|
-
"rules_file_help": "规则文件 (JSON格式)",
|
|
111
|
-
"recursive_help": "递归扫描文件",
|
|
112
|
-
"file_types_help": "要扫描的文件类型(逗号分隔)",
|
|
113
|
-
"use_pdf_ocr_help": "对基于图像的PDF文件使用OCR",
|
|
114
|
-
"use_process_pool_help": "使用进程池而非线程池(更适合CPU密集型任务)",
|
|
115
|
-
"batch_size_help": "每批次处理的文件数量",
|
|
116
|
-
"threads_help": "并行扫描的线程数",
|
|
117
|
-
"progress_help": "显示进度条",
|
|
118
|
-
"verbose_help": "详细输出",
|
|
119
|
-
"language_help": "设置语言(en表示英文,zh表示中文)",
|
|
120
|
-
# Status and logging messages
|
|
121
|
-
"scanning_directory": "正在扫描目录: {directory}",
|
|
122
|
-
"found_files_to_scan": "发现 {count} 个文件待扫描",
|
|
123
|
-
"scan_stopped_before_submitting_tasks": "用户在提交所有任务之前停止扫描",
|
|
124
|
-
"scan_paused": "扫描已暂停",
|
|
125
|
-
"scan_stopped_while_paused": "扫描在暂停状态下停止",
|
|
126
|
-
"scan_stopped_by_user_canceling_tasks": "用户停止扫描,正在取消剩余任务...",
|
|
127
|
-
"task_timeout_scan_may_be_stopping": "任务超时,扫描可能正在停止",
|
|
128
|
-
"error_scanning_file": "扫描文件时出错: {error}",
|
|
129
|
-
"progress_report": "进度: {processed}/{total} 文件已处理",
|
|
130
|
-
"force_shutting_down_executor": "强制关闭执行器...",
|
|
131
|
-
"scan_stopped_processed_files": "扫描已停止。已处理 {processed} 个文件",
|
|
132
|
-
"scan_complete_found_matches": "扫描完成。在 {matches_count} 个文件中找到匹配项",
|
|
133
|
-
"found_matches_in_file": "找到匹配项: {file_name}",
|
|
134
|
-
"processed_file_info": "已处理 {file_name} ({ext}),耗时 {time:.3f}秒 - 找到 {matches_count} 个匹配项",
|
|
135
|
-
"could_not_extract_text_from_file": "无法从 {file_path} 提取文本: {error}",
|
|
136
|
-
"pymupdf_failed_for_file": "{file_name} 的 PyMuPDF 失败: {error}",
|
|
137
|
-
"pypdf_also_failed_for_file": "{file_name} 的 pypdf 也失败: {error}",
|
|
138
|
-
"no_pdf_library_installed": "未安装PDF库 (pymupdf 或 pypdf)",
|
|
139
|
-
"pymupdf_not_installed": "未安装 PyMuPDF",
|
|
140
|
-
"no_pages_found_in_file": "在 {file_path} 中未找到页面",
|
|
141
|
-
"no_metadata_found_in_file": "在 {file_path} 中未找到元数据",
|
|
142
|
-
"pymupdf_error_trying_fallback": "PyMuPDF 在 {file_path} 上出错: {error},尝试使用 pypdf 回退",
|
|
143
|
-
"pypdf_not_installed_skipping_extraction": "未安装 pypdf,跳过PDF提取",
|
|
144
|
-
"error_extracting_pdf_with_pypdf": "使用 pypdf 提取PDF时出错: {error}",
|
|
145
|
-
"odfpy_not_installed_skipping_extraction": "未安装 odfpy,跳过ODT提取",
|
|
146
|
-
"error_extracting_odt": "提取ODT时出错: {error}",
|
|
147
|
-
"error_extracting_rtf": "提取RTF时出错: {error}",
|
|
148
|
-
"ebooklib_not_installed_skipping_extraction": "未安装 ebooklib,跳过EPUB提取",
|
|
149
|
-
"error_extracting_epub": "提取EPUB时出错: {error}",
|
|
150
|
-
"error_extracting_csv": "提取CSV时出错: {error}",
|
|
151
|
-
"error_extracting_xml": "提取XML时出错: {error}",
|
|
152
|
-
"error_extracting_html": "提取HTML时出错: {error}",
|
|
153
|
-
"error_extracting_markdown": "提取Markdown时出错: {error}",
|
|
154
|
-
"python_docx_not_installed_skipping_extraction": "未安装 python-docx,跳过DOCX提取",
|
|
155
|
-
"openpyxl_not_installed_skipping_extraction": "未安装 openpyxl,跳过XLSX提取",
|
|
156
|
-
"python_pptx_not_installed_skipping_extraction": "未安装 python-pptx,跳过PPTX提取",
|
|
157
|
-
"pillow_or_tesseract_not_installed_skipping_ocr": "未安装 PIL 或 pytesseract,跳过图像OCR",
|
|
158
|
-
"could_not_perform_ocr_on_file": "无法对 {file_path} 执行OCR: {error}",
|
|
159
|
-
"input_directory_does_not_exist": "输入目录不存在: {input_dir}",
|
|
160
|
-
"using_rules_file": "使用规则文件: {rules_file}",
|
|
161
|
-
"invalid_json_in_rules_file": "规则文件中的JSON无效: {error}",
|
|
162
|
-
"invalid_rules_format": "无效的规则格式。期望包含'rules'键的列表或字典",
|
|
163
|
-
"no_valid_rules_found": "未找到有效规则",
|
|
164
|
-
"total_time_elapsed": "总耗时: {time:.2f}秒",
|
|
165
|
-
"invalid_regex_pattern": "无效的正则表达式模式 '{pattern}': {error}",
|
|
166
|
-
"rules_file_does_not_exist_alt": "规则文件不存在: {rules_file}",
|
|
167
|
-
"image_files_supported": "支持的图像文件(需要OCR)",
|
|
168
|
-
"include_image_formats": "包含图像格式",
|
|
169
|
-
"include_image_formats_tooltip": "在扫描中包含图像格式(jpg, jpeg, png, gif, bmp, tiff)",
|
|
170
|
-
}
|
|
1
|
+
"""Chinese translations for docscan_gui."""
|
|
2
|
+
|
|
3
|
+
TRANSLATIONS = {
|
|
4
|
+
# Window and Tab Titles
|
|
5
|
+
"window_title": "文档扫描器",
|
|
6
|
+
"input_config_tab": "输入配置",
|
|
7
|
+
"scan_options_tab": "扫描选项",
|
|
8
|
+
# Input Section
|
|
9
|
+
"input_directory": "输入目录:",
|
|
10
|
+
"browse": "浏览...",
|
|
11
|
+
"rules_file": "规则文件:",
|
|
12
|
+
"file_types": "文件类型:",
|
|
13
|
+
# Options Section
|
|
14
|
+
"use_pdf_ocr": "使用 PDF OCR",
|
|
15
|
+
"use_process_pool": "使用进程池 (CPU 密集型)",
|
|
16
|
+
"threads": "线程数:",
|
|
17
|
+
"batch_size": "批处理大小:",
|
|
18
|
+
# Action Buttons
|
|
19
|
+
"start_scan": "开始扫描",
|
|
20
|
+
"pause": "暂停",
|
|
21
|
+
"resume": "恢复",
|
|
22
|
+
"stop": "停止",
|
|
23
|
+
"save_results": "保存结果",
|
|
24
|
+
"clear_results": "清除结果",
|
|
25
|
+
# Results Section
|
|
26
|
+
"results": "结果",
|
|
27
|
+
"files_scanned": "已扫描文件:",
|
|
28
|
+
"files_with_matches": "包含匹配项的文件:",
|
|
29
|
+
"progress_log": "进度日志:",
|
|
30
|
+
"match_details": "匹配详情:",
|
|
31
|
+
"selected_match_context": "选中匹配项的上下文:",
|
|
32
|
+
# Table Headers
|
|
33
|
+
"file": "文件",
|
|
34
|
+
"type": "类型",
|
|
35
|
+
"matches": "匹配数",
|
|
36
|
+
"time": "时间 (s)",
|
|
37
|
+
# Dialogs and Messages
|
|
38
|
+
"select_input_directory": "选择输入目录",
|
|
39
|
+
"select_rules_file": "选择规则文件",
|
|
40
|
+
"json_files": "JSON 文件 (*.json)",
|
|
41
|
+
"save_results_dialog": "保存结果",
|
|
42
|
+
"default_results_filename": "扫描结果_{datetime}.json",
|
|
43
|
+
# Error Messages
|
|
44
|
+
"error": "错误",
|
|
45
|
+
"warning": "警告",
|
|
46
|
+
"success": "成功",
|
|
47
|
+
"invalid_input_directory": "无效的输入目录",
|
|
48
|
+
"no_valid_rules": "没有找到有效的规则",
|
|
49
|
+
"failed_to_load_rules": "加载规则失败: {error}",
|
|
50
|
+
"no_results_to_save": "没有可保存的结果",
|
|
51
|
+
"failed_to_save_results": "保存结果失败: {error}",
|
|
52
|
+
"scan_failed": "扫描失败: {error}",
|
|
53
|
+
# Progress Messages
|
|
54
|
+
"starting_scan": "开始扫描...",
|
|
55
|
+
"scan_complete": "扫描完成!",
|
|
56
|
+
"pausing_scan": "暂停扫描...",
|
|
57
|
+
"stopping_scan": "停止扫描...",
|
|
58
|
+
"scan_completed": "扫描完成",
|
|
59
|
+
"scan_stopped": "扫描已停止",
|
|
60
|
+
"scan_resumed": "扫描已恢复",
|
|
61
|
+
"found_matches_files": "在 {count} 个文件中找到匹配项",
|
|
62
|
+
# File Info
|
|
63
|
+
"file_info": "文件",
|
|
64
|
+
"type_info": "类型",
|
|
65
|
+
"size": "大小",
|
|
66
|
+
"bytes": "字节",
|
|
67
|
+
# Match Info
|
|
68
|
+
"rule": "规则",
|
|
69
|
+
"description": "描述",
|
|
70
|
+
"line": "行 {line}: {match}",
|
|
71
|
+
"context": "上下文",
|
|
72
|
+
"default_file_types": "pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md,jpg,jpeg,png,gif,bmp,tiff",
|
|
73
|
+
"default_rules_file": "rules.json",
|
|
74
|
+
# Menu Items
|
|
75
|
+
"file_menu": "文件(&F)",
|
|
76
|
+
"settings_menu": "设置(&S)",
|
|
77
|
+
"help_menu": "帮助(&H)",
|
|
78
|
+
"preferences": "首选项(&P)...",
|
|
79
|
+
"exit": "退出(&X)",
|
|
80
|
+
"about": "关于(&A)",
|
|
81
|
+
"about_title": "关于文档扫描器",
|
|
82
|
+
"about_text": "文档扫描器 GUI\n\n版本 1.0",
|
|
83
|
+
"language": "语言",
|
|
84
|
+
# Open Results
|
|
85
|
+
"open_results": "打开结果...",
|
|
86
|
+
"open_results_file": "打开扫描结果",
|
|
87
|
+
"loaded_results_from": "已从以下位置加载结果: {path}",
|
|
88
|
+
"results_loaded_successfully": "结果已成功从以下位置加载:\n{path}",
|
|
89
|
+
"failed_to_load_results": "加载结果失败: {error}",
|
|
90
|
+
# Settings Dialog
|
|
91
|
+
"language_settings": "语言设置",
|
|
92
|
+
"processing_options": "处理选项",
|
|
93
|
+
"performance_settings": "性能设置",
|
|
94
|
+
"language_label": "语言:",
|
|
95
|
+
"ocr_tooltip": "为扫描的 PDF 文件启用 OCR 以从图像中提取文本",
|
|
96
|
+
"process_pool_tooltip": "对 CPU 密集型操作使用多进程(可能会增加内存使用)",
|
|
97
|
+
"threads_tooltip": "工作线程数量(较高的值可能会提高速度但会使用更多 CPU)",
|
|
98
|
+
"batch_size_tooltip": "每批处理的文件数量(较大的批次可能会提高吞吐量)",
|
|
99
|
+
"file_types_tooltip": "要扫描的文件类型(逗号分隔)",
|
|
100
|
+
# Results Saved Message
|
|
101
|
+
"results_saved_to": "结果已保存至:\t{path}",
|
|
102
|
+
# Default Values
|
|
103
|
+
"files_scanned_zero": "已扫描文件: 0",
|
|
104
|
+
"files_with_matches_zero": "包含匹配项的文件: 0",
|
|
105
|
+
# Apply Button
|
|
106
|
+
"apply": "应用",
|
|
107
|
+
# Command-line specific translations
|
|
108
|
+
"document_scanner_description": "扫描文档并使用特定规则提取文本、图像和元数据。",
|
|
109
|
+
"input_directory_help": "包含待扫描文档的输入目录",
|
|
110
|
+
"rules_file_help": "规则文件 (JSON格式)",
|
|
111
|
+
"recursive_help": "递归扫描文件",
|
|
112
|
+
"file_types_help": "要扫描的文件类型(逗号分隔)",
|
|
113
|
+
"use_pdf_ocr_help": "对基于图像的PDF文件使用OCR",
|
|
114
|
+
"use_process_pool_help": "使用进程池而非线程池(更适合CPU密集型任务)",
|
|
115
|
+
"batch_size_help": "每批次处理的文件数量",
|
|
116
|
+
"threads_help": "并行扫描的线程数",
|
|
117
|
+
"progress_help": "显示进度条",
|
|
118
|
+
"verbose_help": "详细输出",
|
|
119
|
+
"language_help": "设置语言(en表示英文,zh表示中文)",
|
|
120
|
+
# Status and logging messages
|
|
121
|
+
"scanning_directory": "正在扫描目录: {directory}",
|
|
122
|
+
"found_files_to_scan": "发现 {count} 个文件待扫描",
|
|
123
|
+
"scan_stopped_before_submitting_tasks": "用户在提交所有任务之前停止扫描",
|
|
124
|
+
"scan_paused": "扫描已暂停",
|
|
125
|
+
"scan_stopped_while_paused": "扫描在暂停状态下停止",
|
|
126
|
+
"scan_stopped_by_user_canceling_tasks": "用户停止扫描,正在取消剩余任务...",
|
|
127
|
+
"task_timeout_scan_may_be_stopping": "任务超时,扫描可能正在停止",
|
|
128
|
+
"error_scanning_file": "扫描文件时出错: {error}",
|
|
129
|
+
"progress_report": "进度: {processed}/{total} 文件已处理",
|
|
130
|
+
"force_shutting_down_executor": "强制关闭执行器...",
|
|
131
|
+
"scan_stopped_processed_files": "扫描已停止。已处理 {processed} 个文件",
|
|
132
|
+
"scan_complete_found_matches": "扫描完成。在 {matches_count} 个文件中找到匹配项",
|
|
133
|
+
"found_matches_in_file": "找到匹配项: {file_name}",
|
|
134
|
+
"processed_file_info": "已处理 {file_name} ({ext}),耗时 {time:.3f}秒 - 找到 {matches_count} 个匹配项",
|
|
135
|
+
"could_not_extract_text_from_file": "无法从 {file_path} 提取文本: {error}",
|
|
136
|
+
"pymupdf_failed_for_file": "{file_name} 的 PyMuPDF 失败: {error}",
|
|
137
|
+
"pypdf_also_failed_for_file": "{file_name} 的 pypdf 也失败: {error}",
|
|
138
|
+
"no_pdf_library_installed": "未安装PDF库 (pymupdf 或 pypdf)",
|
|
139
|
+
"pymupdf_not_installed": "未安装 PyMuPDF",
|
|
140
|
+
"no_pages_found_in_file": "在 {file_path} 中未找到页面",
|
|
141
|
+
"no_metadata_found_in_file": "在 {file_path} 中未找到元数据",
|
|
142
|
+
"pymupdf_error_trying_fallback": "PyMuPDF 在 {file_path} 上出错: {error},尝试使用 pypdf 回退",
|
|
143
|
+
"pypdf_not_installed_skipping_extraction": "未安装 pypdf,跳过PDF提取",
|
|
144
|
+
"error_extracting_pdf_with_pypdf": "使用 pypdf 提取PDF时出错: {error}",
|
|
145
|
+
"odfpy_not_installed_skipping_extraction": "未安装 odfpy,跳过ODT提取",
|
|
146
|
+
"error_extracting_odt": "提取ODT时出错: {error}",
|
|
147
|
+
"error_extracting_rtf": "提取RTF时出错: {error}",
|
|
148
|
+
"ebooklib_not_installed_skipping_extraction": "未安装 ebooklib,跳过EPUB提取",
|
|
149
|
+
"error_extracting_epub": "提取EPUB时出错: {error}",
|
|
150
|
+
"error_extracting_csv": "提取CSV时出错: {error}",
|
|
151
|
+
"error_extracting_xml": "提取XML时出错: {error}",
|
|
152
|
+
"error_extracting_html": "提取HTML时出错: {error}",
|
|
153
|
+
"error_extracting_markdown": "提取Markdown时出错: {error}",
|
|
154
|
+
"python_docx_not_installed_skipping_extraction": "未安装 python-docx,跳过DOCX提取",
|
|
155
|
+
"openpyxl_not_installed_skipping_extraction": "未安装 openpyxl,跳过XLSX提取",
|
|
156
|
+
"python_pptx_not_installed_skipping_extraction": "未安装 python-pptx,跳过PPTX提取",
|
|
157
|
+
"pillow_or_tesseract_not_installed_skipping_ocr": "未安装 PIL 或 pytesseract,跳过图像OCR",
|
|
158
|
+
"could_not_perform_ocr_on_file": "无法对 {file_path} 执行OCR: {error}",
|
|
159
|
+
"input_directory_does_not_exist": "输入目录不存在: {input_dir}",
|
|
160
|
+
"using_rules_file": "使用规则文件: {rules_file}",
|
|
161
|
+
"invalid_json_in_rules_file": "规则文件中的JSON无效: {error}",
|
|
162
|
+
"invalid_rules_format": "无效的规则格式。期望包含'rules'键的列表或字典",
|
|
163
|
+
"no_valid_rules_found": "未找到有效规则",
|
|
164
|
+
"total_time_elapsed": "总耗时: {time:.2f}秒",
|
|
165
|
+
"invalid_regex_pattern": "无效的正则表达式模式 '{pattern}': {error}",
|
|
166
|
+
"rules_file_does_not_exist_alt": "规则文件不存在: {rules_file}",
|
|
167
|
+
"image_files_supported": "支持的图像文件(需要OCR)",
|
|
168
|
+
"include_image_formats": "包含图像格式",
|
|
169
|
+
"include_image_formats_tooltip": "在扫描中包含图像格式(jpg, jpeg, png, gif, bmp, tiff)",
|
|
170
|
+
}
|
sfi/filedate/filedate.py
CHANGED
|
@@ -1,112 +1,185 @@
|
|
|
1
|
-
"""Remove file date prefix and replace with creation/modification date."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import argparse
|
|
6
|
-
import concurrent.futures
|
|
7
|
-
import logging
|
|
8
|
-
import
|
|
9
|
-
import
|
|
10
|
-
|
|
11
|
-
from
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
1
|
+
"""Remove file date prefix and replace with creation/modification date."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import concurrent.futures
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from functools import cached_property, lru_cache
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
# Characters that may sit directly before/after a date token in a file name;
# one of them is removed together with the date.
DETECT_SEPARATORS: str = "-_#.~"
# Separator written between the new date mark and the remaining stem.
SEP: str = "_"
# Highest "(n)" suffix tried when the target name is already taken.
MAX_RETRY: int = 100
# Matches an 8-digit YYYYMMDD date (years 19xx/20xx, months 01-12, days 01-31).
DATE_PATTERN = re.compile(r"(20|19)\d{2}((0[1-9])|(1[012]))((0[1-9])|([12]\d)|(3[01]))")

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class SingleFileRenamer:
    """Compute and apply the date-prefixed name for a single file.

    The new name is ``<YYYYMMDD><SEP><stem without dates><suffix>``, where the
    date mark comes from the later of the file's ``st_mtime`` and ``st_ctime``.
    """

    # Path of the file to rename.
    path: Path

    @staticmethod
    def rename(path: Path) -> None:
        """Rename ``path`` to its date-prefixed name.

        A file that already carries its target name is left untouched. When
        the target name is taken by another file, ``(1)`` .. ``(MAX_RETRY)``
        suffixes are tried. Failures are logged, never raised.
        """
        renamer = SingleFileRenamer(path)

        try:
            base_path = renamer.filepath_renamed
        except OSError as exc:
            # stat() failed (e.g. the file vanished); nothing can be computed.
            logger.error(f"Cannot inspect {renamer.path}: {exc}")
            return

        # The file is already correctly named: treating it as a collision
        # would needlessly rename it to "<name>(1)<suffix>".
        if base_path == renamer.path:
            logger.debug(f"Already named correctly: {renamer.path}")
            return

        dest_path = base_path
        sequence = 1
        while dest_path.exists() and sequence <= MAX_RETRY:
            logger.warning(f"{dest_path} already exists, adding unique suffix.")
            dest_path = base_path.with_name(
                f"{renamer.filestem_renamed}({sequence}){renamer.file_suffix}",
            )
            sequence += 1

        # The loop only ends with an existing path once the retries ran out.
        if dest_path.exists():
            logger.error(f"Max retry reached, giving up on {renamer.path}.")
            return

        try:
            renamer.path.rename(dest_path)
        except OSError as exc:
            logger.error(f"Rename failed: {renamer.path} -> {dest_path} ({exc})")
        else:
            logger.info(f"Rename: {renamer.path} -> {dest_path}")

    @staticmethod
    def _strip_dates(filestem: str) -> str:
        """Return ``filestem`` with every ``DATE_PATTERN`` match removed.

        One adjacent separator from ``DETECT_SEPARATORS`` (the preceding one
        is preferred, otherwise the following one) is removed with each date.
        A date without an adjacent separator is removed on its own, which
        guarantees the loop always makes progress.
        """
        while True:
            match = DATE_PATTERN.search(filestem)
            if match is None:
                # NOTE(review): a date-free stem that starts with "." and is
                # not all digits (e.g. ".txt") collapses to "" - behavior kept
                # from the original implementation; confirm it is intended.
                if (
                    len(filestem) > 1
                    and filestem.startswith(".")
                    and not filestem[1:].isdigit()
                ):
                    return ""
                logger.debug(f"No date prefix found: {filestem}")
                return filestem

            begin, end = match.span()
            if begin >= 1 and filestem[begin - 1] in DETECT_SEPARATORS:
                filestem = filestem[: begin - 1] + filestem[end:]
            elif end < len(filestem) and filestem[end] in DETECT_SEPARATORS:
                filestem = filestem[:begin] + filestem[end + 1 :]
            else:
                # No separator on either side: drop just the date. Leaving the
                # string unchanged here would loop forever.
                filestem = filestem[:begin] + filestem[end:]

    @cached_property
    def file_suffix(self) -> str:
        """Get file suffix."""
        return self.path.suffix

    @cached_property
    def filepath_renamed(self) -> Path:
        """Get renamed filepath (same directory, new name)."""
        return self.path.with_name(self.filename_renamed)

    @cached_property
    def filestem_renamed(self) -> str:
        """Get renamed file stem, i.e. the new name without the suffix."""
        return f"{self.time_mark}{SEP}{self.filestem_without_date}"

    @cached_property
    def filename_renamed(self) -> str:
        """Get renamed filename."""
        return f"{self.filestem_renamed}{self.file_suffix}"

    @cached_property
    def filestem_without_date(self) -> str:
        """Get file stem with all date tokens removed."""
        return self._strip_dates(self.filestem)

    @cached_property
    def filestem(self) -> str:
        """Get current file stem."""
        return self.path.stem

    @cached_property
    def filestat(self) -> os.stat_result:
        """Get file stat."""
        return self.path.stat()

    @cached_property
    def modified_time(self) -> float:
        """Get modified time (``st_mtime``)."""
        return self.filestat.st_mtime

    @cached_property
    def created_time(self) -> float:
        """Get created time (``st_ctime``)."""
        return self.filestat.st_ctime

    @cached_property
    def time_mark(self) -> str:
        """Get the ``YYYYMMDD`` mark of the later of the two timestamps."""
        return time.strftime(
            "%Y%m%d",
            time.localtime(max(self.modified_time, self.created_time)),
        )
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclass(frozen=True)
class MultiFileRenamer:
    """Renamer for multiple files."""

    # Raw path strings as given by the caller.
    paths: list[str]

    @staticmethod
    def rename_files(paths: list[str]) -> None:
        """Rename every existing path in ``paths`` using a thread pool.

        Missing paths are reported and skipped. Any exception raised while
        processing a file is logged instead of being silently dropped.
        """
        renamer = MultiFileRenamer(paths)

        for missing in renamer.missing_paths:
            logger.warning(f"Skipping missing path: {missing}")

        targets = renamer.filtered_paths
        if not targets:
            logger.error("No valid files to process.")
            return

        t0 = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            pending = {
                executor.submit(SingleFileRenamer.rename, target): target
                for target in targets
            }
            # Inspect every future: an unconsumed executor.map() would hide
            # exceptions raised inside the worker threads.
            for future in concurrent.futures.as_completed(pending):
                error = future.exception()
                if error is not None:
                    logger.error(f"Failed to process {pending[future]}: {error}")
        logger.info(f"Done in {time.perf_counter() - t0:.4f}s")

    @cached_property
    def converted_paths(self) -> list[Path]:
        """Get the input strings converted to ``Path`` objects."""
        return [Path(p) for p in self.paths]

    @cached_property
    def filtered_paths(self) -> list[Path]:
        """Get the converted paths that exist on disk."""
        return [p for p in self.converted_paths if p.exists()]

    @cached_property
    def missing_paths(self) -> list[Path]:
        """Get the converted paths that do not exist, deduplicated, in input order."""
        existing = set(self.filtered_paths)
        return [p for p in dict.fromkeys(self.converted_paths) if p not in existing]
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def main() -> None:
    """Command-line entry point: parse the arguments and rename the targets."""
    cli = argparse.ArgumentParser(
        description="Remove file date prefix and replace with creation/modification date.",
        prog="filedate",
    )
    cli.add_argument("targets", nargs="+", type=str, help="List of input files")
    cli.add_argument("--debug", "-d", help="Enable debug mode", action="store_true")
    options = cli.parse_args()

    # Debug mode only raises the verbosity of this module's logger.
    if options.debug:
        logger.setLevel(logging.DEBUG)

    MultiFileRenamer.rename_files(options.targets)
|