orbitkit 0.8.50__tar.gz → 0.8.51__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.50/orbitkit.egg-info → orbitkit-0.8.51}/PKG-INFO +2 -15
- orbitkit-0.8.51/orbitkit/VERSION +1 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/__init__.py +3 -0
- orbitkit-0.8.51/orbitkit/util/universal_extractor.py +497 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51/orbitkit.egg-info}/PKG-INFO +2 -15
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit.egg-info/SOURCES.txt +1 -0
- orbitkit-0.8.50/orbitkit/VERSION +0 -1
- {orbitkit-0.8.50 → orbitkit-0.8.51}/LICENSE +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/MANIFEST.in +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/README.md +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/setup.cfg +0 -0
- {orbitkit-0.8.50 → orbitkit-0.8.51}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: orbitkit
|
|
3
|
-
Version: 0.8.50
|
|
3
|
+
Version: 0.8.51
|
|
4
4
|
Summary: This project is only for Orbit Tech internal use.
|
|
5
5
|
Home-page: https://github.com/clown-0726/orbitkit
|
|
6
6
|
Author: Lilu Cao
|
|
@@ -37,19 +37,6 @@ Requires-Dist: prettytable>=3.16.0
|
|
|
37
37
|
Requires-Dist: pytz>=2025.2
|
|
38
38
|
Requires-Dist: Deprecated
|
|
39
39
|
Requires-Dist: func_timeout
|
|
40
|
-
Dynamic: author
|
|
41
|
-
Dynamic: author-email
|
|
42
|
-
Dynamic: classifier
|
|
43
|
-
Dynamic: description
|
|
44
|
-
Dynamic: description-content-type
|
|
45
|
-
Dynamic: home-page
|
|
46
|
-
Dynamic: license
|
|
47
|
-
Dynamic: license-file
|
|
48
|
-
Dynamic: maintainer
|
|
49
|
-
Dynamic: maintainer-email
|
|
50
|
-
Dynamic: platform
|
|
51
|
-
Dynamic: requires-dist
|
|
52
|
-
Dynamic: summary
|
|
53
40
|
|
|
54
41
|
# orbitkit
|
|
55
42
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.51
|
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import tempfile
|
|
4
|
+
import zipfile
|
|
5
|
+
import tarfile
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional, List, Union, Set
|
|
8
|
+
import logging
|
|
9
|
+
import platform
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import rarfile
|
|
13
|
+
import py7zr
|
|
14
|
+
except ImportError:
|
|
15
|
+
raise ValueError(
|
|
16
|
+
"Please install below packages before using this function.\n"
|
|
17
|
+
"- rarfile\n"
|
|
18
|
+
"- py7zr\n"
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
if platform.system() == 'Windows':
|
|
24
|
+
logger.warning(
|
|
25
|
+
"Windows system requires additional UnRAR tool to extract RAR files. Please download and install UnRAR from https://www.rarlab.com/rar_add.htm")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class UniversalExtractor:
|
|
29
|
+
"""
|
|
30
|
+
通用压缩文件解压类,支持常见压缩格式和特殊情况处理
|
|
31
|
+
|
|
32
|
+
可以作为上下文管理器使用以自动清理临时目录:
|
|
33
|
+
with UniversalExtractor() as extractor:
|
|
34
|
+
result = extractor.extract('file.zip')
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
# 支持的压缩格式及其对应扩展名
|
|
38
|
+
SUPPORTED_FORMATS = {
|
|
39
|
+
'zip': ['.zip'],
|
|
40
|
+
'tar': ['.tar'],
|
|
41
|
+
'tar.gz': ['.tar.gz', '.tgz'],
|
|
42
|
+
'tar.bz2': ['.tar.bz2', '.tbz2'],
|
|
43
|
+
'tar.xz': ['.tar.xz', '.txz'],
|
|
44
|
+
'rar': ['.rar'],
|
|
45
|
+
'7z': ['.7z']
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
def __init__(self):
|
|
49
|
+
self.temp_dir = None
|
|
50
|
+
self._temp_dirs = [] # 跟踪所有创建的临时目录
|
|
51
|
+
|
|
52
|
+
def __enter__(self):
|
|
53
|
+
"""上下文管理器入口"""
|
|
54
|
+
return self
|
|
55
|
+
|
|
56
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
57
|
+
"""上下文管理器退出,清理临时目录"""
|
|
58
|
+
self.cleanup()
|
|
59
|
+
return False
|
|
60
|
+
|
|
61
|
+
def cleanup(self):
|
|
62
|
+
"""清理所有临时目录"""
|
|
63
|
+
for temp_dir in self._temp_dirs:
|
|
64
|
+
if temp_dir and os.path.exists(temp_dir):
|
|
65
|
+
try:
|
|
66
|
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
|
67
|
+
logger.info(f"已清理临时目录: {temp_dir}")
|
|
68
|
+
except Exception as e:
|
|
69
|
+
logger.warning(f"清理临时目录失败 {temp_dir}: {str(e)}")
|
|
70
|
+
self._temp_dirs.clear()
|
|
71
|
+
self.temp_dir = None
|
|
72
|
+
|
|
73
|
+
def _get_file_format(self, file_path: Union[str, Path]) -> Optional[str]:
|
|
74
|
+
"""根据文件扩展名判断压缩格式"""
|
|
75
|
+
file_path = Path(file_path)
|
|
76
|
+
suffix = file_path.suffix.lower()
|
|
77
|
+
|
|
78
|
+
# 检查多部分扩展名(如.tar.gz)
|
|
79
|
+
if len(file_path.suffixes) > 1:
|
|
80
|
+
multi_suffix = ''.join(file_path.suffixes[-2:]).lower()
|
|
81
|
+
for format_name, extensions in self.SUPPORTED_FORMATS.items():
|
|
82
|
+
if multi_suffix in extensions:
|
|
83
|
+
return format_name
|
|
84
|
+
|
|
85
|
+
# 检查单扩展名
|
|
86
|
+
for format_name, extensions in self.SUPPORTED_FORMATS.items():
|
|
87
|
+
if suffix in extensions:
|
|
88
|
+
return format_name
|
|
89
|
+
|
|
90
|
+
return None
|
|
91
|
+
|
|
92
|
+
def _create_output_dir(self, output_path: Optional[str] = None) -> str:
|
|
93
|
+
"""创建输出目录"""
|
|
94
|
+
if output_path:
|
|
95
|
+
os.makedirs(output_path, exist_ok=True)
|
|
96
|
+
return output_path
|
|
97
|
+
else:
|
|
98
|
+
self.temp_dir = tempfile.mkdtemp(prefix="unpack_")
|
|
99
|
+
self._temp_dirs.append(self.temp_dir) # 跟踪临时目录
|
|
100
|
+
return self.temp_dir
|
|
101
|
+
|
|
102
|
+
def _extract_zip(self, archive_path: str, output_path: str, password: Optional[str] = None) -> List[str]:
|
|
103
|
+
"""解压ZIP文件"""
|
|
104
|
+
extracted_files = []
|
|
105
|
+
|
|
106
|
+
try:
|
|
107
|
+
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
|
|
108
|
+
# 尝试检测加密文件
|
|
109
|
+
encrypted_files = [f for f in zip_ref.namelist() if zip_ref.getinfo(f).flag_bits & 0x1]
|
|
110
|
+
if encrypted_files and not password:
|
|
111
|
+
logger.warning("发现加密文件,但未提供密码")
|
|
112
|
+
|
|
113
|
+
# 尝试解压
|
|
114
|
+
try:
|
|
115
|
+
zip_ref.extractall(output_path, pwd=password.encode() if password else None)
|
|
116
|
+
except RuntimeError as e:
|
|
117
|
+
if "Bad password" in str(e) or "password" in str(e).lower():
|
|
118
|
+
raise ValueError("密码错误或需要密码") from e
|
|
119
|
+
raise
|
|
120
|
+
|
|
121
|
+
# 获取所有提取的文件
|
|
122
|
+
for member in zip_ref.namelist():
|
|
123
|
+
member_path = os.path.join(output_path, member)
|
|
124
|
+
if os.path.exists(member_path) and os.path.isfile(member_path):
|
|
125
|
+
extracted_files.append(member_path)
|
|
126
|
+
|
|
127
|
+
except zipfile.BadZipFile:
|
|
128
|
+
raise ValueError("ZIP文件损坏或格式不正确")
|
|
129
|
+
|
|
130
|
+
return extracted_files
|
|
131
|
+
|
|
132
|
+
def _extract_tar(self, archive_path: str, output_path: str) -> List[str]:
|
|
133
|
+
"""解压TAR文件"""
|
|
134
|
+
extracted_files = []
|
|
135
|
+
|
|
136
|
+
try:
|
|
137
|
+
with tarfile.open(archive_path, 'r') as tar_ref:
|
|
138
|
+
# 安全提取,防止路径遍历攻击
|
|
139
|
+
safe_members = []
|
|
140
|
+
for member in tar_ref.getmembers():
|
|
141
|
+
# 确保成员路径安全
|
|
142
|
+
member_path = os.path.normpath(member.name)
|
|
143
|
+
if member_path.startswith(('..', '/', '\\')):
|
|
144
|
+
# 只保留文件名,移除危险路径
|
|
145
|
+
member.name = os.path.basename(member_path)
|
|
146
|
+
logger.warning(f"检测到不安全路径,已重命名: {member_path} -> {member.name}")
|
|
147
|
+
safe_members.append(member)
|
|
148
|
+
|
|
149
|
+
# 只提取安全的成员
|
|
150
|
+
for member in safe_members:
|
|
151
|
+
tar_ref.extract(member, output_path)
|
|
152
|
+
if member.isfile():
|
|
153
|
+
member_path = os.path.join(output_path, member.name)
|
|
154
|
+
if os.path.exists(member_path):
|
|
155
|
+
extracted_files.append(member_path)
|
|
156
|
+
|
|
157
|
+
except tarfile.ReadError:
|
|
158
|
+
raise ValueError("TAR文件损坏或格式不正确")
|
|
159
|
+
|
|
160
|
+
return extracted_files
|
|
161
|
+
|
|
162
|
+
def _extract_rar(self, archive_path: str, output_path: str, password: Optional[str] = None) -> List[str]:
|
|
163
|
+
"""解压RAR文件"""
|
|
164
|
+
extracted_files = []
|
|
165
|
+
|
|
166
|
+
try:
|
|
167
|
+
# 检查是否安装了unrar工具
|
|
168
|
+
if not rarfile.UNRAR_TOOL:
|
|
169
|
+
logger.warning("未找到unrar工具,RAR解压可能受限")
|
|
170
|
+
|
|
171
|
+
with rarfile.RarFile(archive_path, 'r') as rar_ref:
|
|
172
|
+
# 尝试解压
|
|
173
|
+
try:
|
|
174
|
+
rar_ref.extractall(output_path, pwd=password)
|
|
175
|
+
except rarfile.PasswordRequired:
|
|
176
|
+
raise ValueError("需要密码来解压RAR文件")
|
|
177
|
+
except rarfile.RarCannotExec:
|
|
178
|
+
raise ValueError("需要unrar工具,无法解压RAR文件")
|
|
179
|
+
|
|
180
|
+
# 获取所有提取的文件
|
|
181
|
+
for member in rar_ref.infolist():
|
|
182
|
+
if not member.isdir():
|
|
183
|
+
member_path = os.path.join(output_path, member.filename)
|
|
184
|
+
if os.path.exists(member_path):
|
|
185
|
+
extracted_files.append(member_path)
|
|
186
|
+
|
|
187
|
+
except rarfile.NotRarFile:
|
|
188
|
+
raise ValueError("不是有效的RAR文件")
|
|
189
|
+
except rarfile.BadRarFile:
|
|
190
|
+
raise ValueError("RAR文件损坏或格式不正确")
|
|
191
|
+
except rarfile.NeedFirstVolume:
|
|
192
|
+
raise ValueError("需要多卷RAR文件的第一卷")
|
|
193
|
+
|
|
194
|
+
return extracted_files
|
|
195
|
+
|
|
196
|
+
def _extract_7z(self, archive_path: str, output_path: str, password: Optional[str] = None) -> List[str]:
|
|
197
|
+
"""解压7Z文件"""
|
|
198
|
+
extracted_files = []
|
|
199
|
+
|
|
200
|
+
try:
|
|
201
|
+
with py7zr.SevenZipFile(archive_path, 'r', password=password) as zip_ref:
|
|
202
|
+
zip_ref.extractall(output_path)
|
|
203
|
+
|
|
204
|
+
# 获取所有提取的文件
|
|
205
|
+
for member in zip_ref.list():
|
|
206
|
+
if not member.is_directory:
|
|
207
|
+
member_path = os.path.join(output_path, member.filename)
|
|
208
|
+
if os.path.exists(member_path):
|
|
209
|
+
extracted_files.append(member_path)
|
|
210
|
+
|
|
211
|
+
except py7zr.Bad7zFile:
|
|
212
|
+
raise ValueError("7Z文件损坏或格式不正确")
|
|
213
|
+
except py7zr.PasswordRequired:
|
|
214
|
+
raise ValueError("需要密码来解压7Z文件")
|
|
215
|
+
except Exception as e:
|
|
216
|
+
if str(e) == 'Corrupt input data':
|
|
217
|
+
raise ValueError("7Z文件密码错误")
|
|
218
|
+
raise e
|
|
219
|
+
|
|
220
|
+
return extracted_files
|
|
221
|
+
|
|
222
|
+
def _remove_empty_directories(self, directory: str):
|
|
223
|
+
"""递归删除空目录"""
|
|
224
|
+
for root, dirs, files in os.walk(directory, topdown=False):
|
|
225
|
+
for dir_name in dirs:
|
|
226
|
+
dir_path = os.path.join(root, dir_name)
|
|
227
|
+
try:
|
|
228
|
+
# 尝试删除目录,如果目录不为空会抛出OSError异常
|
|
229
|
+
os.rmdir(dir_path)
|
|
230
|
+
logger.debug(f"删除空目录: {dir_path}")
|
|
231
|
+
except OSError:
|
|
232
|
+
# 目录非空,保留
|
|
233
|
+
pass
|
|
234
|
+
|
|
235
|
+
def _flatten_directory(self, directory: str):
|
|
236
|
+
"""将目录中的所有文件移动到顶层目录,并删除子目录"""
|
|
237
|
+
# 先收集所有需要移动的文件,避免在遍历时修改目录结构
|
|
238
|
+
files_to_move = []
|
|
239
|
+
for root, dirs, files in os.walk(directory):
|
|
240
|
+
for file in files:
|
|
241
|
+
src_path = os.path.join(root, file)
|
|
242
|
+
# 如果文件不在顶层目录
|
|
243
|
+
if root != directory:
|
|
244
|
+
files_to_move.append((src_path, file))
|
|
245
|
+
|
|
246
|
+
# 然后再移动文件
|
|
247
|
+
for src_path, file in files_to_move:
|
|
248
|
+
# 生成目标文件名,处理同名文件
|
|
249
|
+
base_name = file
|
|
250
|
+
dest_path = os.path.join(directory, base_name)
|
|
251
|
+
counter = 1
|
|
252
|
+
|
|
253
|
+
# 处理文件名冲突
|
|
254
|
+
while os.path.exists(dest_path):
|
|
255
|
+
name, ext = os.path.splitext(base_name)
|
|
256
|
+
dest_path = os.path.join(directory, f"{name}_{counter}{ext}")
|
|
257
|
+
counter += 1
|
|
258
|
+
|
|
259
|
+
# 移动文件
|
|
260
|
+
try:
|
|
261
|
+
shutil.move(src_path, dest_path)
|
|
262
|
+
except FileNotFoundError:
|
|
263
|
+
# 文件可能已被移动,跳过
|
|
264
|
+
logger.warning(f"文件不存在,跳过: {src_path}")
|
|
265
|
+
|
|
266
|
+
# 删除所有子目录
|
|
267
|
+
self._remove_empty_directories(directory)
|
|
268
|
+
|
|
269
|
+
def _handle_nested_archives(self, output_path: str, password: Optional[str] = None,
|
|
270
|
+
recursive: bool = True, current_depth: int = 0):
|
|
271
|
+
"""处理嵌套压缩文件(压缩包中的压缩包)"""
|
|
272
|
+
if not recursive:
|
|
273
|
+
return
|
|
274
|
+
|
|
275
|
+
max_nested_level = 10 # 防止无限递归
|
|
276
|
+
if current_depth >= max_nested_level:
|
|
277
|
+
logger.warning(f"达到最大嵌套深度 {max_nested_level},停止递归解压")
|
|
278
|
+
return
|
|
279
|
+
|
|
280
|
+
# 先收集所有压缩文件,避免在遍历时修改目录结构
|
|
281
|
+
archive_files = []
|
|
282
|
+
for root, _, files in os.walk(output_path):
|
|
283
|
+
for file in files:
|
|
284
|
+
file_path = os.path.join(root, file)
|
|
285
|
+
file_format = self._get_file_format(file_path)
|
|
286
|
+
if file_format:
|
|
287
|
+
archive_files.append(file_path)
|
|
288
|
+
|
|
289
|
+
# 然后处理所有收集到的压缩文件
|
|
290
|
+
for file_path in archive_files:
|
|
291
|
+
if not os.path.exists(file_path):
|
|
292
|
+
# 文件可能已被删除
|
|
293
|
+
continue
|
|
294
|
+
|
|
295
|
+
try:
|
|
296
|
+
# 为嵌套压缩包创建解压目录,处理多扩展名
|
|
297
|
+
file_format = self._get_file_format(file_path)
|
|
298
|
+
# 获取不含压缩扩展名的路径
|
|
299
|
+
nested_output = self._get_path_without_archive_extension(file_path, file_format)
|
|
300
|
+
os.makedirs(nested_output, exist_ok=True)
|
|
301
|
+
|
|
302
|
+
# 递归解压,增加深度
|
|
303
|
+
self._extract_file_internal(file_path, nested_output, password, recursive, current_depth + 1)
|
|
304
|
+
|
|
305
|
+
# 删除原始压缩包
|
|
306
|
+
os.remove(file_path)
|
|
307
|
+
logger.info(f"已解压嵌套压缩文件 (深度 {current_depth + 1}): {file_path}")
|
|
308
|
+
|
|
309
|
+
except Exception as e:
|
|
310
|
+
logger.warning(f"无法解压嵌套压缩文件 {file_path}: {str(e)}")
|
|
311
|
+
# 保留原始压缩包
|
|
312
|
+
|
|
313
|
+
def is_archive_file(self, file_path: Union[str, Path]) -> bool:
|
|
314
|
+
"""根据文件扩展名判断是否为压缩文件"""
|
|
315
|
+
file_format = self._get_file_format(file_path)
|
|
316
|
+
return False if not file_format else True
|
|
317
|
+
|
|
318
|
+
def _get_path_without_archive_extension(self, file_path: str, file_format: str) -> str:
|
|
319
|
+
"""获取去除压缩扩展名后的路径,处理多扩展名如 .tar.gz"""
|
|
320
|
+
path = Path(file_path)
|
|
321
|
+
|
|
322
|
+
# 对于多部分扩展名
|
|
323
|
+
if file_format in ['tar.gz', 'tar.bz2', 'tar.xz']:
|
|
324
|
+
# 移除两个扩展名
|
|
325
|
+
return str(path.parent / path.stem.rsplit('.', 1)[0])
|
|
326
|
+
else:
|
|
327
|
+
# 单扩展名
|
|
328
|
+
return str(path.parent / path.stem)
|
|
329
|
+
|
|
330
|
+
def _extract_file_internal(self,
|
|
331
|
+
archive_path: Union[str, Path],
|
|
332
|
+
output_path: str,
|
|
333
|
+
password: Optional[str] = None,
|
|
334
|
+
recursive: bool = True,
|
|
335
|
+
current_depth: int = 0) -> List[str]:
|
|
336
|
+
"""
|
|
337
|
+
内部解压方法,支持深度追踪
|
|
338
|
+
"""
|
|
339
|
+
# 获取文件格式
|
|
340
|
+
file_format = self._get_file_format(archive_path)
|
|
341
|
+
if not file_format:
|
|
342
|
+
raise ValueError(f"不支持的压缩格式: {archive_path}")
|
|
343
|
+
|
|
344
|
+
# 根据格式选择解压方法
|
|
345
|
+
if file_format == 'zip':
|
|
346
|
+
extracted_files = self._extract_zip(str(archive_path), output_path, password)
|
|
347
|
+
elif file_format in ['tar', 'tar.gz', 'tar.bz2', 'tar.xz']:
|
|
348
|
+
extracted_files = self._extract_tar(str(archive_path), output_path)
|
|
349
|
+
elif file_format == 'rar':
|
|
350
|
+
extracted_files = self._extract_rar(str(archive_path), output_path, password)
|
|
351
|
+
elif file_format == '7z':
|
|
352
|
+
extracted_files = self._extract_7z(str(archive_path), output_path, password)
|
|
353
|
+
else:
|
|
354
|
+
raise ValueError(f"不支持的压缩格式: {file_format}")
|
|
355
|
+
|
|
356
|
+
logger.info(f"成功解压 {len(extracted_files)} 个文件")
|
|
357
|
+
|
|
358
|
+
# 处理嵌套压缩文件
|
|
359
|
+
if recursive:
|
|
360
|
+
self._handle_nested_archives(output_path, password, recursive, current_depth)
|
|
361
|
+
|
|
362
|
+
return extracted_files
|
|
363
|
+
|
|
364
|
+
def _extract_file(self,
|
|
365
|
+
archive_path: Union[str, Path],
|
|
366
|
+
output_path: str,
|
|
367
|
+
password: Optional[str] = None,
|
|
368
|
+
recursive: bool = True) -> List[str]:
|
|
369
|
+
"""
|
|
370
|
+
解压方法的公共接口
|
|
371
|
+
"""
|
|
372
|
+
return self._extract_file_internal(archive_path, output_path, password, recursive, current_depth=0)
|
|
373
|
+
|
|
374
|
+
def _filter_files_by_extension(self, directory: str, extensions: Optional[Set[str]] = None) -> List[str]:
|
|
375
|
+
"""根据扩展名过滤文件,删除不符合条件的文件并清理空文件夹"""
|
|
376
|
+
if not extensions:
|
|
377
|
+
return []
|
|
378
|
+
|
|
379
|
+
kept_files = []
|
|
380
|
+
removed_files = []
|
|
381
|
+
|
|
382
|
+
# 首先收集所有文件并分类
|
|
383
|
+
for root, dirs, files in os.walk(directory):
|
|
384
|
+
for file in files:
|
|
385
|
+
file_path = os.path.join(root, file)
|
|
386
|
+
file_ext = os.path.splitext(file)[1].lower()
|
|
387
|
+
|
|
388
|
+
if file_ext in extensions:
|
|
389
|
+
kept_files.append(file_path)
|
|
390
|
+
else:
|
|
391
|
+
removed_files.append(file_path)
|
|
392
|
+
|
|
393
|
+
# 然后删除不需要的文件
|
|
394
|
+
for file_path in removed_files:
|
|
395
|
+
try:
|
|
396
|
+
os.remove(file_path)
|
|
397
|
+
except FileNotFoundError:
|
|
398
|
+
logger.warning(f"文件已不存在,跳过删除: {file_path}")
|
|
399
|
+
|
|
400
|
+
logger.info(f"根据扩展名过滤: 保留了 {len(kept_files)} 个文件, 删除了 {len(removed_files)} 个文件")
|
|
401
|
+
|
|
402
|
+
# 清理空目录
|
|
403
|
+
self._remove_empty_directories(directory)
|
|
404
|
+
|
|
405
|
+
return kept_files
|
|
406
|
+
|
|
407
|
+
def extract(self,
|
|
408
|
+
archive_path: Union[str, Path],
|
|
409
|
+
output_path: Optional[str] = None,
|
|
410
|
+
password: Optional[str] = None,
|
|
411
|
+
flatten: bool = False,
|
|
412
|
+
recursive: bool = True,
|
|
413
|
+
extensions: Optional[List[str]] = None) -> str:
|
|
414
|
+
"""
|
|
415
|
+
解压压缩文件
|
|
416
|
+
|
|
417
|
+
参数:
|
|
418
|
+
archive_path: 压缩文件路径
|
|
419
|
+
output_path: 输出目录路径,如果为None则使用临时目录
|
|
420
|
+
password: 解压密码(如果需要)
|
|
421
|
+
flatten: 是否将所有文件提取到同一级目录,默认为 False
|
|
422
|
+
recursive: 是否递归解压嵌套压缩文件,默认为 True
|
|
423
|
+
extensions: 只保留指定扩展名的文件,如果为None则保留所有文件
|
|
424
|
+
|
|
425
|
+
返回:
|
|
426
|
+
解压后的目录路径
|
|
427
|
+
"""
|
|
428
|
+
# 检查文件是否存在
|
|
429
|
+
archive_path = Path(archive_path)
|
|
430
|
+
if not archive_path.exists():
|
|
431
|
+
raise FileNotFoundError(f"压缩文件不存在: {archive_path}")
|
|
432
|
+
|
|
433
|
+
# 处理扩展名参数
|
|
434
|
+
ext_set = None
|
|
435
|
+
if extensions is not None and len(extensions) > 0:
|
|
436
|
+
# 确保扩展名以点开头并转换为小写
|
|
437
|
+
ext_set = set()
|
|
438
|
+
for ext in extensions:
|
|
439
|
+
if not ext.startswith('.'):
|
|
440
|
+
ext = '.' + ext
|
|
441
|
+
ext_set.add(ext.lower())
|
|
442
|
+
|
|
443
|
+
# 创建输出目录
|
|
444
|
+
output_dir = self._create_output_dir(output_path)
|
|
445
|
+
logger.info(f"解压到目录: {output_dir}")
|
|
446
|
+
|
|
447
|
+
try:
|
|
448
|
+
# 先解压文件(包括递归解压嵌套压缩包)
|
|
449
|
+
self._extract_file(archive_path, output_dir, password, recursive)
|
|
450
|
+
|
|
451
|
+
# 如果需要展平,在所有解压完成后进行
|
|
452
|
+
if flatten:
|
|
453
|
+
self._flatten_directory(output_dir)
|
|
454
|
+
|
|
455
|
+
# 最后,根据扩展名过滤文件(只有在明确指定扩展名时才过滤)
|
|
456
|
+
if ext_set:
|
|
457
|
+
self._filter_files_by_extension(output_dir, ext_set)
|
|
458
|
+
|
|
459
|
+
except Exception as e:
|
|
460
|
+
# 清理输出目录(如果是临时目录)
|
|
461
|
+
if output_path is None and os.path.exists(output_dir):
|
|
462
|
+
shutil.rmtree(output_dir, ignore_errors=True)
|
|
463
|
+
# 从跟踪列表中移除
|
|
464
|
+
if output_dir in self._temp_dirs:
|
|
465
|
+
self._temp_dirs.remove(output_dir)
|
|
466
|
+
raise e
|
|
467
|
+
|
|
468
|
+
return output_dir
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
# 使用示例
|
|
472
|
+
if __name__ == "__main__":
|
|
473
|
+
pass
|
|
474
|
+
# 方式1:使用上下文管理器(推荐,自动清理临时目录)
|
|
475
|
+
# with UniversalExtractor() as extractor:
|
|
476
|
+
# result_path = extractor.extract("archive.zip")
|
|
477
|
+
# # 使用 result_path...
|
|
478
|
+
# # 退出 with 块后自动清理临时目录
|
|
479
|
+
|
|
480
|
+
# 方式2:传统方式
|
|
481
|
+
# extractor = UniversalExtractor()
|
|
482
|
+
|
|
483
|
+
# 带密码解压
|
|
484
|
+
# result_path = extractor.extract("encrypted.zip", password="secret")
|
|
485
|
+
|
|
486
|
+
# 指定输出路径并扁平化
|
|
487
|
+
# result_path = extractor.extract("archive.rar", output_path="/tmp/output", flatten=True)
|
|
488
|
+
|
|
489
|
+
# 只提取特定扩展名的文件
|
|
490
|
+
# result_path = extractor.extract("archive.zip", extensions=[".txt", ".jpg"])
|
|
491
|
+
# result_path = extractor.extract("archive.zip", extensions=["txt", "jpg"]) # 也可以不带点
|
|
492
|
+
|
|
493
|
+
# 基本用法
|
|
494
|
+
# result_path = extractor.extract(file_path, output_path=os.path.join("output", file_name))
|
|
495
|
+
|
|
496
|
+
# 手动清理临时目录(如果不使用上下文管理器)
|
|
497
|
+
# extractor.cleanup()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: orbitkit
|
|
3
|
-
Version: 0.8.50
|
|
3
|
+
Version: 0.8.51
|
|
4
4
|
Summary: This project is only for Orbit Tech internal use.
|
|
5
5
|
Home-page: https://github.com/clown-0726/orbitkit
|
|
6
6
|
Author: Lilu Cao
|
|
@@ -37,19 +37,6 @@ Requires-Dist: prettytable>=3.16.0
|
|
|
37
37
|
Requires-Dist: pytz>=2025.2
|
|
38
38
|
Requires-Dist: Deprecated
|
|
39
39
|
Requires-Dist: func_timeout
|
|
40
|
-
Dynamic: author
|
|
41
|
-
Dynamic: author-email
|
|
42
|
-
Dynamic: classifier
|
|
43
|
-
Dynamic: description
|
|
44
|
-
Dynamic: description-content-type
|
|
45
|
-
Dynamic: home-page
|
|
46
|
-
Dynamic: license
|
|
47
|
-
Dynamic: license-file
|
|
48
|
-
Dynamic: maintainer
|
|
49
|
-
Dynamic: maintainer-email
|
|
50
|
-
Dynamic: platform
|
|
51
|
-
Dynamic: requires-dist
|
|
52
|
-
Dynamic: summary
|
|
53
40
|
|
|
54
41
|
# orbitkit
|
|
55
42
|
|
|
@@ -60,6 +60,7 @@ orbitkit/util/cache_asset_downloader.py
|
|
|
60
60
|
orbitkit/util/common.py
|
|
61
61
|
orbitkit/util/customize_regix_manager.py
|
|
62
62
|
orbitkit/util/secret_manager.py
|
|
63
|
+
orbitkit/util/universal_extractor.py
|
|
63
64
|
orbitkit/util/util_aliyun.py
|
|
64
65
|
orbitkit/util/util_aliyun_oss_simple.py
|
|
65
66
|
orbitkit/util/util_aws.py
|
orbitkit-0.8.50/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.50
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|