xtn-tools-pro 1.0.0.7.3-py3-none-any.whl → 1.0.0.7.4-py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- xtn_tools_pro/utils/log.py +1 -1
- xtn_tools_pro/utils/set_data.py +126 -60
- {xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/METADATA +1 -1
- {xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/RECORD +7 -7
- {xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/LICENSE +0 -0
- {xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/WHEEL +0 -0
- {xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/top_level.txt +0 -0
xtn_tools_pro/utils/log.py
CHANGED
xtn_tools_pro/utils/set_data.py
CHANGED
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-#
+# Description
 # Large-file deduplication
 # History:
 # Date Author Version Modification
@@ -14,10 +14,10 @@ import hashlib
 from tqdm import tqdm
 from xtn_tools_pro.utils.log import Log
 from xtn_tools_pro.utils.helpers import get_orderId_random
-from xtn_tools_pro.utils.file_utils import mkdirs_dir, get_file_extension,is_dir,get_listdir
+from xtn_tools_pro.utils.file_utils import mkdirs_dir, get_file_extension, is_dir, get_listdir
 
 
-class PppSetDataObj:
+class PSetDataObj:
     def __init__(self):
         # Randomly generate a temporary working directory
         self.__order_id = get_orderId_random()
@@ -34,7 +34,7 @@ class PppSetDataObj:
         """
         Deduplicate a single file, air version; the source file is not modified, and any data can be deduplicated
         :param set_file_path: path of the single file
-        :param num_shards
+        :param num_shards: number of temporary shard files; the larger the data, the larger the recommended value, e.g. 1000
         :return:
         """
         if get_file_extension(set_file_path) != ".txt":
@@ -48,7 +48,7 @@ class PppSetDataObj:
 
         num_shards = 3000 if num_shards >= 3000 else num_shards
         num_shards = 3000 if line_count >= 30000000 else num_shards
-        num_shards = 1000 if num_shards <=
+        num_shards = 1000 if num_shards <= 1 else num_shards
 
         shard_file_obj_list = []
         shard_path_list = []
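Read in order, the three guards above clamp the shard count: values of 3000 or more are capped, inputs of 30 million lines or more force the maximum, and (with this fix) values of 1 or less fall back to 1000. A standalone sketch of the same logic, with a hypothetical helper name:

def clamp_num_shards(num_shards, line_count):
    # Mirrors the three guard lines in the hunk above, in the same order.
    if num_shards >= 3000:      # cap the number of temporary shard files
        num_shards = 3000
    if line_count >= 30000000:  # very large inputs always use the maximum
        num_shards = 3000
    if num_shards <= 1:         # degenerate values fall back to a sane default
        num_shards = 1000
    return num_shards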
@@ -58,8 +58,9 @@ class PppSetDataObj:
             shard_file_obj_list.append(open(shard_path, "w", encoding="utf-8"))
 
         with open(set_file_path, "r", encoding="utf-8") as f_r:
-            tqdm_f = tqdm(f_r, total=line_count, desc="正在去重(1/2)",
-
+            tqdm_f = tqdm(f_r, total=line_count, desc="正在去重(1/2)",
+                          bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
+            for line_i in tqdm_f:
                 line = line_i.strip().encode()
                 line_hash = hashlib.md5(line).hexdigest()
                 shard_id = int(line_hash, 16) % num_shards
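The routing rule in this hunk is what makes the two-pass scheme correct: identical lines produce identical MD5 digests, so every copy of a line is guaranteed to land in the same shard file and duplicates never need to be compared across shards. A minimal self-contained sketch of that rule, with a hypothetical function name:

import hashlib

def shard_for(line: bytes, num_shards: int) -> int:
    # Equal inputs hash equally, so all copies of a line map to one shard.
    return int(hashlib.md5(line).hexdigest(), 16) % num_shards

assert shard_for(b"abc", 1000) == shard_for(b"abc", 1000)  # duplicates collide by design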
@@ -69,7 +70,8 @@ class PppSetDataObj:
             shard_file_obj.close()
 
         result_w_path = os.path.join(self.__now_current_working_dir, "000_去重结果.txt")
-        tqdm_f = tqdm(shard_path_list, total=len(shard_path_list), desc="正在去重(2/2)",
+        tqdm_f = tqdm(shard_path_list, total=len(shard_path_list), desc="正在去重(2/2)",
+                      bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
         with open(result_w_path, "w", encoding="utf-8") as f_w:
             for shard_path in tqdm_f:
                 with open(shard_path, "r", encoding="utf-8") as f_r:
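Because pass 1 already grouped all duplicates into the same shard, pass 2 only ever needs one shard in memory at a time; the per-shard step (spelled out in the + lines of the pro-version hunk below) is a plain set() pass. A minimal sketch with a hypothetical function name; note that set() does not preserve the original line order, and the diffed code makes no ordering promise either:

def dedup_shard(shard_path: str) -> list:
    # One shard fits in memory, so a set removes duplicates in a single pass.
    with open(shard_path, "r", encoding="utf-8") as f_r:
        return list({line.strip() for line in f_r})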
@@ -86,11 +88,11 @@ class PppSetDataObj:
             line_count = sum(1 for _ in fp_r)
         self.__logger.info(f"文件处理完毕,去重后总行数为:{line_count},结果路径:{result_w_path}")
 
-    def set_file_data_pro(self, set_file_dir_path, num_shards=
+    def set_file_data_pro(self, set_file_dir_path, num_shards=3000):
         """
         Deduplicate all .txt files under a directory, pro version; the source files are not modified, and any data can be deduplicated
         :param set_file_dir_path: directory path
-        :param num_shards
+        :param num_shards: number of temporary shard files; the larger the data, the larger the recommended value, e.g. 1000
         :return:
         """
         if not is_dir(set_file_dir_path):
@@ -101,55 +103,119 @@ class PppSetDataObj:
         set_file_path_list = []
         for set_file_name in get_listdir(set_file_dir_path):
             if fnmatch.fnmatch(set_file_name, '*.txt'):
-                set_file_path_list.append(os.path.join(set_file_dir_path,set_file_name))
+                set_file_path_list.append(os.path.join(set_file_dir_path, set_file_name))
         self.__logger.info(f"当前文件夹下可去重文件数量为:{len(set_file_path_list)}")
 
-[49 removed lines (old 107-155) are collapsed in this diff view]
+        num_shards = 3000 if num_shards >= 3000 else num_shards
+        num_shards = 1000 if num_shards <= 1000 else num_shards
+
+        shard_file_obj_list = []
+        shard_path_list = []
+        for _ in range(num_shards):
+            shard_path = f"{os.path.join(self.__now_current_working_dir, f'{self.__order_id}_shard_{_}.tmp')}"
+            shard_path_list.append(shard_path)
+            shard_file_obj_list.append(open(shard_path, "w", encoding="utf-8"))
+
+        for _ in range(len(set_file_path_list)):
+            set_file_path = set_file_path_list[_]
+            with open(set_file_path, "r", encoding="utf-8") as fp_r:
+                line_count = sum(1 for _ in fp_r)
+            # self.__logger.info(f"{set_file_path}读取完成,总行数为:{line_count}")
+
+            with open(set_file_path, "r", encoding="utf-8") as f_r:
+                tqdm_f = tqdm(f_r, total=line_count, desc=f"正在去重({_ + 1}/{len(set_file_path_list) + 1})",
+                              bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
+                for line_i in tqdm_f:
+                    line = line_i.strip().encode()
+                    line_hash = hashlib.md5(line).hexdigest()
+                    shard_id = int(line_hash, 16) % num_shards
+                    shard_file_obj_list[shard_id].write(line_i)
+
+        for shard_file_obj in shard_file_obj_list:
+            shard_file_obj.close()
+
+        result_w_path = os.path.join(self.__now_current_working_dir, "000_去重结果.txt")
+        tqdm_f = tqdm(shard_path_list, total=len(shard_path_list),
+                      desc=f"正在去重({len(set_file_path_list) + 1}/{len(set_file_path_list) + 1})",
+                      bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
+        with open(result_w_path, "w", encoding="utf-8") as f_w:
+            for shard_path in tqdm_f:
+                with open(shard_path, "r", encoding="utf-8") as f_r:
+                    seen_list = []
+                    for line_i in f_r.readlines():
+                        line = line_i.strip()
+                        seen_list.append(line)
+                    seen_list = list(set(seen_list))
+                    w_txt = "\n".join(seen_list)
+                    f_w.write(w_txt + "\n")
+                os.remove(shard_path)  # remove the temporary shard file
+
+        with open(result_w_path, "r", encoding="utf-8") as fp_r:
+            line_count = sum(1 for _ in fp_r)
+        self.__logger.info(f"文件处理完毕,去重后总行数为:{line_count},结果路径:{result_w_path}")
+
+    def merging_data(self, file_dir_path, merging_new_file_name="合并"):
+        """
+        Given a directory, merge the data of all .txt files under it
+        :param file_dir_path: directory path
+        :param merging_new_file_name: name of the merged output file
+        :return:
+        """
+        if not is_dir(file_dir_path):
+            self.__logger.critical("文件夹不存在或不合法")
+            return
+
+        self.__logger.info("正在统计文件可合并数量...")
+        file_path_list = []
+        for set_file_name in get_listdir(file_dir_path):
+            if fnmatch.fnmatch(set_file_name, '*.txt'):
+                if set_file_name == f"{merging_new_file_name}.txt": continue
+                file_path_list.append(os.path.join(file_dir_path, set_file_name))
+        self.__logger.info(f"当前文件夹下可合并文件数量为:{len(file_path_list)}")
+
+        result_w_path = os.path.join(file_dir_path, f"{merging_new_file_name}.txt")
+
+        with open(result_w_path, "w", encoding="utf-8") as f_w:
+            for _ in range(len(file_path_list)):
+                file_path = file_path_list[_]
+                with open(file_path, "r", encoding="utf-8") as fp_r:
+                    line_count = sum(1 for _ in fp_r)
+
+                with open(file_path, "r", encoding="utf-8") as f_r:
+                    tqdm_f = tqdm(f_r, total=line_count,
+                                  desc=f"正在合并({_ + 1}/{len(file_path_list)})",
+                                  bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
+                    for line_i in tqdm_f:
+                        line = line_i.strip()
+                        f_w.write(line + "\n")
+
+    def split_data(self, file_path, split_new_file_name="分割", file_index=1, file_max_line=1000000):
+        if get_file_extension(file_path) != ".txt":
+            self.__logger.critical("文件不合法,只接受.txt文件")
+            return
+        self.__logger.info("正在读取文件总行数...")
+
+        with open(file_path, "r", encoding="utf-8") as fp_r:
+            line_count = sum(1 for _ in fp_r)
+        self.__logger.info(f"读取文件完成,总行数为:{line_count}")
+
+        with open(file_path, "r", encoding="utf-8") as f_r:
+            tqdm_f = tqdm(f_r, total=line_count, desc="正在分割(1/1)",
+                          bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
+            temp_line_list = []
+            parent_path = os.path.dirname(file_path)
+            for line_i in tqdm_f:
+                line = line_i.strip()
+                temp_line_list.append(line)
+                if len(temp_line_list) == file_max_line:
+                    result_w_path = os.path.join(parent_path, f"{split_new_file_name}_{file_index}.txt")
+                    self.__list_to_write_file(result_w_path, temp_line_list)
+                    temp_line_list = []
+                    file_index += 1
+            if temp_line_list:
+                result_w_path = os.path.join(parent_path, f"{split_new_file_name}_{file_index}.txt")
+                self.__list_to_write_file(result_w_path, temp_line_list)
+
+    def __list_to_write_file(self, file_w_path, data_list):
+        with open(file_w_path, "w", encoding="utf-8") as result_w_f:
+            result_w_f.write("\n".join(data_list))
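Taken together, this release renames the class to PSetDataObj and rounds it out with directory-level dedup, merge, and split helpers. A usage sketch, assuming the import path implied by the file location; the paths are placeholders, and since the single-file "air" method's name is not fully visible in this diff, only methods shown above are called:

from xtn_tools_pro.utils.set_data import PSetDataObj

obj = PSetDataObj()
obj.set_file_data_pro(r"D:\data\txts", num_shards=3000)    # dedup every .txt in the folder
obj.merging_data(r"D:\data\txts")                          # merge all .txt files into 合并.txt
obj.split_data(r"D:\data\big.txt", file_max_line=1000000)  # split into 1,000,000-line chunks

Note that set_file_data_pro writes its deduplicated output (000_去重结果.txt) into the object's temporary working directory (self.__now_current_working_dir), not into the input folder.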
{xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/RECORD
CHANGED
@@ -17,13 +17,13 @@ xtn_tools_pro/utils/__init__.py,sha256=I1_n_NP23F2lBqlF4EOlnOdLYxM8M4pbn63UhJN1h
 xtn_tools_pro/utils/crypto.py,sha256=oyzFqWum_oimUtzhfVCELQhdMjxDbLu-nOWfcNmazcc,4087
 xtn_tools_pro/utils/file_utils.py,sha256=obaBP7CaBCsXxzqGeWzV2l0yw7vicgKOaXzmpMV8ips,2567
 xtn_tools_pro/utils/helpers.py,sha256=H-a3gnahIah3kJqyKzzKlPWtVQYcFlJncz2rAfBqIiw,4444
-xtn_tools_pro/utils/log.py,sha256=
+xtn_tools_pro/utils/log.py,sha256=mf5huJDA8xVxxFWPG_tl_vOsAA2_ywGDFycYSGHIDCo,10202
 xtn_tools_pro/utils/retry.py,sha256=0wjHsR5DBBKpv4naMfxiky8kprrZes4WURIfFQ4H708,1657
-xtn_tools_pro/utils/set_data.py,sha256=
+xtn_tools_pro/utils/set_data.py,sha256=IthfAclck7AbaxOIKOgJZ2wdcfEmlvC-C63Tywcr4bA,11180
 xtn_tools_pro/utils/sql.py,sha256=EAKzbkZP7Q09j15Gm6o0_uq0qgQmcCQT6EAawbpp4v0,6263
 xtn_tools_pro/utils/time_utils.py,sha256=TUtzG61PeVYXhaQd6pBrXAdlz7tBispNIRQRcGhE2No,4859
-xtn_tools_pro-1.0.0.7.
-xtn_tools_pro-1.0.0.7.
-xtn_tools_pro-1.0.0.7.
-xtn_tools_pro-1.0.0.7.
-xtn_tools_pro-1.0.0.7.
+xtn_tools_pro-1.0.0.7.4.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+xtn_tools_pro-1.0.0.7.4.dist-info/METADATA,sha256=gwssgHhEwWGdiJBfIaTSKm6fDbZYU1AqPYJjTFm8EtE,498
+xtn_tools_pro-1.0.0.7.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+xtn_tools_pro-1.0.0.7.4.dist-info/top_level.txt,sha256=jyB3FLDEr8zE1U7wHczTgIbvUpALhR-ULF7RVEO7O2U,14
+xtn_tools_pro-1.0.0.7.4.dist-info/RECORD,,
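For reference, the sha256= values in RECORD are not hex digests: per the wheel spec (PEP 376/427), each is the urlsafe base64 encoding of the raw SHA-256 digest with trailing "=" padding stripped, followed by the file size in bytes. A sketch that recomputes an entry from a local file:

import base64
import hashlib
import os

def record_entry(path):
    # RECORD line format: <path>,sha256=<urlsafe-b64 digest, unpadded>,<size>
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={b64},{os.path.getsize(path)}"

The LICENSE entry above (size 0, hash 47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU) is the digest of an empty file.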
{xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/LICENSE
File without changes
{xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/WHEEL
File without changes
{xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/top_level.txt
File without changes