xtn-tools-pro 1.0.0.7.3-py3-none-any.whl → 1.0.0.7.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -78,7 +78,7 @@ class Log:
  is_write_to_file=False,
  color=True,
  mode='a',
- max_bytes=1024000000,
+ max_bytes=1024000000,  # 1024 * 1000 * 1000 bytes, roughly 1 GB
  backup_count=0,
  encoding="utf-8",
  save_time_log_path=None):
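The max_bytes, backup_count, mode, and encoding parameters mirror the rotation arguments of Python's logging.handlers.RotatingFileHandler. A minimal stand-alone sketch of the same settings, assuming Log ultimately delegates to such a handler (the diff itself does not show this):

    import logging
    from logging.handlers import RotatingFileHandler

    # maxBytes/backupCount correspond to the max_bytes/backup_count
    # arguments above; 1024000000 bytes is slightly under 1 GiB.
    handler = RotatingFileHandler("demo.log", mode="a", maxBytes=1024000000,
                                  backupCount=0, encoding="utf-8")
    logger = logging.getLogger("demo")
    logger.addHandler(handler)
    logger.warning("log rotation configured")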
@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
 
- # Description:
+ # Description
  # Large-file deduplication
  # History:
  # Date    Author    Version    Modification
@@ -14,10 +14,10 @@ import hashlib
  from tqdm import tqdm
  from xtn_tools_pro.utils.log import Log
  from xtn_tools_pro.utils.helpers import get_orderId_random
- from xtn_tools_pro.utils.file_utils import mkdirs_dir, get_file_extension,is_dir,get_listdir
+ from xtn_tools_pro.utils.file_utils import mkdirs_dir, get_file_extension, is_dir, get_listdir
 
 
- class PppSetDataObj:
+ class PSetDataObj:
  def __init__(self):
  # Randomly generate a temporary working folder
  self.__order_id = get_orderId_random()
@@ -34,7 +34,7 @@ class PppSetDataObj:
  """
  Deduplicate a single file (air version); the source file is not modified, and any kind of data is deduplicated
  :param set_file_path: path of the single file
- :param num_shards: number of temporary shard files; recommended: the larger the data, the larger the value (10, 100, 1000, 10000)
+ :param num_shards: number of temporary shard files; recommended: the larger the data, the larger the value (e.g. 1000)
  :return:
  """
  if get_file_extension(set_file_path) != ".txt":
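The num_shards advice is about keeping each shard small enough to deduplicate in memory. A rough back-of-envelope with illustrative numbers (not taken from the package):

    # Each input line goes to exactly one shard, so the expected shard size
    # is total_lines / num_shards. Pass 2 loads one shard at a time.
    total_lines = 30_000_000           # e.g. a 30M-line input file
    num_shards = 3000                  # the cap enforced by the code below
    avg_lines_per_shard = total_lines // num_shards
    print(avg_lines_per_shard)         # 10000 lines per shard, fits in memory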
@@ -48,7 +48,7 @@ class PppSetDataObj:
 
  num_shards = 3000 if num_shards >= 3000 else num_shards
  num_shards = 3000 if line_count >= 30000000 else num_shards
- num_shards = 1000 if num_shards <= 0 else num_shards
+ num_shards = 1000 if num_shards <= 1 else num_shards
 
  shard_file_obj_list = []
  shard_path_list = []
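Read together, the three guards amount to the following clamp (an equivalent restatement for readability, not code from the package):

    def clamp_num_shards(num_shards: int, line_count: int) -> int:
        # Very large inputs (>= 30M lines) always use the maximum shard count.
        if line_count >= 30_000_000:
            return 3000
        # Degenerate values fall back to 1000 shards.
        if num_shards <= 1:
            return 1000
        # Otherwise cap the caller's value at 3000.
        return min(num_shards, 3000)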
@@ -58,8 +58,9 @@ class PppSetDataObj:
  shard_file_obj_list.append(open(shard_path, "w", encoding="utf-8"))
 
  with open(set_file_path, "r", encoding="utf-8") as f_r:
- tqdm_f = tqdm(f_r, total=line_count, desc="Deduplicating (1/2)", unit="lines")
- for idx, line_i in enumerate(tqdm_f):
+ tqdm_f = tqdm(f_r, total=line_count, desc="Deduplicating (1/2)",
+ bar_format="{l_bar}{bar}|{n}/{total} [ETA: {remaining}]")
+ for line_i in tqdm_f:
  line = line_i.strip().encode()
  line_hash = hashlib.md5(line).hexdigest()
  shard_id = int(line_hash, 16) % num_shards
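The key property of the pass shown above is that identical lines always hash to the same shard, so all duplicates end up in the same temporary file. A minimal illustration:

    import hashlib

    def shard_of(line: str, num_shards: int) -> int:
        # MD5 is used only as a uniform hash here, not for security.
        digest = hashlib.md5(line.strip().encode()).hexdigest()
        return int(digest, 16) % num_shards

    # Duplicate lines (whitespace aside) map to the same shard.
    assert shard_of("user@example.com", 1000) == shard_of("user@example.com\n", 1000)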
@@ -69,7 +70,8 @@ class PppSetDataObj:
  shard_file_obj.close()
 
  result_w_path = os.path.join(self.__now_current_working_dir, "000_dedup_result.txt")
- tqdm_f = tqdm(shard_path_list, total=len(shard_path_list), desc="Deduplicating (2/2)", unit="lines")
+ tqdm_f = tqdm(shard_path_list, total=len(shard_path_list), desc="Deduplicating (2/2)",
+ bar_format="{l_bar}{bar}|{n}/{total} [ETA: {remaining}]")
  with open(result_w_path, "w", encoding="utf-8") as f_w:
  for shard_path in tqdm_f:
  with open(shard_path, "r", encoding="utf-8") as f_r:
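Putting both passes together: a compact, self-contained sketch of the same external-memory deduplication, simplified from the diff (progress bars, logging, and error handling omitted):

    import hashlib
    import os

    def dedup_file(src_path, dst_path, num_shards=100, tmp_dir="."):
        # Pass 1: route every line to a shard by its MD5 value; identical
        # lines always land in the same shard file. num_shards handles are
        # open at once, so very large values can hit OS descriptor limits.
        shard_paths = [os.path.join(tmp_dir, f"shard_{i}.tmp") for i in range(num_shards)]
        shards = [open(p, "w", encoding="utf-8") for p in shard_paths]
        with open(src_path, "r", encoding="utf-8") as f_r:
            for line in f_r:
                h = int(hashlib.md5(line.strip().encode()).hexdigest(), 16)
                shards[h % num_shards].write(line)
        for f in shards:
            f.close()
        # Pass 2: each shard is small enough to deduplicate in memory.
        with open(dst_path, "w", encoding="utf-8") as f_w:
            for p in shard_paths:
                with open(p, "r", encoding="utf-8") as f_r:
                    unique = set(l.strip() for l in f_r)
                if unique:
                    f_w.write("\n".join(unique) + "\n")
                os.remove(p)  # delete the temporary shard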
@@ -86,11 +88,11 @@ class PppSetDataObj:
  line_count = sum(1 for _ in fp_r)
  self.__logger.info(f"File processed; total line count after deduplication: {line_count}; result path: {result_w_path}")
 
- def set_file_data_pro(self, set_file_dir_path, num_shards=1000):
+ def set_file_data_pro(self, set_file_dir_path, num_shards=3000):
  """
  Deduplicate all txt files under a folder (pro version); the source files are not modified, and any kind of data is deduplicated
  :param set_file_dir_path: folder path
- :param num_shards: number of temporary shard files; recommended: the larger the data, the larger the value (10, 100, 1000, 10000)
+ :param num_shards: number of temporary shard files; recommended: the larger the data, the larger the value (e.g. 1000)
  :return:
  """
  if not is_dir(set_file_dir_path):
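A plausible call for the pro version, assuming PSetDataObj is imported from xtn_tools_pro.utils.set_data (the module path listed in RECORD below) and using a hypothetical folder path:

    from xtn_tools_pro.utils.set_data import PSetDataObj

    obj = PSetDataObj()
    # Deduplicate across every .txt file in the folder; the result is written
    # to the object's temporary working directory (path shown in its logs).
    obj.set_file_data_pro("/path/to/txt_folder", num_shards=3000)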
@@ -101,55 +103,119 @@ class PppSetDataObj:
  set_file_path_list = []
  for set_file_name in get_listdir(set_file_dir_path):
  if fnmatch.fnmatch(set_file_name, '*.txt'):
- set_file_path_list.append(os.path.join(set_file_dir_path,set_file_name))
+ set_file_path_list.append(os.path.join(set_file_dir_path, set_file_name))
  self.__logger.info(f"Number of files eligible for deduplication in this folder: {len(set_file_path_list)}")
 
- for set_file_path in set_file_path_list:
- pass
- # with open(set_file_path, "r", encoding="utf-8") as fp_r:
- # line_count = sum(1 for _ in fp_r)
- # self.__logger.info(f"File read complete; total line count: {line_count}")
-
-
-
-
-
- # num_shards = 3000 if num_shards >= 3000 else num_shards
- # num_shards = 3000 if line_count >= 30000000 else num_shards
- # num_shards = 1000 if num_shards <= 0 else num_shards
- #
- # shard_file_obj_list = []
- # shard_path_list = []
- # for _ in range(num_shards):
- # shard_path = f"{os.path.join(self.__now_current_working_dir, f'{self.__order_id}_shard_{_}.tmp')}"
- # shard_path_list.append(shard_path)
- # shard_file_obj_list.append(open(shard_path, "w", encoding="utf-8"))
- #
- # with open(set_file_path, "r", encoding="utf-8") as f_r:
- # tqdm_f = tqdm(f_r, total=line_count, desc="Deduplicating (1/2)", unit="lines")
- # for idx, line_i in enumerate(tqdm_f):
- # line = line_i.strip().encode()
- # line_hash = hashlib.md5(line).hexdigest()
- # shard_id = int(line_hash, 16) % num_shards
- # shard_file_obj_list[shard_id].write(line_i)
- #
- # for shard_file_obj in shard_file_obj_list:
- # shard_file_obj.close()
- #
- # result_w_path = os.path.join(self.__now_current_working_dir, "000_dedup_result.txt")
- # tqdm_f = tqdm(shard_path_list, total=len(shard_path_list), desc="Deduplicating (2/2)", unit="lines")
- # with open(result_w_path, "w", encoding="utf-8") as f_w:
- # for shard_path in tqdm_f:
- # with open(shard_path, "r", encoding="utf-8") as f_r:
- # seen_list = []
- # for line_i in f_r.readlines():
- # line = line_i.strip()
- # seen_list.append(line)
- # seen_list = list(set(seen_list))
- # w_txt = "\n".join(seen_list)
- # f_w.write(w_txt + "\n")
- # os.remove(shard_path)  # delete the temporary file
- #
- # with open(result_w_path, "r", encoding="utf-8") as fp_r:
- # line_count = sum(1 for _ in fp_r)
- # self.__logger.info(f"File processed; total line count after deduplication: {line_count}; result path: {result_w_path}")
+ num_shards = 3000 if num_shards >= 3000 else num_shards
+ num_shards = 1000 if num_shards <= 1000 else num_shards
+
+ shard_file_obj_list = []
+ shard_path_list = []
+ for _ in range(num_shards):
+ shard_path = f"{os.path.join(self.__now_current_working_dir, f'{self.__order_id}_shard_{_}.tmp')}"
+ shard_path_list.append(shard_path)
+ shard_file_obj_list.append(open(shard_path, "w", encoding="utf-8"))
+
+ for _ in range(len(set_file_path_list)):
+ set_file_path = set_file_path_list[_]
+ with open(set_file_path, "r", encoding="utf-8") as fp_r:
+ line_count = sum(1 for _ in fp_r)
+ # self.__logger.info(f"{set_file_path} read complete; total line count: {line_count}")
+
+ with open(set_file_path, "r", encoding="utf-8") as f_r:
+ tqdm_f = tqdm(f_r, total=line_count, desc=f"Deduplicating ({_ + 1}/{len(set_file_path_list) + 1})",
+ bar_format="{l_bar}{bar}|{n}/{total} [ETA: {remaining}]")
+ for line_i in tqdm_f:
+ line = line_i.strip().encode()
+ line_hash = hashlib.md5(line).hexdigest()
+ shard_id = int(line_hash, 16) % num_shards
+ shard_file_obj_list[shard_id].write(line_i)
+
+ for shard_file_obj in shard_file_obj_list:
+ shard_file_obj.close()
+
+ result_w_path = os.path.join(self.__now_current_working_dir, "000_dedup_result.txt")
+ tqdm_f = tqdm(shard_path_list, total=len(shard_path_list),
+ desc=f"Deduplicating ({len(set_file_path_list) + 1}/{len(set_file_path_list) + 1})",
+ bar_format="{l_bar}{bar}|{n}/{total} [ETA: {remaining}]")
+ with open(result_w_path, "w", encoding="utf-8") as f_w:
+ for shard_path in tqdm_f:
+ with open(shard_path, "r", encoding="utf-8") as f_r:
+ seen_list = []
+ for line_i in f_r.readlines():
+ line = line_i.strip()
+ seen_list.append(line)
+ seen_list = list(set(seen_list))
+ w_txt = "\n".join(seen_list)
+ f_w.write(w_txt + "\n")
+ os.remove(shard_path)  # delete the temporary file
+
+ with open(result_w_path, "r", encoding="utf-8") as fp_r:
+ line_count = sum(1 for _ in fp_r)
+ self.__logger.info(f"File processed; total line count after deduplication: {line_count}; result path: {result_w_path}")
+
+ def merging_data(self, file_dir_path, merging_new_file_name="merged"):
+ """
+ Given a folder, merge the data of all .txt files under it
+ :param file_dir_path: folder path
+ :param merging_new_file_name: name (without extension) of the new output file
+ :return:
+ """
+ if not is_dir(file_dir_path):
+ self.__logger.critical("Folder does not exist or is invalid")
+ return
+
+ self.__logger.info("Counting files eligible for merging...")
+ file_path_list = []
+ for set_file_name in get_listdir(file_dir_path):
+ if fnmatch.fnmatch(set_file_name, '*.txt'):
+ if set_file_name == f"{merging_new_file_name}.txt": continue
+ file_path_list.append(os.path.join(file_dir_path, set_file_name))
+ self.__logger.info(f"Number of files eligible for merging in this folder: {len(file_path_list)}")
+
+ result_w_path = os.path.join(file_dir_path, f"{merging_new_file_name}.txt")
+
+ with open(result_w_path, "w", encoding="utf-8") as f_w:
+ for _ in range(len(file_path_list)):
+ file_path = file_path_list[_]
+ with open(file_path, "r", encoding="utf-8") as fp_r:
+ line_count = sum(1 for _ in fp_r)
+
+ with open(file_path, "r", encoding="utf-8") as f_r:
+ tqdm_f = tqdm(f_r, total=line_count,
+ desc=f"Merging ({_ + 1}/{len(file_path_list)})",
+ bar_format="{l_bar}{bar}|{n}/{total} [ETA: {remaining}]")
+ for line_i in tqdm_f:
+ line = line_i.strip()
+ f_w.write(line + "\n")
+
+ def split_data(self, file_path, split_new_file_name="split", file_index=1, file_max_line=1000000):
+ if get_file_extension(file_path) != ".txt":
+ self.__logger.critical("Invalid file; only .txt files are accepted")
+ return
+ self.__logger.info("Reading the file's total line count...")
+
+ with open(file_path, "r", encoding="utf-8") as fp_r:
+ line_count = sum(1 for _ in fp_r)
+ self.__logger.info(f"File read complete; total line count: {line_count}")
+
+ with open(file_path, "r", encoding="utf-8") as f_r:
+ tqdm_f = tqdm(f_r, total=line_count, desc="Splitting (1/1)",
+ bar_format="{l_bar}{bar}|{n}/{total} [ETA: {remaining}]")
+ temp_line_list = []
+ parent_path = os.path.dirname(file_path)
+ for line_i in tqdm_f:
+ line = line_i.strip()
+ temp_line_list.append(line)
+ if len(temp_line_list) == file_max_line:
+ result_w_path = os.path.join(parent_path, f"{split_new_file_name}_{file_index}.txt")
+ self.__list_to_write_file(result_w_path, temp_line_list)
+ temp_line_list = []
+ file_index += 1
+ if temp_line_list:
+ result_w_path = os.path.join(parent_path, f"{split_new_file_name}_{file_index}.txt")
+ self.__list_to_write_file(result_w_path, temp_line_list)
+
+ def __list_to_write_file(self, file_w_path, data_list):
+ with open(file_w_path, "w", encoding="utf-8") as result_w_f:
+ result_w_f.write("\n".join(data_list))
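The two new helpers are symmetric: merging_data concatenates every .txt in a folder (skipping its own output file), and split_data cuts one large .txt into fixed-size parts. A usage sketch with hypothetical paths:

    obj = PSetDataObj()

    # Merge: writes <folder>/merged.txt, skipping a pre-existing merged.txt.
    obj.merging_data("/path/to/txt_folder", merging_new_file_name="merged")

    # Split: writes split_1.txt, split_2.txt, ... next to the source file,
    # each holding at most file_max_line lines.
    obj.split_data("/path/to/big.txt", split_new_file_name="split",
                   file_max_line=1_000_000)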
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: xtn-tools-pro
- Version: 1.0.0.7.3
+ Version: 1.0.0.7.4
  Summary: xtn development tools
  Author: xtn
  Author-email: czw011122@gmail.com
@@ -17,13 +17,13 @@ xtn_tools_pro/utils/__init__.py,sha256=I1_n_NP23F2lBqlF4EOlnOdLYxM8M4pbn63UhJN1h
  xtn_tools_pro/utils/crypto.py,sha256=oyzFqWum_oimUtzhfVCELQhdMjxDbLu-nOWfcNmazcc,4087
  xtn_tools_pro/utils/file_utils.py,sha256=obaBP7CaBCsXxzqGeWzV2l0yw7vicgKOaXzmpMV8ips,2567
  xtn_tools_pro/utils/helpers.py,sha256=H-a3gnahIah3kJqyKzzKlPWtVQYcFlJncz2rAfBqIiw,4444
- xtn_tools_pro/utils/log.py,sha256=pAye_sXH-y-8v2vNf-OwOTk2Exkjl6y7V_y_Hpk_d0s,10176
+ xtn_tools_pro/utils/log.py,sha256=mf5huJDA8xVxxFWPG_tl_vOsAA2_ywGDFycYSGHIDCo,10202
  xtn_tools_pro/utils/retry.py,sha256=0wjHsR5DBBKpv4naMfxiky8kprrZes4WURIfFQ4H708,1657
- xtn_tools_pro/utils/set_data.py,sha256=vNhE_jCG-3p6KFnY_jbQ0vQ7EV1gB9D4Jb0S5ZoD4IM,7529
+ xtn_tools_pro/utils/set_data.py,sha256=IthfAclck7AbaxOIKOgJZ2wdcfEmlvC-C63Tywcr4bA,11180
  xtn_tools_pro/utils/sql.py,sha256=EAKzbkZP7Q09j15Gm6o0_uq0qgQmcCQT6EAawbpp4v0,6263
  xtn_tools_pro/utils/time_utils.py,sha256=TUtzG61PeVYXhaQd6pBrXAdlz7tBispNIRQRcGhE2No,4859
- xtn_tools_pro-1.0.0.7.3.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- xtn_tools_pro-1.0.0.7.3.dist-info/METADATA,sha256=e7BnO3AKRCicejlwWdIVafht9964e0EJmA5ORQeRPa0,498
- xtn_tools_pro-1.0.0.7.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- xtn_tools_pro-1.0.0.7.3.dist-info/top_level.txt,sha256=jyB3FLDEr8zE1U7wHczTgIbvUpALhR-ULF7RVEO7O2U,14
- xtn_tools_pro-1.0.0.7.3.dist-info/RECORD,,
+ xtn_tools_pro-1.0.0.7.4.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ xtn_tools_pro-1.0.0.7.4.dist-info/METADATA,sha256=gwssgHhEwWGdiJBfIaTSKm6fDbZYU1AqPYJjTFm8EtE,498
+ xtn_tools_pro-1.0.0.7.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ xtn_tools_pro-1.0.0.7.4.dist-info/top_level.txt,sha256=jyB3FLDEr8zE1U7wHczTgIbvUpALhR-ULF7RVEO7O2U,14
+ xtn_tools_pro-1.0.0.7.4.dist-info/RECORD,,