xtn-tools-pro 1.0.1.0.4-py3-none-any.whl → 1.0.1.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xtn_tools_pro/utils/set_data.py

@@ -30,15 +30,28 @@ class PSetDataObj:
                                  is_write_to_file=False,
                                  color=True, mode='a', save_time_log_path='./logs')
 
+    def get_file_line_cnt(self, txt_file_path):
+        """
+        Take a txt file and read its total line count
+        :param txt_file_path:
+        :return:
+        """
+        if get_file_extension(txt_file_path) != ".txt":
+            return 0
+
+        with open(txt_file_path, "r", encoding="utf-8") as fp_r:
+            a_line_count = sum(1 for _ in fp_r)
+        return a_line_count
+
     def set_file_data_air(self, set_file_path, num_shards=1000):
         """
-        对单个文件去重,air版本,不对文件做任何修改,去重任何数据
+        对单个文件去重,air版本,不对文件做任何修改,去重任何数据
         :param set_file_path: path of the single input file
-        :param num_shards:临时文件切片,推荐:数据越大值越大 1000
+        :param num_shards:临时文件切片,推荐:数据越大值越大 1000
         :return:
         """
         if get_file_extension(set_file_path) != ".txt":
-            self.__logger.critical("文件不合法,只接受.txt文件")
+            self.__logger.critical("文件不合法,只接受.txt文件")
             return
         self.__logger.info("Reading the file's total line count...")
 
The -/+ pairs in this hunk change only punctuation: 1.0.1.0.5 replaces full-width Chinese commas and colons (,:) with their ASCII counterparts (,:). Translated, the touched lines read: "Dedupe a single file, air version; the file itself is never modified; duplicates of any data are removed", ":param num_shards: temporary-file shard count; recommendation: the larger the data, the larger the value, default 1000", and, for the critical log message, "Invalid file: only .txt files are accepted".
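The new get_file_line_cnt counts lines by streaming: sum(1 for _ in fp_r) advances the file object line by line and keeps only a running total, so memory use stays flat even for multi-gigabyte files. A standalone rendering of the idiom (the file name is illustrative):

def count_lines(path: str) -> int:
    # The streaming line-count idiom used by get_file_line_cnt:
    # iterates lazily, never builds a list of lines.
    with open(path, "r", encoding="utf-8") as fp:
        return sum(1 for _ in fp)

print(count_lines("data.txt"))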
@@ -90,9 +103,9 @@ class PSetDataObj:
 
     def set_file_data_pro(self, set_file_dir_path, num_shards=3000):
         """
-        对文件夹下的所有txt文件去重,pro版本,不对文件做任何修改,去重任何数据
+        对文件夹下的所有txt文件去重,pro版本,不对文件做任何修改,去重任何数据
         :param set_file_dir_path: path of the folder
-        :param num_shards:临时文件切片,推荐:数据越大值越大 1000
+        :param num_shards:临时文件切片,推荐:数据越大值越大 1000
         :return:
         """
         if not is_dir(set_file_dir_path):
The same punctuation-only normalization; translated, the docstring reads: "Dedupe all txt files under a folder, pro version; the files themselves are never modified; duplicates of any data are removed."
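Both dedupe variants stage their input into num_shards temporary files and then presumably work one shard at a time (the mechanics are shown in __set_file_data_max_writelinesBuffers further down), so num_shards is effectively a memory knob: with roughly N input lines, a shard holds about N / num_shards lines when it is read back into a set. At the pro default of 3000, for example, 300 million input lines come to about 100,000 lines per shard; that is the arithmetic behind the docstrings' advice that larger data should use a larger num_shards.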
@@ -154,6 +167,111 @@ class PSetDataObj:
         line_count = sum(1 for _ in fp_r)
         self.__logger.info(f"Processing finished; lines after dedupe: {line_count}; result path: {result_w_path}")
 
+    def set_file_data_max(self, a_set_file_path, b_set_file_path):
+        """
+        Dedupe across two files a and b: remove from file a every element that appears in file b
+        :param a_set_file_path: path of file a
+        :param b_set_file_path: path of file b
+        :return:
+        """
+        if get_file_extension(a_set_file_path) != ".txt" or get_file_extension(b_set_file_path) != ".txt":
+            self.__logger.critical("Invalid file: only .txt files are accepted")
+            return
+        self.__logger.info("Reading total line counts of files a and b...")
+        a_line_count = self.get_file_line_cnt(a_set_file_path)
+        self.__logger.info(f"File a read; total lines: {a_line_count}; path: {a_set_file_path}")
+        b_line_count = self.get_file_line_cnt(b_set_file_path)
+        self.__logger.info(f"File b read; total lines: {b_line_count}; path: {b_set_file_path}")
+
+        num_shards = 50000
+        buffer_size = 2500  # buffered lines per shard: larger means less I/O but more memory
+        # self.__now_current_working_dir = r"D:\000\xtnkk-tools\demos\temp_25032714561875181921"
+        # self.__order_id = "25032714561875181921"
+
+        # Create the empty shard files
+        ab_shard_path_dict = {}
+        for _ in range(num_shards):
+            a_shard_path = f"{os.path.join(self.__now_current_working_dir, f'{self.__order_id}_shard_{_}_a.tmp')}"
+            b_shard_path = f"{os.path.join(self.__now_current_working_dir, f'{self.__order_id}_shard_{_}_b.tmp')}"
+            ab_shard_path_dict[_] = {
+                "a": a_shard_path,
+                "b": b_shard_path,
+            }
+            with open(a_shard_path, "w", encoding="utf-8"):
+                pass
+            with open(b_shard_path, "w", encoding="utf-8"):
+                pass
+
+        self.__set_file_data_max_writelinesBuffers(a_set_file_path, a_line_count, num_shards, buffer_size, "a")
+        self.__set_file_data_max_writelinesBuffers(b_set_file_path, b_line_count, num_shards, buffer_size, "b")
+
+        result_w_path = os.path.join(self.__now_current_working_dir, "000_去重结果.txt")
+        result_tqdm_f = tqdm(ab_shard_path_dict, total=len(ab_shard_path_dict),
+                             desc=f"Merging files", bar_format="{l_bar}{bar}|{n}/{total} [ETA: {remaining}]")
+
+        with open(result_w_path, "w", encoding="utf-8") as result_f_w:
+            for _ in result_tqdm_f:
+                a_shard_path = ab_shard_path_dict[_]["a"]
+                b_shard_path = ab_shard_path_dict[_]["b"]
+                a_seen_list = []
+                b_seen_list = []
+                with open(a_shard_path, "r", encoding="utf-8") as a_f_r:
+                    for line_i in a_f_r.readlines():
+                        line = line_i.strip()
+                        a_seen_list.append(line)
+                with open(b_shard_path, "r", encoding="utf-8") as b_f_r:
+                    for line_i in b_f_r.readlines():
+                        line = line_i.strip()
+                        b_seen_list.append(line)
+
+                seen_list = list(set(a_seen_list) - set(b_seen_list))
+                w_txt = "\n".join(seen_list)
+                result_f_w.write(w_txt + "\n")
+                os.remove(a_shard_path)  # remove the temporary shard
+                os.remove(b_shard_path)  # remove the temporary shard
+
+        with open(result_w_path, "r", encoding="utf-8") as fp_r:
+            line_count = sum(1 for _ in fp_r)
+        self.__logger.info(f"Processing finished; lines after dedupe: {line_count}; result path: {result_w_path}")
+
+    def __set_file_data_max_writelinesBuffers(self, set_file_path, file_line_count, num_shards, buffer_size,
+                                              tmp_file_suffix):
+        """
+        Dedicated helper for set_file_data_max: stages lines into the temporary shard files
+        :param set_file_path:
+        :param file_line_count:
+        :param num_shards:
+        :param buffer_size:
+        :param tmp_file_suffix:
+        :return:
+        """
+        # One in-memory buffer per shard
+        buffers_dict = {i: [] for i in range(num_shards)}
+        with open(set_file_path, "r", encoding="utf-8") as a_f_r:
+            for line_i in tqdm(a_f_r,
+                               total=file_line_count,
+                               desc=f"Deduping file {tmp_file_suffix}",
+                               bar_format="{l_bar}{bar}|{n}/{total} [ETA: {remaining}]"):
+                line = line_i.strip().encode()
+                line_hash = hashlib.md5(line).hexdigest()
+                shard_id = int(line_hash, 16) % num_shards
+                buffers_dict[shard_id].append(line_i)  # stage the line in its shard buffer
+
+                # Flush the buffer once it is full
+                if len(buffers_dict[shard_id]) >= buffer_size:
+                    shard_path = f"{os.path.join(self.__now_current_working_dir, f'{self.__order_id}_shard_{shard_id}_{tmp_file_suffix}.tmp')}"
+                    with open(shard_path, "a", encoding="utf-8") as a_shard_fw:
+                        a_shard_fw.writelines(buffers_dict[shard_id])
+                    buffers_dict[shard_id].clear()
+
+        # Finally flush every buffer that still holds lines
+        for shard_id in buffers_dict:
+            if buffers_dict[shard_id]:
+                a_shard_path = f"{os.path.join(self.__now_current_working_dir, f'{self.__order_id}_shard_{shard_id}_{tmp_file_suffix}.tmp')}"
+                with open(a_shard_path, "a", encoding="utf-8") as a_shard_fw:
+                    a_shard_fw.writelines(buffers_dict[shard_id])
+                buffers_dict[shard_id].clear()
+
     def merging_data(self, file_dir_path, merging_new_file_name="合并"):
         """
         Take a folder and merge the data of every .txt file under it
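The new a-b logic rests on one invariant: a line's shard index is int(md5(line), 16) % num_shards, so an element occurring in both files always lands in the a-shard and b-shard with the same index, and the global difference is exactly the union of the per-shard differences set(a_i) - set(b_i). (The result file name 000_去重结果.txt means "dedupe result". Note too that when a shard's difference is empty, result_f_w.write(w_txt + "\n") still emits a bare newline, so the output may contain blank lines.) A minimal self-contained sketch of the invariant; the names here are illustrative, not the package's:

import hashlib

def shard_of(line: str, num_shards: int) -> int:
    # Same routing rule as the diff: MD5 of the stripped line, modulo the shard count.
    return int(hashlib.md5(line.strip().encode()).hexdigest(), 16) % num_shards

def a_minus_b(a_lines, b_lines, num_shards=4):
    # Equal lines always hash to the same shard, so a - b can be computed
    # shard by shard without ever holding either file fully in memory.
    a_shards = [set() for _ in range(num_shards)]
    b_shards = [set() for _ in range(num_shards)]
    for line in a_lines:
        a_shards[shard_of(line, num_shards)].add(line.strip())
    for line in b_lines:
        b_shards[shard_of(line, num_shards)].add(line.strip())
    out = []
    for a_set, b_set in zip(a_shards, b_shards):
        out.extend(a_set - b_set)
    return out

print(sorted(a_minus_b(["1\n", "2\n", "3\n"], ["2\n"])))  # ['1', '3']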
@@ -190,8 +308,16 @@ class PSetDataObj:
                 f_w.write(line + "\n")
 
     def split_data(self, file_path, split_new_file_name="分割", file_index=1, file_max_line=1000000):
+        """
+        Take a txt file and split it into chunks of at most file_max_line lines
+        :param file_path:
+        :param split_new_file_name:
+        :param file_index:
+        :param file_max_line:
+        :return:
+        """
         if get_file_extension(file_path) != ".txt":
-            self.__logger.critical("文件不合法,只接受.txt文件")
+            self.__logger.critical("文件不合法,只接受.txt文件")
             return
         self.__logger.info("Reading the file's total line count...")
 
The logger.critical pair is the same punctuation-only normalization as in the earlier hunks ("Invalid file: only .txt files are accepted"); the docstring is new in 1.0.1.0.5.
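The docstring pins down split_data's contract: consume one .txt file and emit chunks of at most file_max_line lines, numbering output files from file_index. The body itself is untouched by this release, so the following standalone sketch is an assumption about the behavior rather than the package's code; the helper name and the {name}_{index}.txt output pattern are invented for illustration:

import os

def split_txt(file_path, out_dir, new_file_name="分割", file_index=1, file_max_line=1_000_000):
    buf = []
    with open(file_path, "r", encoding="utf-8") as f_r:
        for line in f_r:
            buf.append(line.rstrip("\n"))
            if len(buf) >= file_max_line:  # chunk full: flush and move on
                _write_chunk(out_dir, new_file_name, file_index, buf)
                file_index += 1
                buf = []
    if buf:  # remainder shorter than file_max_line
        _write_chunk(out_dir, new_file_name, file_index, buf)

def _write_chunk(out_dir, name, index, lines):
    path = os.path.join(out_dir, f"{name}_{index}.txt")
    with open(path, "w", encoding="utf-8") as f_w:
        f_w.write("\n".join(lines))  # same bulk-write pattern as __list_to_write_file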
@@ -217,5 +343,11 @@ class PSetDataObj:
             self.__list_to_write_file(result_w_path, temp_line_list)
 
     def __list_to_write_file(self, file_w_path, data_list):
+        """
+        Bulk-write a list of data to a file, overwriting existing content
+        :param file_w_path:
+        :param data_list:
+        :return:
+        """
         with open(file_w_path, "w", encoding="utf-8") as result_w_f:
             result_w_f.write("\n".join(data_list))

Note that "\n".join(data_list) leaves no trailing newline after the last element, unlike the line + "\n" writes used elsewhere in the class.
xtn_tools_pro-1.0.1.0.5.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xtn-tools-pro
-Version: 1.0.1.0.4
+Version: 1.0.1.0.5
 Summary: xtn 开发工具
 Author: xtn
 Author-email: czw011122@gmail.com
xtn_tools_pro-1.0.1.0.5.dist-info/RECORD

@@ -58,12 +58,12 @@ xtn_tools_pro/utils/file_utils.py,sha256=obaBP7CaBCsXxzqGeWzV2l0yw7vicgKOaXzmpMV
 xtn_tools_pro/utils/helpers.py,sha256=OmPRxB6vDN1S7OhxQWgoOXnxs2YPorwSQKBVzYOiKoA,4928
 xtn_tools_pro/utils/log.py,sha256=mf5huJDA8xVxxFWPG_tl_vOsAA2_ywGDFycYSGHIDCo,10202
 xtn_tools_pro/utils/retry.py,sha256=0wjHsR5DBBKpv4naMfxiky8kprrZes4WURIfFQ4H708,1657
-xtn_tools_pro/utils/set_data.py,sha256=IthfAclck7AbaxOIKOgJZ2wdcfEmlvC-C63Tywcr4bA,11180
+xtn_tools_pro/utils/set_data.py,sha256=BzCLbEDO83csDHBey8kP7SoE6kx6EjqE7OqRBY7k82s,17608
 xtn_tools_pro/utils/sql.py,sha256=EAKzbkZP7Q09j15Gm6o0_uq0qgQmcCQT6EAawbpp4v0,6263
 xtn_tools_pro/utils/time_utils.py,sha256=TUtzG61PeVYXhaQd6pBrXAdlz7tBispNIRQRcGhE2No,4859
-xtn_tools_pro-1.0.1.0.4.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-xtn_tools_pro-1.0.1.0.4.dist-info/METADATA,sha256=NXfWI4TvsFCXCr4IZMpsqP-vUP46smejlUEm8sFXFVY,498
-xtn_tools_pro-1.0.1.0.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-xtn_tools_pro-1.0.1.0.4.dist-info/entry_points.txt,sha256=t8CtXWOgw7nRDW3XNlZh8MT_P4l8EzsFnyWi5b3ovrc,68
-xtn_tools_pro-1.0.1.0.4.dist-info/top_level.txt,sha256=jyB3FLDEr8zE1U7wHczTgIbvUpALhR-ULF7RVEO7O2U,14
-xtn_tools_pro-1.0.1.0.4.dist-info/RECORD,,
+xtn_tools_pro-1.0.1.0.5.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+xtn_tools_pro-1.0.1.0.5.dist-info/METADATA,sha256=8gl2HpIsyu3m2geeB0cjy5a8OqPxDxxOBW6ILMN56eE,498
+xtn_tools_pro-1.0.1.0.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+xtn_tools_pro-1.0.1.0.5.dist-info/entry_points.txt,sha256=t8CtXWOgw7nRDW3XNlZh8MT_P4l8EzsFnyWi5b3ovrc,68
+xtn_tools_pro-1.0.1.0.5.dist-info/top_level.txt,sha256=jyB3FLDEr8zE1U7wHczTgIbvUpALhR-ULF7RVEO7O2U,14
+xtn_tools_pro-1.0.1.0.5.dist-info/RECORD,,
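set_data.py grows from 11,180 to 17,608 bytes, consistent with the roughly 6.4 KB of code added above; the renamed dist-info paths and the rewritten METADATA get fresh digests, while unchanged files (LICENSE, WHEEL, entry_points.txt, top_level.txt) keep theirs. RECORD digests are sha256 hashes encoded as unpadded URL-safe base64 (PEP 376/427); the 0-byte LICENSE entry, for instance, is the digest of empty input:

import base64
import hashlib

def record_digest(data: bytes) -> str:
    # Wheel RECORD digest: sha256, URL-safe base64, '=' padding stripped
    return base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode("ascii")

print(record_digest(b""))  # 47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU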