xtn-tools-pro 1.0.0.7.3-py3-none-any.whl → 1.0.0.7.4-py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- xtn_tools_pro/utils/log.py +1 -1
- xtn_tools_pro/utils/set_data.py +126 -60
- {xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/METADATA +1 -1
- {xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/RECORD +7 -7
- {xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/LICENSE +0 -0
- {xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/WHEEL +0 -0
- {xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/top_level.txt +0 -0
xtn_tools_pro/utils/log.py
CHANGED
xtn_tools_pro/utils/set_data.py
CHANGED
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-#
+# Description
 # Large-file deduplication
 # History:
 # Date Author Version Modification
@@ -14,10 +14,10 @@ import hashlib
 from tqdm import tqdm
 from xtn_tools_pro.utils.log import Log
 from xtn_tools_pro.utils.helpers import get_orderId_random
-from xtn_tools_pro.utils.file_utils import mkdirs_dir, get_file_extension,is_dir,get_listdir
+from xtn_tools_pro.utils.file_utils import mkdirs_dir, get_file_extension, is_dir, get_listdir
 
 
-class PppSetDataObj:
+class PSetDataObj:
     def __init__(self):
         # Randomly generate a temporary working directory
         self.__order_id = get_orderId_random()
@@ -34,7 +34,7 @@ class PppSetDataObj:
         """
         Deduplicate a single file, air version; the source file is not modified, and any data can be deduplicated
         :param set_file_path: path of the single file
-        :param num_shards
+        :param num_shards: number of temporary shard files; the larger the data, the larger the recommended value, e.g. 1000
         :return:
         """
         if get_file_extension(set_file_path) != ".txt":
@@ -48,7 +48,7 @@ class PppSetDataObj:
 
         num_shards = 3000 if num_shards >= 3000 else num_shards
         num_shards = 3000 if line_count >= 30000000 else num_shards
-        num_shards = 1000 if num_shards <=
+        num_shards = 1000 if num_shards <= 1 else num_shards
 
         shard_file_obj_list = []
         shard_path_list = []
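Read in order, the three guards above clamp the shard count: values of 3000 or more are capped, inputs of 30 million lines or more force the maximum, and (with this fix) values of 1 or less fall back to 1000. A standalone sketch of the same logic, with a hypothetical helper name:

def clamp_num_shards(num_shards, line_count):
    # Mirrors the three guard lines in the hunk above, in the same order.
    if num_shards >= 3000:      # cap the number of temporary shard files
        num_shards = 3000
    if line_count >= 30000000:  # very large inputs always use the maximum
        num_shards = 3000
    if num_shards <= 1:         # degenerate values fall back to a sane default
        num_shards = 1000
    return num_shards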
@@ -58,8 +58,9 @@ class PppSetDataObj:
             shard_file_obj_list.append(open(shard_path, "w", encoding="utf-8"))
 
         with open(set_file_path, "r", encoding="utf-8") as f_r:
-            tqdm_f = tqdm(f_r, total=line_count, desc="正在去重(1/2)",
-
+            tqdm_f = tqdm(f_r, total=line_count, desc="正在去重(1/2)",
+                          bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
+            for line_i in tqdm_f:
                 line = line_i.strip().encode()
                 line_hash = hashlib.md5(line).hexdigest()
                 shard_id = int(line_hash, 16) % num_shards
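The routing rule in this hunk is what makes the two-pass scheme correct: identical lines produce identical MD5 digests, so every copy of a line is guaranteed to land in the same shard file and duplicates never need to be compared across shards. A minimal self-contained sketch of that rule, with a hypothetical function name:

import hashlib

def shard_for(line: bytes, num_shards: int) -> int:
    # Equal inputs hash equally, so all copies of a line map to one shard.
    return int(hashlib.md5(line).hexdigest(), 16) % num_shards

assert shard_for(b"abc", 1000) == shard_for(b"abc", 1000)  # duplicates collide by design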
@@ -69,7 +70,8 @@ class PppSetDataObj:
             shard_file_obj.close()
 
         result_w_path = os.path.join(self.__now_current_working_dir, "000_去重结果.txt")
-        tqdm_f = tqdm(shard_path_list, total=len(shard_path_list), desc="正在去重(2/2)",
+        tqdm_f = tqdm(shard_path_list, total=len(shard_path_list), desc="正在去重(2/2)",
+                      bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
         with open(result_w_path, "w", encoding="utf-8") as f_w:
             for shard_path in tqdm_f:
                 with open(shard_path, "r", encoding="utf-8") as f_r:
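Because pass 1 already grouped all duplicates into the same shard, pass 2 only ever needs one shard in memory at a time; the per-shard step (spelled out in the + lines of the pro-version hunk below) is a plain set() pass. A minimal sketch with a hypothetical function name; note that set() does not preserve the original line order, and the diffed code makes no ordering promise either:

def dedup_shard(shard_path: str) -> list:
    # One shard fits in memory, so a set removes duplicates in a single pass.
    with open(shard_path, "r", encoding="utf-8") as f_r:
        return list({line.strip() for line in f_r})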
@@ -86,11 +88,11 @@ class PppSetDataObj:
             line_count = sum(1 for _ in fp_r)
         self.__logger.info(f"文件处理完毕,去重后总行数为:{line_count},结果路径:{result_w_path}")
 
-    def set_file_data_pro(self, set_file_dir_path, num_shards=
+    def set_file_data_pro(self, set_file_dir_path, num_shards=3000):
         """
         Deduplicate all .txt files under a directory, pro version; the source files are not modified, and any data can be deduplicated
         :param set_file_dir_path: directory path
-        :param num_shards
+        :param num_shards: number of temporary shard files; the larger the data, the larger the recommended value, e.g. 1000
         :return:
         """
         if not is_dir(set_file_dir_path):
@@ -101,55 +103,119 @@ class PppSetDataObj:
         set_file_path_list = []
         for set_file_name in get_listdir(set_file_dir_path):
             if fnmatch.fnmatch(set_file_name, '*.txt'):
-                set_file_path_list.append(os.path.join(set_file_dir_path,set_file_name))
+                set_file_path_list.append(os.path.join(set_file_dir_path, set_file_name))
         self.__logger.info(f"当前文件夹下可去重文件数量为:{len(set_file_path_list)}")
 
-[49 removed lines (old 107-155) are collapsed in this diff view]
+        num_shards = 3000 if num_shards >= 3000 else num_shards
+        num_shards = 1000 if num_shards <= 1000 else num_shards
+
+        shard_file_obj_list = []
+        shard_path_list = []
+        for _ in range(num_shards):
+            shard_path = f"{os.path.join(self.__now_current_working_dir, f'{self.__order_id}_shard_{_}.tmp')}"
+            shard_path_list.append(shard_path)
+            shard_file_obj_list.append(open(shard_path, "w", encoding="utf-8"))
+
+        for _ in range(len(set_file_path_list)):
+            set_file_path = set_file_path_list[_]
+            with open(set_file_path, "r", encoding="utf-8") as fp_r:
+                line_count = sum(1 for _ in fp_r)
+            # self.__logger.info(f"{set_file_path}读取完成,总行数为:{line_count}")
+
+            with open(set_file_path, "r", encoding="utf-8") as f_r:
+                tqdm_f = tqdm(f_r, total=line_count, desc=f"正在去重({_ + 1}/{len(set_file_path_list) + 1})",
+                              bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
+                for line_i in tqdm_f:
+                    line = line_i.strip().encode()
+                    line_hash = hashlib.md5(line).hexdigest()
+                    shard_id = int(line_hash, 16) % num_shards
+                    shard_file_obj_list[shard_id].write(line_i)
+
+        for shard_file_obj in shard_file_obj_list:
+            shard_file_obj.close()
+
+        result_w_path = os.path.join(self.__now_current_working_dir, "000_去重结果.txt")
+        tqdm_f = tqdm(shard_path_list, total=len(shard_path_list),
+                      desc=f"正在去重({len(set_file_path_list) + 1}/{len(set_file_path_list) + 1})",
+                      bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
+        with open(result_w_path, "w", encoding="utf-8") as f_w:
+            for shard_path in tqdm_f:
+                with open(shard_path, "r", encoding="utf-8") as f_r:
+                    seen_list = []
+                    for line_i in f_r.readlines():
+                        line = line_i.strip()
+                        seen_list.append(line)
+                    seen_list = list(set(seen_list))
+                    w_txt = "\n".join(seen_list)
+                    f_w.write(w_txt + "\n")
+                os.remove(shard_path)  # remove the temporary shard file
+
+        with open(result_w_path, "r", encoding="utf-8") as fp_r:
+            line_count = sum(1 for _ in fp_r)
+        self.__logger.info(f"文件处理完毕,去重后总行数为:{line_count},结果路径:{result_w_path}")
+
+    def merging_data(self, file_dir_path, merging_new_file_name="合并"):
+        """
+        Given a directory, merge the data of all .txt files under it
+        :param file_dir_path: directory path
+        :param merging_new_file_name: name of the merged output file
+        :return:
+        """
+        if not is_dir(file_dir_path):
+            self.__logger.critical("文件夹不存在或不合法")
+            return
+
+        self.__logger.info("正在统计文件可合并数量...")
+        file_path_list = []
+        for set_file_name in get_listdir(file_dir_path):
+            if fnmatch.fnmatch(set_file_name, '*.txt'):
+                if set_file_name == f"{merging_new_file_name}.txt": continue
+                file_path_list.append(os.path.join(file_dir_path, set_file_name))
+        self.__logger.info(f"当前文件夹下可合并文件数量为:{len(file_path_list)}")
+
+        result_w_path = os.path.join(file_dir_path, f"{merging_new_file_name}.txt")
+
+        with open(result_w_path, "w", encoding="utf-8") as f_w:
+            for _ in range(len(file_path_list)):
+                file_path = file_path_list[_]
+                with open(file_path, "r", encoding="utf-8") as fp_r:
+                    line_count = sum(1 for _ in fp_r)
+
+                with open(file_path, "r", encoding="utf-8") as f_r:
+                    tqdm_f = tqdm(f_r, total=line_count,
+                                  desc=f"正在合并({_ + 1}/{len(file_path_list)})",
+                                  bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
+                    for line_i in tqdm_f:
+                        line = line_i.strip()
+                        f_w.write(line + "\n")
+
+    def split_data(self, file_path, split_new_file_name="分割", file_index=1, file_max_line=1000000):
+        if get_file_extension(file_path) != ".txt":
+            self.__logger.critical("文件不合法,只接受.txt文件")
+            return
+        self.__logger.info("正在读取文件总行数...")
+
+        with open(file_path, "r", encoding="utf-8") as fp_r:
+            line_count = sum(1 for _ in fp_r)
+        self.__logger.info(f"读取文件完成,总行数为:{line_count}")
+
+        with open(file_path, "r", encoding="utf-8") as f_r:
+            tqdm_f = tqdm(f_r, total=line_count, desc="正在分割(1/1)",
+                          bar_format="{l_bar}{bar}|{n}/{total} [预计完成时间:{remaining}]")
+            temp_line_list = []
+            parent_path = os.path.dirname(file_path)
+            for line_i in tqdm_f:
+                line = line_i.strip()
+                temp_line_list.append(line)
+                if len(temp_line_list) == file_max_line:
+                    result_w_path = os.path.join(parent_path, f"{split_new_file_name}_{file_index}.txt")
+                    self.__list_to_write_file(result_w_path, temp_line_list)
+                    temp_line_list = []
+                    file_index += 1
+            if temp_line_list:
+                result_w_path = os.path.join(parent_path, f"{split_new_file_name}_{file_index}.txt")
+                self.__list_to_write_file(result_w_path, temp_line_list)
+
+    def __list_to_write_file(self, file_w_path, data_list):
+        with open(file_w_path, "w", encoding="utf-8") as result_w_f:
+            result_w_f.write("\n".join(data_list))
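Taken together, this release renames the class to PSetDataObj and rounds it out with directory-level dedup, merge, and split helpers. A usage sketch, assuming the import path implied by the file location; the paths are placeholders, and since the single-file "air" method's name is not fully visible in this diff, only methods shown above are called:

from xtn_tools_pro.utils.set_data import PSetDataObj

obj = PSetDataObj()
obj.set_file_data_pro(r"D:\data\txts", num_shards=3000)    # dedup every .txt in the folder
obj.merging_data(r"D:\data\txts")                          # merge all .txt files into 合并.txt
obj.split_data(r"D:\data\big.txt", file_max_line=1000000)  # split into 1,000,000-line chunks

Note that set_file_data_pro writes its deduplicated output (000_去重结果.txt) into the object's temporary working directory (self.__now_current_working_dir), not into the input folder.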
{xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/RECORD
CHANGED
@@ -17,13 +17,13 @@ xtn_tools_pro/utils/__init__.py,sha256=I1_n_NP23F2lBqlF4EOlnOdLYxM8M4pbn63UhJN1h
 xtn_tools_pro/utils/crypto.py,sha256=oyzFqWum_oimUtzhfVCELQhdMjxDbLu-nOWfcNmazcc,4087
 xtn_tools_pro/utils/file_utils.py,sha256=obaBP7CaBCsXxzqGeWzV2l0yw7vicgKOaXzmpMV8ips,2567
 xtn_tools_pro/utils/helpers.py,sha256=H-a3gnahIah3kJqyKzzKlPWtVQYcFlJncz2rAfBqIiw,4444
-xtn_tools_pro/utils/log.py,sha256=
+xtn_tools_pro/utils/log.py,sha256=mf5huJDA8xVxxFWPG_tl_vOsAA2_ywGDFycYSGHIDCo,10202
 xtn_tools_pro/utils/retry.py,sha256=0wjHsR5DBBKpv4naMfxiky8kprrZes4WURIfFQ4H708,1657
-xtn_tools_pro/utils/set_data.py,sha256=
+xtn_tools_pro/utils/set_data.py,sha256=IthfAclck7AbaxOIKOgJZ2wdcfEmlvC-C63Tywcr4bA,11180
 xtn_tools_pro/utils/sql.py,sha256=EAKzbkZP7Q09j15Gm6o0_uq0qgQmcCQT6EAawbpp4v0,6263
 xtn_tools_pro/utils/time_utils.py,sha256=TUtzG61PeVYXhaQd6pBrXAdlz7tBispNIRQRcGhE2No,4859
-xtn_tools_pro-1.0.0.7.
-xtn_tools_pro-1.0.0.7.
-xtn_tools_pro-1.0.0.7.
-xtn_tools_pro-1.0.0.7.
-xtn_tools_pro-1.0.0.7.
+xtn_tools_pro-1.0.0.7.4.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+xtn_tools_pro-1.0.0.7.4.dist-info/METADATA,sha256=gwssgHhEwWGdiJBfIaTSKm6fDbZYU1AqPYJjTFm8EtE,498
+xtn_tools_pro-1.0.0.7.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+xtn_tools_pro-1.0.0.7.4.dist-info/top_level.txt,sha256=jyB3FLDEr8zE1U7wHczTgIbvUpALhR-ULF7RVEO7O2U,14
+xtn_tools_pro-1.0.0.7.4.dist-info/RECORD,,
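For reference, the sha256= values in RECORD are not hex digests: per the wheel spec (PEP 376/427), each is the urlsafe base64 encoding of the raw SHA-256 digest with trailing "=" padding stripped, followed by the file size in bytes. A sketch that recomputes an entry from a local file:

import base64
import hashlib
import os

def record_entry(path):
    # RECORD line format: <path>,sha256=<urlsafe-b64 digest, unpadded>,<size>
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={b64},{os.path.getsize(path)}"

The LICENSE entry above (size 0, hash 47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU) is the digest of an empty file.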
{xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/LICENSE
File without changes
{xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/WHEEL
File without changes
{xtn_tools_pro-1.0.0.7.3.dist-info → xtn_tools_pro-1.0.0.7.4.dist-info}/top_level.txt
File without changes