re-common 10.0.13__tar.gz → 10.0.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {re_common-10.0.13 → re_common-10.0.15}/PKG-INFO +1 -1
- re_common-10.0.15/re_common/v2/baselibrary/decorators/utils.py +59 -0
- re_common-10.0.15/re_common/v2/baselibrary/tools/ac_ahocorasick.py +76 -0
- re_common-10.0.15/re_common/v2/baselibrary/tools/hdfs_data_processer.py +318 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/search_hash_tools.py +4 -3
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/text_matcher.py +131 -28
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/BusinessStringUtil.py +21 -7
- re_common-10.0.15/re_common/v2/baselibrary/utils/basepika.py +180 -0
- re_common-10.0.15/re_common/v2/baselibrary/utils/db.py +38 -0
- re_common-10.0.15/re_common/v2/baselibrary/utils/mq.py +83 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/string_bool.py +13 -1
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/string_clear.py +9 -0
- re_common-10.0.15/re_common/vip/proxy/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common.egg-info/PKG-INFO +1 -1
- {re_common-10.0.13 → re_common-10.0.15}/re_common.egg-info/SOURCES.txt +7 -0
- {re_common-10.0.13 → re_common-10.0.15}/setup.py +1 -1
- {re_common-10.0.13 → re_common-10.0.15}/LICENSE +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/README.md +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/baseabs/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/baseabs/baseabs.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/mbuilder.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/moudle.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/msqlite3.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/mysql.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/sql_factory.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/mthread/MThreadingRun.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/mthread/MThreadingRunEvent.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/mthread/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/mthread/mythreading.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/pakge_other/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/pakge_other/socks.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/readconfig/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/readconfig/config_factory.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/readconfig/ini_config.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/readconfig/toml_config.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/temporary/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/temporary/envdata.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/all_requests/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/all_requests/aiohttp_request.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/all_requests/httpx_requet.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/all_requests/mrequest.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/all_requests/requests_request.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/batch_compre/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/batch_compre/bijiao_batch.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/contrast_db3.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/copy_file.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/db3_2_sizedb3.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/foreachgz.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/get_attr.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/image_to_pdf.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/java_code_deal.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/javacode.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mdb_db3.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/merge_file.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/merge_gz_file.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mhdfstools/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mhdfstools/hdfst.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mongo_tools.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_file.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_mongo/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_mongo/move_mongo_table.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_mongo/use_mttf.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_mongo/use_mv.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mpandas/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mpandas/pandas_visualization.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/myparsel.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/rename_dir_file.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/sequoiadb_utils.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/split_line_to_many.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/stringtodicts.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/workwechant_bot.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseaiohttp.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseaiomysql.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseallstep.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseavro.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseboto3.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basecsv.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basedict.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basedir.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseencode.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseencoding.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseesdsl.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseexcel.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseexcept.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basefile.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseftp.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basegzip.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basehdfs.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basehttpx.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseip.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basejson.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baselist.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basemotor.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basemssql.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseodbc.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basepandas.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basepeewee.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basepika.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basepydash.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basepymongo.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basequeue.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baserar.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baserequest.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseset.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basesmb.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basestring.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basetime.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basetuple.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseurl.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basezip.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/bottomutils.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/mdeprecated.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/mlamada.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/msginfo.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/requests_core.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/fateadm.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/importfun.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/mfaker.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/my_abc/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/my_abc/better_abc.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/mylogger.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/myredisclient.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/pipupgrade.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/ringlist.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/version_compare.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/ydmhttp.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/lazy_import.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/loggerfacade.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/mysqlfacade.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/now.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/sqlite3facade.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/use/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/use/mq_use_facade.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/use/proxy_use_facade.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/base_dict_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/baseavro_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/basefile_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/basemssql_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/baseodbc_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/basepandas_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/get_attr_test/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/get_attr_test/get_attr_test_settings.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/get_attr_test/settings.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/idencode_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/iniconfig_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/ip_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/merge_file_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/mfaker_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/mm3_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/mylogger_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/myparsel_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/mysql_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/pymongo_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/split_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/sqlite3_merge_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/sqlite3_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/tomlconfig_test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/use_tools_test/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/user/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/assignment_expressions.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/mydash/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/mydash/test1.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/pydashstudio/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/pydashstudio/first.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/streamlitstudio/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/streamlitstudio/first_app.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/streamlitstudio/uber_pickups.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/test.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/__init__.py +0 -0
- {re_common-10.0.13/re_common/v2/baselibrary/helpers → re_common-10.0.15/re_common/v2/baselibrary/decorators}/__init__.py +0 -0
- {re_common-10.0.13/re_common/v2/baselibrary/s3object → re_common-10.0.15/re_common/v2/baselibrary/helpers}/__init__.py +0 -0
- {re_common-10.0.13/re_common/v2/baselibrary/tools → re_common-10.0.15/re_common/v2/baselibrary/s3object}/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/s3object/baseboto3.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/WeChatRobot.py +0 -0
- {re_common-10.0.13/re_common/v2/baselibrary/utils → re_common-10.0.15/re_common/v2/baselibrary/tools}/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/dict_tools.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/dolphinscheduler.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/list_tools.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/unionfind_tools.py +0 -0
- {re_common-10.0.13/re_common/vip → re_common-10.0.15/re_common/v2/baselibrary/utils}/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/author_smi.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/basedict.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/basehdfs.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/json_cls.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/n_ary_expression_tree.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/string_smi.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/stringutils.py +0 -0
- {re_common-10.0.13/re_common/vip/proxy → re_common-10.0.15/re_common/vip}/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/base_step_process.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/baseencodeid.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/changetaskname.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/core_var.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/mmh3Hash.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/allproxys.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/allproxys_thread.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/cnki_proxy.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/kuaidaili.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/proxy_all.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/update_kuaidaili_0.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/wanfang_proxy.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/wp_proxy_all.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/read_rawid_to_txt.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformBookTitleToZt.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformConferenceTitleToZt.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformCstadTitleToZt.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformJournalTitleToZt.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformPatentTitleToZt.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformRegulationTitleToZt.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformStandardTitleToZt.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformThesisTitleToZt.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/__init__.py +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common.egg-info/dependency_links.txt +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/re_common.egg-info/top_level.txt +0 -0
- {re_common-10.0.13 → re_common-10.0.15}/setup.cfg +0 -0
re_common-10.0.15/re_common/v2/baselibrary/decorators/utils.py (new file)

```diff
@@ -0,0 +1,59 @@
+import warnings
+import functools
+
+# Global set recording the functions and classes that have already warned
+_warned_once = set()
+
+
+def deprecated(message=None):
+    """
+    Decorator: mark a function or class as deprecated; the warning is emitted only once per process.
+
+    Args:
+        message (str): custom warning message, defaults to None.
+    """
+
+    def decorator(obj):
+        # Function case
+        if isinstance(obj, type(lambda: None)):
+            @functools.wraps(obj)
+            def wrapper(*args, **kwargs):
+                obj_id = id(obj)  # use the object's memory address as its unique id
+                if obj_id not in _warned_once:
+                    default_msg = f"Function {obj.__name__} is deprecated."
+                    warn_msg = f"{default_msg} {message}" if message else default_msg
+                    warnings.warn(
+                        warn_msg,
+                        category=DeprecationWarning,
+                        stacklevel=2
+                    )
+                    _warned_once.add(obj_id)  # mark as warned
+                return obj(*args, **kwargs)
+
+            return wrapper
+
+        # Class case
+        elif isinstance(obj, type):
+            orig_init = obj.__init__
+
+            @functools.wraps(orig_init)
+            def new_init(self, *args, **kwargs):
+                obj_id = id(obj)
+                if obj_id not in _warned_once:
+                    default_msg = f"Class {obj.__name__} is deprecated."
+                    warn_msg = f"{default_msg} {message}" if message else default_msg
+                    warnings.warn(
+                        warn_msg,
+                        category=DeprecationWarning,
+                        stacklevel=2
+                    )
+                    _warned_once.add(obj_id)  # mark as warned
+                orig_init(self, *args, **kwargs)
+
+            obj.__init__ = new_init
+            return obj
+
+        else:
+            raise TypeError("This decorator only applies to functions and classes")
+
+    return decorator
```
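A minimal usage sketch of the new decorator (the function name and replacement hint below are invented for illustration; the `simplefilter` call is there because DeprecationWarning can be hidden by default outside `__main__`):

```python
import warnings

from re_common.v2.baselibrary.decorators.utils import deprecated

warnings.simplefilter("always", DeprecationWarning)  # make the warning visible everywhere

@deprecated("Use fetch_v2() instead.")  # hypothetical replacement name
def fetch():
    return 42

fetch()  # warns once: "Function fetch is deprecated. Use fetch_v2() instead."
fetch()  # silent: the function's id is already in _warned_once
```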
re_common-10.0.15/re_common/v2/baselibrary/tools/ac_ahocorasick.py (new file)

```diff
@@ -0,0 +1,76 @@
+import pickle
+
+import ahocorasick
+
+
+class ACTool(object):
+
+    def __init__(self):
+        self.automaton = ahocorasick.Automaton()
+
+    def add_word(self, key, value, overwrite=True) -> bool:
+        """
+        Add an entry to the AC automaton; by default a duplicate key is simply overwritten
+        :param key: the keyword to add
+        :param value: the associated value
+        :param overwrite: whether to overwrite an existing key, defaults to True
+        :return: whether the entry was added or overwritten
+        """
+        if key in self.automaton:  # check whether the key already exists
+            if overwrite:  # overwriting is allowed
+                self.automaton.add_word(key, value)
+                return True
+            else:  # overwriting not allowed, skip
+                return False
+        else:  # key absent, add it directly
+            self.automaton.add_word(key, value)
+            return True
+
+    def is_exists_key(self, key) -> bool:
+        # whether the key exists
+        if self.automaton.exists(key):
+            return True
+        else:
+            return False
+
+    def make_automaton(self):
+        """
+        Build the automaton; must be called after all words have been added
+        """
+        self.automaton.make_automaton()
+
+    def iter(self, key):
+        """
+        Returns an iterable; convert with list() to get [(end_index, value)]
+            tool.add_word("he", "word1")
+            tool.add_word("hello", "word2")
+
+            # search within a string
+            input_string = "hello world"
+            matches = list(tool.automaton.iter(input_string))
+            print(matches)  # [(1, 'word1'), (4, 'word2')]
+
+        (1, 'word1'):
+            end_index = 1: the match "he" ends at index 1 of input_string = "hello world" (the position of 'e', the last character of "he").
+            Indices of "hello world": h(0)e(1)l(2)l(3)o(4) (5)w(6)o(7)r(8)l(9)d(10).
+            value = 'word1': the value stored for the matched keyword "he".
+        (4, 'word2'):
+            end_index = 4: the match "hello" ends at index 4 of input_string = "hello world" (the position of 'o', the last character of "hello").
+            value = 'word2': the value stored for the matched keyword "hello".
+
+        Note: results contain only the value, never the key; if the key is needed, embed it in the stored value
+        """
+
+        result_iter = self.automaton.iter(key)  # ahocorasick.AutomatonSearchIter
+        return result_iter
+    def save(self, local_temp_path):
+        """
+        Save the built AC automaton to a local file
+        """
+        self.automaton.save(local_temp_path, pickle.dumps)
+
+    def load(self, local_temp_path):
+        """
+        Load a previously built AC automaton
+        """
+        self.automaton = ahocorasick.load(local_temp_path, pickle.loads)
```
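An end-to-end sketch matching the `iter` docstring above (the save path is a placeholder); `make_automaton()` must run before searching:

```python
from re_common.v2.baselibrary.tools.ac_ahocorasick import ACTool

tool = ACTool()
tool.add_word("he", "word1")
tool.add_word("hello", "word2")
tool.make_automaton()                    # build before searching

print(list(tool.iter("hello world")))    # [(1, 'word1'), (4, 'word2')]

tool.save("/tmp/ac.automaton")           # persist via pickle
restored = ACTool()
restored.load("/tmp/ac.automaton")       # ready to search immediately
print(restored.is_exists_key("hello"))   # True
```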
re_common-10.0.15/re_common/v2/baselibrary/tools/hdfs_data_processer.py (new file)

```diff
@@ -0,0 +1,318 @@
+import asyncio
+import gzip
+import json
+import sqlite3
+import time
+import os
+from io import BytesIO
+from typing import Callable, Any, List
+
+from hdfs import InsecureClient
+
+
+class HDFSDataProcessor:
+    def __init__(
+        self,
+        hdfs_url="http://VIP-DC-MASTER-2:9870",
+        hdfs_user="root",
+        db_file="processed_files.db",
+        batch_size=50,
+        retry_limit=3,
+    ):
+        self.hdfs_url = hdfs_url
+        self.hdfs_user = hdfs_user
+        self.db_file = db_file
+        self.batch_size = batch_size
+        self.retry_limit = retry_limit
+        self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
+        self.read_hdfs_fanc = {"all": self.all_read_gz, "batch": self.batch_read_gz}
+        self.read_hdfs_model = "all"
+        self.init_db()
+
+    def init_db(self):
+        """Initialize the SQLite database"""
+        with sqlite3.connect(self.db_file) as conn:
+            cursor = conn.cursor()
+            cursor.execute("""
+                CREATE TABLE IF NOT EXISTS processed_files (
+                    file_path TEXT PRIMARY KEY
+                )
+            """)
+            conn.commit()
+
+    def save_processed_file(self, file_path):
+        """Record a file as processed"""
+        with sqlite3.connect(self.db_file) as conn:
+            cursor = conn.cursor()
+            cursor.execute("INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)", (file_path,))
+            conn.commit()
+
+    def is_file_processed(self, file_path):
+        """Check whether a file has already been processed"""
+        with sqlite3.connect(self.db_file) as conn:
+            cursor = conn.cursor()
+            cursor.execute("SELECT file_path FROM processed_files WHERE file_path = ?", (file_path,))
+            result = cursor.fetchone()
+            return result is not None
+
+    def list_gz_files(self, hdfs_dir):
+        """List all gzip files in an HDFS directory"""
+        return [f"{hdfs_dir}/{file[0]}" for file in self.client.list(hdfs_dir, status=True) if file[0].endswith(".gz")]
+
+    def count_total_lines(self, gz_file_path: str):
+        with self.client.read(gz_file_path) as hdfs_file:
+            with gzip.GzipFile(fileobj=hdfs_file) as gz:
+                return sum(1 for _ in gz)
+
+    def batch_read_gz(self, gz_file_path: str):
+        """Read a gz file in batches"""
+        with self.client.read(gz_file_path) as hdfs_file:
+            with gzip.GzipFile(fileobj=hdfs_file) as gz:
+                while True:
+                    lines = []
+                    for _ in range(self.batch_size):
+                        try:
+                            line = next(gz)
+                            if line.strip():  # drop blank lines
+                                lines.append(line.decode("utf-8"))  # decode
+                        except StopIteration:  # end of file
+                            break
+                    if not lines:
+                        break
+                    yield lines
+
+    def all_read_gz(self, gz_file_path: str, encoding='utf-8'):
+        """
+        Read the content of a .gz file on HDFS.
+        :param gz_file_path: HDFS file path (must end with .gz)
+        :param encoding: file encoding (default utf-8)
+        :return: file content
+        """
+        with self.client.read(gz_file_path) as reader:  # read in binary mode
+            compressed_data = reader.read()  # read the compressed bytes
+        with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:  # decompress
+            content = gz_file.read().decode(encoding)  # decode to a string
+        print(f"File read successfully: {gz_file_path}")
+        lines = [i for i in content.splitlines() if i.strip()]
+        result = [lines[i:i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
+        return result
+
+    async def process_data(self, data, process_func):
+        """Run the handler on one record, with retries"""
+        retry_count = 0
+        while retry_count < self.retry_limit:
+            try:
+                await process_func(data)
+                return  # exit after a successful run
+            except Exception as e:
+                retry_count += 1
+                print(f"Error while processing data: {e}, retrying {retry_count}/{self.retry_limit}, data: {data}")
+                await asyncio.sleep(2 ** retry_count)
+        print(f"Processing failed, retry limit reached, data: {data}")
+
+    async def process_file(self, hdfs_file_path, process_func):
+        """Process a single gz file"""
+        total_lines = self.count_total_lines(hdfs_file_path)
+        processed_lines = 0
+        start_time = time.time()
+        # Pick the configured file-reading strategy
+        for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
+            processing_start_time = time.time()  # start time of this batch
+
+            tasks = []
+            for line in lines:
+                try:
+                    data = json.loads(line)
+                    tasks.append(self.process_data(data, process_func))
+                except json.JSONDecodeError as e:
+                    print(f"JSON parse failed: {e}, line: {line.strip()}")
+
+            # await AsyncTaskPool(self.batch_size).run(tasks)  # AsyncTaskPool suits submitting all tasks at once with a concurrency cap
+            await asyncio.gather(*tasks)
+
+            processed_lines += len(lines)
+
+            elapsed_time = time.time() - start_time  # time used so far
+            processing_time = time.time() - processing_start_time  # time for this batch
+            avg_processing_time = (
+                (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
+            )  # average time per record, in milliseconds
+
+            # Estimate the remaining time
+            remaining_time = (
+                ((avg_processing_time / 1000) * (total_lines - processed_lines))
+                if processed_lines > 0
+                else float("inf")
+            )
+
+            # Report overall progress
+            print(
+                f"File: {hdfs_file_path} progress: {processed_lines}/{total_lines} lines | "
+                f"elapsed: {elapsed_time:.2f}s | this batch: {processing_time:.2f}s | "
+                f"estimated remaining: {remaining_time:.2f}s | avg per record: {avg_processing_time:.2f}ms"
+            )
+
+        # Final progress report
+        final_elapsed_time = time.time() - start_time  # total elapsed time
+        print(
+            f"File: {hdfs_file_path} finished | progress: {processed_lines}/{total_lines} lines | "
+            f"total elapsed: {final_elapsed_time:.2f}s | "
+            f"avg per record: {(final_elapsed_time * 1000) / processed_lines:.2f}ms"
+            if processed_lines > 0
+            else "no data processed"
+        )
+
+        self.save_processed_file(hdfs_file_path)  # record the file as processed
+
+    async def retry_process_file(self, hdfs_file_path, process_func):
+        """File processing with a retry mechanism"""
+        retry_count = 0
+        while retry_count < self.retry_limit:
+            try:
+                await self.process_file(hdfs_file_path, process_func)
+                return True  # exit after a successful run
+            except Exception as e:
+                retry_count += 1
+                print(f"Error while processing file {hdfs_file_path}: {e}, retrying {retry_count}/{self.retry_limit}")
+                await asyncio.sleep(2 ** retry_count)
+        print(f"Failed to process file {hdfs_file_path}, retry limit reached")
+        return False
+        # raise
+
+    async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[dict], Any]):
+        """Process all gz files under a directory"""
+        gz_files = self.list_gz_files(hdfs_dir)
+        all_succeed = True
+        for hdfs_file_path in gz_files:
+            if self.is_file_processed(hdfs_file_path):
+                print(f"Skipping already processed file: {hdfs_file_path}")
+                continue  # skip files that were already processed
+            succeed = await self.retry_process_file(hdfs_file_path, process_func)  # process the file
+            if succeed is False:
+                all_succeed = False
+
+        if all_succeed:
+            # Delete the checkpoint database once everything succeeded
+            try:
+                if os.path.exists(self.db_file):
+                    os.remove(self.db_file)
+                    print(f"Removed resume-checkpoint file: {self.db_file}")
+            except Exception as e:
+                print(f"Failed to remove resume-checkpoint file: {e}")
+
+    async def process_file_bulk(self, hdfs_file_path, process_func):
+        """Process a single file batch-by-batch, passing whole batches to the handler"""
+        total_lines = self.count_total_lines(hdfs_file_path)
+        processed_lines = 0
+        start_time = time.time()
+
+        tasks = []
+        # Pick the configured file-reading strategy
+        for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
+            processing_start_time = time.time()  # start time of this batch
+
+            batch_data = []
+            for line in lines:
+                try:
+                    data = json.loads(line)
+                    batch_data.append(data)
+                except json.JSONDecodeError as e:
+                    print(f"JSON parse failed: {e}, line: {line.strip()}")
+
+            # Handle the batch that was just read
+            if batch_data:
+                tasks.append(process_func(batch_data))  # hand the batch to the handler and collect the task
+                processed_lines += len(batch_data)  # update the processed-line count
+
+            # Once batch_size tasks have accumulated, run them all concurrently
+            if len(tasks) >= self.batch_size:
+                await asyncio.gather(*tasks)  # process several batches at once
+
+                elapsed_time = time.time() - start_time  # time used so far
+                processing_time = time.time() - processing_start_time  # time for this batch
+                avg_processing_time = (
+                    (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
+                )  # average time per record, in milliseconds
+
+                # Estimate the remaining time
+                remaining_time = (
+                    ((avg_processing_time / 1000) * (total_lines - processed_lines))
+                    if processed_lines > 0
+                    else float("inf")
+                )
+
+                # Report overall progress
+                print(
+                    f"File: {hdfs_file_path} progress: {processed_lines}/{total_lines} lines | "
+                    f"elapsed: {elapsed_time:.2f}s | this batch: {processing_time:.2f}s | "
+                    f"estimated remaining: {remaining_time:.2f}s | avg per record: {avg_processing_time:.2f}ms"
+                )
+
+                # Clear the task list, ready for the next round
+                tasks.clear()
+        # Flush the remaining tasks
+        if tasks:
+            await asyncio.gather(*tasks)  # run leftover tasks that never reached batch_size
+
+        # Final progress report
+        final_elapsed_time = time.time() - start_time  # total elapsed time
+        print(
+            f"File: {hdfs_file_path} finished | progress: {processed_lines}/{total_lines} lines | "
+            f"total elapsed: {final_elapsed_time:.2f}s | "
+            f"avg per record: {(final_elapsed_time * 1000) / processed_lines:.2f}ms"
+            if processed_lines > 0
+            else "no data processed"
+        )
+
+        self.save_processed_file(hdfs_file_path)
+
+    async def retry_process_file_bulk(self, hdfs_file_path, process_func):
+        """Bulk file processing with a retry mechanism"""
+        retry_count = 0
+        while retry_count < self.retry_limit:
+            try:
+                await self.process_file_bulk(hdfs_file_path, process_func)
+                return True  # exit after a successful run
+            except Exception as e:
+                retry_count += 1
+                print(f"Error while processing file {hdfs_file_path}: {e}, retrying {retry_count}/{self.retry_limit}")
+                await asyncio.sleep(2 ** retry_count)
+        print(f"Failed to process file {hdfs_file_path}, retry limit reached")
+        return False
+
+    async def batch_process_file_bulk(self, hdfs_dir: str, process_func: Callable[[List[dict]], Any]):
+        """Bulk-process the data in all gz files"""
+        gz_files = self.list_gz_files(hdfs_dir)
+        all_succeed = True
+        for hdfs_file_path in gz_files:
+            if self.is_file_processed(hdfs_file_path):
+                print(f"Skipping already processed file: {hdfs_file_path}")
+                continue  # skip processed files
+            succeed = await self.retry_process_file_bulk(hdfs_file_path, process_func)
+            if succeed is False:
+                all_succeed = False
+
+        if all_succeed:
+            # Delete the checkpoint database once everything succeeded
+            try:
+                if os.path.exists(self.db_file):
+                    os.remove(self.db_file)
+                    print(f"Removed resume-checkpoint file: {self.db_file}")
+            except Exception as e:
+                print(f"Failed to remove resume-checkpoint file: {e}")
+
+# # Usage example
+# async def update_refer(data: dict):
+#     ref_id = data["ref_id"]
+#     url = f"http://192.168.98.79:8150/v1/fact_refer/update/{ref_id}"
+#     update_data = data["update_data"]
+#     if not update_data:
+#         return
+#
+#     # the actual processing logic goes here
+#     await ApiNetUtils.fetch_post(url=url, payload=update_data)
+#
+#
+# if __name__ == "__main__":
+#     processor = HDFSDataProcessor()  # instantiate the processor
+#     asyncio.run(processor.batch_process_file("/user/libaiyun/output/confidence", update_refer))
```
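A sketch of driving the processor end to end with the bulk API, assuming a directory of gzipped JSON-lines files on HDFS; the namenode URL, path, and handler body are placeholders:

```python
import asyncio

from re_common.v2.baselibrary.tools.hdfs_data_processer import HDFSDataProcessor

async def handle_batch(batch):
    # batch is a list of parsed JSON dicts; real work (e.g. async HTTP calls) goes here
    for record in batch:
        print(record.get("id"))

processor = HDFSDataProcessor(hdfs_url="http://namenode:9870", batch_size=100)
processor.read_hdfs_model = "batch"  # stream batches instead of reading whole files into memory
asyncio.run(processor.batch_process_file_bulk("/user/demo/input", handle_batch))
```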
re_common/v2/baselibrary/tools/search_hash_tools.py

```diff
@@ -3,9 +3,10 @@ from typing import List
 import jieba
 from datasketch import MinHash, minhash
 
+from re_common.v2.baselibrary.decorators.utils import deprecated
 from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
 
-
+@deprecated("Please use the methods in TextMatcherV2 instead.")
 def tokenize(text: str, stopwords=None) -> List[str]:
     """
     Tokenize and remove stopwords
@@ -32,7 +33,7 @@ def tokenize(text: str, stopwords=None) -> List[str]:
     words = [w for w in words if w not in stopwords and w.strip()]
     return words
 
-
+@deprecated("Please use the methods in TextMatcherV2 instead.")
 def create_minhash(words: List[str], num_perm=128) -> MinHash:
     """
     Create a MinHash from a token list
@@ -42,7 +43,7 @@ def create_minhash(words: List[str], num_perm=128) -> MinHash:
         minhash.update(word.encode("utf-8"))
     return minhash
 
-
+@deprecated("Please use the methods in TextMatcherV2 instead.")
 def get_str_minhash(title):
     from re_common.v2.baselibrary.utils.string_clear import rel_clear
     rel_title = rel_clear(title)
```
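The deprecated free functions map onto `TextMatcherV2` (added in `text_matcher.py`, next hunks); a migration sketch with a made-up input string:

```python
from re_common.v2.baselibrary.tools.text_matcher import TextMatcherV2, JiebaTokenize

matcher = TextMatcherV2(tdk=JiebaTokenize())

# old: words = tokenize(text); mh = create_minhash(words)
words = matcher.create_words("an example sentence to hash")
mh = matcher.create_minhash(words)

# old: mh = get_str_minhash(title) — note that V2 does not apply rel_clear() first
mh2 = matcher.str_to_minihash("an example sentence to hash")
```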
re_common/v2/baselibrary/tools/text_matcher.py

```diff
@@ -1,10 +1,16 @@
+import pickle
+
 import jieba
 import re
-from typing import List, Dict, Tuple, Set, Optional, Union
+from typing import List, Dict, Tuple, Set, Optional, Union, Hashable, Protocol
 from datasketch import MinHash, MinHashLSH
 
+from re_common.v2.baselibrary.decorators.utils import deprecated
+from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
+
 
-
+@deprecated("Please use TextMatcherV2 instead.")
+class TextMatcher(object):
     def __init__(
         self,
         threshold: float = 0.5,
```
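With the class decorated, the warning fires once, on first instantiation (sketch; assumes the remaining `TextMatcher` constructor arguments keep their defaults):

```python
import warnings

from re_common.v2.baselibrary.tools.text_matcher import TextMatcher

warnings.simplefilter("always", DeprecationWarning)
m1 = TextMatcher()  # warns: "Class TextMatcher is deprecated. Please use TextMatcherV2 instead."
m2 = TextMatcher()  # no second warning in this process
```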
re_common/v2/baselibrary/tools/text_matcher.py (continued)

```diff
@@ -188,36 +194,133 @@ class TextMatcher:
         self.doc_counter = 0
 
 
-
-
-
-
-
-
+# Define a protocol describing the required "duck-like" tokenizer behaviour
+class TokenizeDuckLike(Protocol):
+    def get_words(self, text) -> List:
+        pass
+
+
+class JiebaTokenize(object):
+
+    def __init__(self, stopwords=None):
+        self.stopwords = stopwords
+
+    def get_words(self, text) -> List:
+
+        # fix: fall back to an empty list (the original assigned a local only when self.stopwords was None)
+        stopwords = self.stopwords or []
+        words = jieba.lcut(text)
+
+        # Count single-character tokens: when jieba cannot segment a word it splits the data into single characters
+
+        # A helper function is used instead of inlining the condition in the comprehension: in some Spark contexts the comprehension's if clause is not short-circuited and the expression is treated as one operator
+        def is_single_en(i):
+            if len(i) == 1 and not is_single_cjk_char(i):
+                return True
+            return False
+
+        one_char_size = len([i for i in words if is_single_en(i)])
+        all_size = len(words)
+        # If single characters exceed a certain ratio, fall back to whitespace tokenization
+        if all_size != 0 and one_char_size / all_size > 0.6:
+            words = [i for i in text.split() if i.strip()]
+
+        # Filter stopwords and empty strings
+        words = [w for w in words if w not in stopwords and w.strip()]
+        return words
+
+
+class TextMatcherV2(object):
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        num_perm: int = 128,
+        tdk: TokenizeDuckLike = None
+    ):
+        """
+        Initialize the text matcher
+
+        Args:
+            threshold: LSH similarity threshold
+            num_perm: number of MinHash permutations
+            tdk: tokenizer implementing get_words() (replaces the old
+                stopwords_path / user_dict_path constructor arguments)
+        """
+        self.threshold = threshold
+        self.num_perm = num_perm
+        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
+        self.tdk = tdk
+
+    def add_document(self, doc_id: str, minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None):
+        if isinstance(minhash, str):
+            minhash = self.str_to_minihash(minhash, tdk)
 
-
-        doc_id = matcher.add_document(
-            "北京是中国的首都"
-        )
+        self.lsh.insert(doc_id, minhash)
 
-
-
-
+    def batch_add_documents(self, betch_data: Union[list, dict], tdk: TokenizeDuckLike = None):
+        def _add_document(docid, minhash_or_str, tdk):
+            if isinstance(minhash_or_str, str):
+                minhash_or_str = self.str_to_minihash(minhash_or_str, tdk)
+            self.add_document(docid, minhash_or_str, tdk)
+
+        if isinstance(betch_data, list):
+            # each item must unpack into a two-element tuple or list
+            for docid, minhash_or_str in betch_data:
+                _add_document(docid, minhash_or_str, tdk)
+        elif isinstance(betch_data, dict):
+            for docid, minhash_or_str in betch_data.items():
+                _add_document(docid, minhash_or_str, tdk)
+        else:
+            raise Exception("Unsupported data type")
+
+    def find_similar(self, query_minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None) -> List[Hashable]:
+        # Use LSH to find the candidate set
+        if isinstance(query_minhash, str):
+            query_minhash = self.str_to_minihash(query_minhash, tdk)
+        similar_docs = self.lsh.query(query_minhash)
+        return similar_docs
 
-
-
-
+    def create_minhash(self, words: List[str], num_perm=None) -> MinHash:
+        """
+        Create a MinHash from a token list
+        """
+        if num_perm is None:
+            num_perm = self.num_perm
+        minhash = MinHash(num_perm=num_perm)
+        for word in words:
+            minhash.update(word.encode("utf-8"))
+        return minhash
 
-
-
-
+    def create_words(self, text: str, tdk: TokenizeDuckLike = None):
+        if tdk is None:
+            tdk = self.tdk
+        words = tdk.get_words(text)
+        return words
+
+    def str_to_minihash(self, text: str, tdk: TokenizeDuckLike = None):
+        if tdk is None:
+            tdk = self.tdk
+        words = self.create_words(text, tdk)
+        minhash = self.create_minhash(words, self.num_perm)
+        return minhash
 
-
-
+    def minhash_dumps(self, minhash) -> bytes:
+        """
+        Serialize a MinHash
+        """
+        serialized_minhash = pickle.dumps(minhash)
+        return serialized_minhash
 
-
-
+    def minhash_loads(self, serialized_minhash) -> MinHash:
+        """
+        Deserialize a MinHash
+        """
+        minhash = pickle.loads(serialized_minhash)
+        return minhash
 
-
-
+    def merge_other_minhashlsh(self, other_minhashlsh: MinHashLSH):
+        """
+        Merge an LSH index that was built elsewhere into this one
+        """
+        self.lsh.merge(other_minhashlsh)
```