re_common-10.0.22-py3-none-any.whl → re_common-10.0.24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +195 -0
  145. re_common/v2/baselibrary/business_utils/__init__.py +0 -0
  146. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -0
  147. re_common/v2/baselibrary/decorators/utils.py +59 -59
  148. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  149. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -79
  150. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  151. re_common/v2/baselibrary/tools/dict_tools.py +37 -37
  152. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  153. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  154. re_common/v2/baselibrary/tools/list_tools.py +65 -65
  155. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  156. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  157. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  158. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  159. re_common/v2/baselibrary/utils/author_smi.py +360 -360
  160. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  161. re_common/v2/baselibrary/utils/basedict.py +37 -37
  162. re_common/v2/baselibrary/utils/basehdfs.py +161 -161
  163. re_common/v2/baselibrary/utils/basepika.py +180 -180
  164. re_common/v2/baselibrary/utils/basetime.py +77 -77
  165. re_common/v2/baselibrary/utils/db.py +38 -38
  166. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  167. re_common/v2/baselibrary/utils/mq.py +83 -83
  168. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  169. re_common/v2/baselibrary/utils/string_bool.py +186 -149
  170. re_common/v2/baselibrary/utils/string_clear.py +227 -204
  171. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  172. re_common/v2/baselibrary/utils/stringutils.py +213 -213
  173. re_common/vip/base_step_process.py +11 -11
  174. re_common/vip/baseencodeid.py +90 -90
  175. re_common/vip/changetaskname.py +28 -28
  176. re_common/vip/core_var.py +24 -24
  177. re_common/vip/mmh3Hash.py +89 -89
  178. re_common/vip/proxy/allproxys.py +127 -127
  179. re_common/vip/proxy/allproxys_thread.py +159 -159
  180. re_common/vip/proxy/cnki_proxy.py +153 -153
  181. re_common/vip/proxy/kuaidaili.py +87 -87
  182. re_common/vip/proxy/proxy_all.py +113 -113
  183. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  184. re_common/vip/proxy/wanfang_proxy.py +152 -152
  185. re_common/vip/proxy/wp_proxy_all.py +181 -181
  186. re_common/vip/read_rawid_to_txt.py +91 -91
  187. re_common/vip/title/__init__.py +5 -5
  188. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  189. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  190. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  191. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  192. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  193. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  194. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  195. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  196. re_common/vip/title/transform/__init__.py +10 -10
  197. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/LICENSE +201 -201
  198. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/METADATA +16 -16
  199. re_common-10.0.24.dist-info/RECORD +230 -0
  200. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/WHEEL +1 -1
  201. re_common-10.0.22.dist-info/RECORD +0 -227
  202. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/top_level.txt +0 -0
@@ -1,326 +1,326 @@
- import pickle
-
- import jieba
- import re
- from typing import List, Dict, Tuple, Set, Optional, Union, Hashable, Protocol
- from datasketch import MinHash, MinHashLSH
-
- from re_common.v2.baselibrary.decorators.utils import deprecated
- from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
-
-
- @deprecated("请使用 TextMatcherV2 代替。")
- class TextMatcher(object):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         num_perm: int = 128,
-         is_raw_texts=True,
-         stopwords_path: Optional[str] = None,
-         user_dict_path: Optional[str] = None,
-
-     ):
-         """
-         初始化文本匹配器
-
-         Args:
-             threshold: LSH 相似度阈值
-             num_perm: MinHash 排列数
-             stopwords_path: 停用词文件路径
-             user_dict_path: 用户自定义词典路径
-         """
-         self.threshold = threshold
-         self.num_perm = num_perm
-         self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
-         # self.minhashes: Dict[str, MinHash] = {}
-         self.raw_texts: Dict[str, str] = {}
-         self.is_raw_texts = is_raw_texts
-         self.doc_counter = 0
-
-         # 加载停用词
-         self.stopwords: Set[str] = set()
-         if stopwords_path:
-             self.load_stopwords(stopwords_path)
-
-         # 加载用户词典
-         if user_dict_path:
-             jieba.load_userdict(user_dict_path)
-
-     def load_stopwords(self, stopwords_path: str) -> None:
-         """加载停用词"""
-         with open(stopwords_path, "r", encoding="utf-8") as f:
-             self.stopwords = set(line.strip() for line in f)
-
-     def preprocess_text(self, text: str) -> str:
-         """
-         文本预处理
-         """
-         # 转换为小写
-         text = text.lower()
-         # 移除特殊字符
-         text = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)
-         # 移除多余空格
-         text = re.sub(r"\s+", " ", text).strip()
-         return text
-
-     def tokenize(self, text: str) -> List[str]:
-         """
-         分词并移除停用词
-         """
-         words = jieba.lcut(text)
-         one_char_size = len([i for i in words if len(i) == 1])
-         all_size = len(words)
-         if all_size != 0 and one_char_size / all_size > 0.6:
-             words = [i for i in text.split() if i.strip()]
-
-         # 过滤停用词和空字符
-         words = [w for w in words if w not in self.stopwords and w.strip()]
-         return words
-
-     def create_minhash(self, words: List[str]) -> MinHash:
-         """
-         为分词结果创建 MinHash
-         """
-         minhash = MinHash(num_perm=self.num_perm)
-         for word in words:
-             minhash.update(word.encode("utf-8"))
-         return minhash
-
-     def add_document(self, text: str, doc_id: Optional[str] = None) -> str:
-         """
-         添加文档到索引
-
-         Args:
-             text: 文档文本
-             doc_id: 文档ID(可选)
-
-         Returns:
-             doc_id: 文档ID
-         """
-         if doc_id is None:
-             doc_id = f"doc_{self.doc_counter}"
-             self.doc_counter += 1
-
-         # 预处理和分词
-         processed_text = self.preprocess_text(text)
-         words = self.tokenize(processed_text)
-
-         # 创建 MinHash
-         minhash = self.create_minhash(words)
-         if self.is_raw_texts:
-             # 存储原始文本和 MinHash
-             self.raw_texts[doc_id] = text
-         # self.minhashes[doc_id] = minhash
-
-         # 添加到 LSH
-         self.lsh.insert(doc_id, minhash)
-
-         return doc_id
-
-     def batch_add_documents(self, texts: Dict[str, str]) -> None:
-         """
-         批量添加文档
-
-         Args:
-             texts: {doc_id: text} 的字典
-         """
-         for doc_id, text in texts.items():
-             self.add_document(text, doc_id)
-
-     def create_query_minhash(self, query: str):
-
-         # 预处理查询文本
-         processed_query = self.preprocess_text(query)
-         query_words = self.tokenize(processed_query)
-         # print(query_words)
-         query_minhash = self.create_minhash(query_words)
-         return query_minhash
-
-     def find_similar(self, query_minhash: MinHash, return_similarities: bool = False) -> Union[
-         List[str], List[Tuple[str, float]]]:
-         """
-         查找相似文档
-
-         Args:
-             query: 查询文本
-             return_similarities: 是否返回相似度分数
-
-         Returns:
-             如果 return_similarities 为 True,返回 [(doc_id, similarity), ...]
-             否则返回 [doc_id, ...]
-         """
-
-         # 使用 LSH 查找候选集
-         similar_docs = self.lsh.query(query_minhash)
-
-         # if return_similarities:
-         #     # 计算精确的 Jaccard 相似度
-         #     results = []
-         #     for doc_id in similar_docs:
-         #         similarity = query_minhash.jaccard(self.minhashes[doc_id])
-         #         results.append((doc_id, similarity))
-         #     # 按相似度降序排序
-         #     return sorted(results, key=lambda x: x[1], reverse=True)
-
-         return similar_docs
-
-     def get_text(self, doc_id: str) -> Optional[str]:
-         """获取原始文本"""
-         if self.is_raw_texts:
-             return self.raw_texts.get(doc_id)
-         raise Exception("没有开启存储")
-
-     def remove_document(self, doc_id: str) -> bool:
-         """
-         删除文档
-
-         Returns:
-             bool: 是否成功删除
-         """
-         # if doc_id not in self.minhashes:
-         #     return False
-
-         self.lsh.remove(doc_id)
-         # del self.minhashes[doc_id]
-         if self.is_raw_texts:
-             del self.raw_texts[doc_id]
-         return True
-
-     def clear(self) -> None:
-         """清空所有数据"""
-         self.lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
-         # self.minhashes.clear()
-         self.raw_texts.clear()
-         self.doc_counter = 0
-
-
- # 定义一个协议,描述“像鸭子一样”的行为
- class TokenizeDuckLike(Protocol):
-     def get_words(self, text) -> List:
-         pass
-
-
- class JiebaTokenize(object):
-
-     def __init__(self, stopwords=None):
-         self.stopwords = stopwords
-
-     def get_words(self, text) -> List:
-
-         if self.stopwords is None:
-             stopwords = []
-         words = jieba.lcut(text)
-
-         # 统计单字符数据 长度,防止结巴分词分不了的单词 将数据分为单个字符
-
-         # 这里为什么使用函数 而不是在推导式中兼容,主要是在一些 spark中 推导式的if 条件不遵循最短路径原则会将表达式当做一个整体算子
-         def is_singel_en(i):
-             if len(i) == 1 and not is_single_cjk_char(i):
-                 return True
-             return False
-
-         one_char_size = len([i for i in words if is_singel_en(i)])
-         all_size = len(words)
-         # 如果单字符个数超过一定比例 就直接用空格分词
-         if all_size != 0 and one_char_size / all_size > 0.6:
-             words = [i for i in text.split() if i.strip()]
-
-         # 过滤停用词和空字符
-         words = [w for w in words if w not in stopwords and w.strip()]
-         return words
-
-
- class TextMatcherV2(object):
-
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         num_perm: int = 128,
-         tdk: TokenizeDuckLike = None
-     ):
-         """
-         初始化文本匹配器
-
-         Args:
-             threshold: LSH 相似度阈值
-             num_perm: MinHash 排列数
-             stopwords_path: 停用词文件路径
-             user_dict_path: 用户自定义词典路径
-         """
-         self.threshold = threshold
-         self.num_perm = num_perm
-         self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
-         self.tdk = tdk
-
-     def add_document(self, doc_id: str, minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None):
-         if isinstance(minhash, str):
-             minhash = self.str_to_minihash(minhash, tdk)
-
-         self.lsh.insert(doc_id, minhash)
-
-     def batch_add_documents(self, betch_data: Union[list, dict], tdk: TokenizeDuckLike = None):
-         def _add_document(minhash_or_str, tdk):
-             if isinstance(minhash_or_str, str):
-                 minhash_or_str = self.str_to_minihash(minhash_or_str, tdk)
-             self.add_document(docid, minhash_or_str, tdk)
-
-         if isinstance(betch_data, list):
-             # 必须是可解包的2个数据的元组或list
-             for docid, minhash_or_str in betch_data:
-                 _add_document(minhash_or_str, tdk)
-         elif isinstance(betch_data, dict):
-             for docid, minhash_or_str in betch_data.items():
-                 _add_document(minhash_or_str, tdk)
-         else:
-             raise Exception("数据类型错误")
-
-     def find_similar(self, query_minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None) -> List[Hashable]:
-         # 使用 LSH 查找候选集
-         if isinstance(query_minhash, str):
-             query_minhash = self.str_to_minihash(query_minhash, tdk)
-         similar_docs = self.lsh.query(query_minhash)
-         return similar_docs
-
-     def create_minhash(self, words: List[str], num_perm=None) -> MinHash:
-         """
-         为分词结果创建 MinHash
-         """
-         if num_perm is None:
-             num_perm = self.num_perm
-         minhash = MinHash(num_perm=num_perm)
-         for word in words:
-             minhash.update(word.encode("utf-8"))
-         return minhash
-
-     def create_words(self, text: str, tdk: TokenizeDuckLike = None):
-         if tdk is None:
-             tdk = self.tdk
-         worlds = tdk.get_words(text)
-         return worlds
-
-     def str_to_minihash(self, text: str, tdk: TokenizeDuckLike = None):
-         if tdk is None:
-             tdk = self.tdk
-         words = self.create_words(text, tdk)
-         minhash = self.create_minhash(words, self.num_perm)
-         return minhash
-
-     def minhash_dumps(self, minhash) -> bytes:
-         """
-         序列化
-         """
-         serialized_minhash = pickle.dumps(minhash)
-         return serialized_minhash
-
-     def minhash_loads(self, serialized_minhash) -> MinHash:
-         """
-         反序列化
-         """
-         minhash = pickle.loads(serialized_minhash)
-         return minhash
-
-     def merge_other_minhashlsh(self, other_minhashlsh: MinHashLSH):
-         """
-         在其他地方创建好的lsh 合并进来
-         """
-         self.lsh.merge(other_minhashlsh)
+ import pickle
+
+ import jieba
+ import re
+ from typing import List, Dict, Tuple, Set, Optional, Union, Hashable, Protocol
+ from datasketch import MinHash, MinHashLSH
+
+ from re_common.v2.baselibrary.decorators.utils import deprecated
+ from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
+
+
+ @deprecated("请使用 TextMatcherV2 代替。")
+ class TextMatcher(object):
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         num_perm: int = 128,
+         is_raw_texts=True,
+         stopwords_path: Optional[str] = None,
+         user_dict_path: Optional[str] = None,
+
+     ):
+         """
+         初始化文本匹配器
+
+         Args:
+             threshold: LSH 相似度阈值
+             num_perm: MinHash 排列数
+             stopwords_path: 停用词文件路径
+             user_dict_path: 用户自定义词典路径
+         """
+         self.threshold = threshold
+         self.num_perm = num_perm
+         self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
+         # self.minhashes: Dict[str, MinHash] = {}
+         self.raw_texts: Dict[str, str] = {}
+         self.is_raw_texts = is_raw_texts
+         self.doc_counter = 0
+
+         # 加载停用词
+         self.stopwords: Set[str] = set()
+         if stopwords_path:
+             self.load_stopwords(stopwords_path)
+
+         # 加载用户词典
+         if user_dict_path:
+             jieba.load_userdict(user_dict_path)
+
+     def load_stopwords(self, stopwords_path: str) -> None:
+         """加载停用词"""
+         with open(stopwords_path, "r", encoding="utf-8") as f:
+             self.stopwords = set(line.strip() for line in f)
+
+     def preprocess_text(self, text: str) -> str:
+         """
+         文本预处理
+         """
+         # 转换为小写
+         text = text.lower()
+         # 移除特殊字符
+         text = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)
+         # 移除多余空格
+         text = re.sub(r"\s+", " ", text).strip()
+         return text
+
+     def tokenize(self, text: str) -> List[str]:
+         """
+         分词并移除停用词
+         """
+         words = jieba.lcut(text)
+         one_char_size = len([i for i in words if len(i) == 1])
+         all_size = len(words)
+         if all_size != 0 and one_char_size / all_size > 0.6:
+             words = [i for i in text.split() if i.strip()]
+
+         # 过滤停用词和空字符
+         words = [w for w in words if w not in self.stopwords and w.strip()]
+         return words
+
+     def create_minhash(self, words: List[str]) -> MinHash:
+         """
+         为分词结果创建 MinHash
+         """
+         minhash = MinHash(num_perm=self.num_perm)
+         for word in words:
+             minhash.update(word.encode("utf-8"))
+         return minhash
+
+     def add_document(self, text: str, doc_id: Optional[str] = None) -> str:
+         """
+         添加文档到索引
+
+         Args:
+             text: 文档文本
+             doc_id: 文档ID(可选)
+
+         Returns:
+             doc_id: 文档ID
+         """
+         if doc_id is None:
+             doc_id = f"doc_{self.doc_counter}"
+             self.doc_counter += 1
+
+         # 预处理和分词
+         processed_text = self.preprocess_text(text)
+         words = self.tokenize(processed_text)
+
+         # 创建 MinHash
+         minhash = self.create_minhash(words)
+         if self.is_raw_texts:
+             # 存储原始文本和 MinHash
+             self.raw_texts[doc_id] = text
+         # self.minhashes[doc_id] = minhash
+
+         # 添加到 LSH
+         self.lsh.insert(doc_id, minhash)
+
+         return doc_id
+
+     def batch_add_documents(self, texts: Dict[str, str]) -> None:
+         """
+         批量添加文档
+
+         Args:
+             texts: {doc_id: text} 的字典
+         """
+         for doc_id, text in texts.items():
+             self.add_document(text, doc_id)
+
+     def create_query_minhash(self, query: str):
+
+         # 预处理查询文本
+         processed_query = self.preprocess_text(query)
+         query_words = self.tokenize(processed_query)
+         # print(query_words)
+         query_minhash = self.create_minhash(query_words)
+         return query_minhash
+
+     def find_similar(self, query_minhash: MinHash, return_similarities: bool = False) -> Union[
+         List[str], List[Tuple[str, float]]]:
+         """
+         查找相似文档
+
+         Args:
+             query: 查询文本
+             return_similarities: 是否返回相似度分数
+
+         Returns:
+             如果 return_similarities 为 True,返回 [(doc_id, similarity), ...]
+             否则返回 [doc_id, ...]
+         """
+
+         # 使用 LSH 查找候选集
+         similar_docs = self.lsh.query(query_minhash)
+
+         # if return_similarities:
+         #     # 计算精确的 Jaccard 相似度
+         #     results = []
+         #     for doc_id in similar_docs:
+         #         similarity = query_minhash.jaccard(self.minhashes[doc_id])
+         #         results.append((doc_id, similarity))
+         #     # 按相似度降序排序
+         #     return sorted(results, key=lambda x: x[1], reverse=True)
+
+         return similar_docs
+
+     def get_text(self, doc_id: str) -> Optional[str]:
+         """获取原始文本"""
+         if self.is_raw_texts:
+             return self.raw_texts.get(doc_id)
+         raise Exception("没有开启存储")
+
+     def remove_document(self, doc_id: str) -> bool:
+         """
+         删除文档
+
+         Returns:
+             bool: 是否成功删除
+         """
+         # if doc_id not in self.minhashes:
+         #     return False
+
+         self.lsh.remove(doc_id)
+         # del self.minhashes[doc_id]
+         if self.is_raw_texts:
+             del self.raw_texts[doc_id]
+         return True
+
+     def clear(self) -> None:
+         """清空所有数据"""
+         self.lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
+         # self.minhashes.clear()
+         self.raw_texts.clear()
+         self.doc_counter = 0
+
+
+ # 定义一个协议,描述“像鸭子一样”的行为
+ class TokenizeDuckLike(Protocol):
+     def get_words(self, text) -> List:
+         pass
+
+
+ class JiebaTokenize(object):
+
+     def __init__(self, stopwords=None):
+         self.stopwords = stopwords
+
+     def get_words(self, text) -> List:
+
+         if self.stopwords is None:
+             stopwords = []
+         words = jieba.lcut(text)
+
+         # 统计单字符数据 长度,防止结巴分词分不了的单词 将数据分为单个字符
+
+         # 这里为什么使用函数 而不是在推导式中兼容,主要是在一些 spark中 推导式的if 条件不遵循最短路径原则会将表达式当做一个整体算子
+         def is_singel_en(i):
+             if len(i) == 1 and not is_single_cjk_char(i):
+                 return True
+             return False
+
+         one_char_size = len([i for i in words if is_singel_en(i)])
+         all_size = len(words)
+         # 如果单字符个数超过一定比例 就直接用空格分词
+         if all_size != 0 and one_char_size / all_size > 0.6:
+             words = [i for i in text.split() if i.strip()]
+
+         # 过滤停用词和空字符
+         words = [w for w in words if w not in stopwords and w.strip()]
+         return words
+
+
+ class TextMatcherV2(object):
+
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         num_perm: int = 128,
+         tdk: TokenizeDuckLike = None
+     ):
+         """
+         初始化文本匹配器
+
+         Args:
+             threshold: LSH 相似度阈值
+             num_perm: MinHash 排列数
+             stopwords_path: 停用词文件路径
+             user_dict_path: 用户自定义词典路径
+         """
+         self.threshold = threshold
+         self.num_perm = num_perm
+         self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
+         self.tdk = tdk
+
+     def add_document(self, doc_id: str, minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None):
+         if isinstance(minhash, str):
+             minhash = self.str_to_minihash(minhash, tdk)
+
+         self.lsh.insert(doc_id, minhash)
+
+     def batch_add_documents(self, betch_data: Union[list, dict], tdk: TokenizeDuckLike = None):
+         def _add_document(minhash_or_str, tdk):
+             if isinstance(minhash_or_str, str):
+                 minhash_or_str = self.str_to_minihash(minhash_or_str, tdk)
+             self.add_document(docid, minhash_or_str, tdk)
+
+         if isinstance(betch_data, list):
+             # 必须是可解包的2个数据的元组或list
+             for docid, minhash_or_str in betch_data:
+                 _add_document(minhash_or_str, tdk)
+         elif isinstance(betch_data, dict):
+             for docid, minhash_or_str in betch_data.items():
+                 _add_document(minhash_or_str, tdk)
+         else:
+             raise Exception("数据类型错误")
+
+     def find_similar(self, query_minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None) -> List[Hashable]:
+         # 使用 LSH 查找候选集
+         if isinstance(query_minhash, str):
+             query_minhash = self.str_to_minihash(query_minhash, tdk)
+         similar_docs = self.lsh.query(query_minhash)
+         return similar_docs
+
+     def create_minhash(self, words: List[str], num_perm=None) -> MinHash:
+         """
+         为分词结果创建 MinHash
+         """
+         if num_perm is None:
+             num_perm = self.num_perm
+         minhash = MinHash(num_perm=num_perm)
+         for word in words:
+             minhash.update(word.encode("utf-8"))
+         return minhash
+
+     def create_words(self, text: str, tdk: TokenizeDuckLike = None):
+         if tdk is None:
+             tdk = self.tdk
+         worlds = tdk.get_words(text)
+         return worlds
+
+     def str_to_minihash(self, text: str, tdk: TokenizeDuckLike = None):
+         if tdk is None:
+             tdk = self.tdk
+         words = self.create_words(text, tdk)
+         minhash = self.create_minhash(words, self.num_perm)
+         return minhash
+
+     def minhash_dumps(self, minhash) -> bytes:
+         """
+         序列化
+         """
+         serialized_minhash = pickle.dumps(minhash)
+         return serialized_minhash
+
+     def minhash_loads(self, serialized_minhash) -> MinHash:
+         """
+         反序列化
+         """
+         minhash = pickle.loads(serialized_minhash)
+         return minhash
+
+     def merge_other_minhashlsh(self, other_minhashlsh: MinHashLSH):
+         """
+         在其他地方创建好的lsh 合并进来
+         """
+         self.lsh.merge(other_minhashlsh)
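
The hunk above appears to correspond to re_common/v2/baselibrary/tools/text_matcher.py (entry 156 in the file list, +326 -326) and ships the TextMatcherV2 / JiebaTokenize API. Below is a minimal usage sketch of that API as it appears in the diff, assuming jieba and datasketch are installed; the document IDs and sample strings are illustrative only, not taken from the package.

from re_common.v2.baselibrary.tools.text_matcher import TextMatcherV2, JiebaTokenize

# Tokenizer with the default stopwords=None; TextMatcherV2 falls back to this
# tokenizer whenever a raw string is passed instead of a prebuilt MinHash.
matcher = TextMatcherV2(threshold=0.5, num_perm=128, tdk=JiebaTokenize())

# Raw strings are tokenized and hashed internally via str_to_minihash().
matcher.batch_add_documents({
    "doc_1": "MinHash LSH text similarity matching",
    "doc_2": "a completely unrelated sentence",
})

# Query with a raw string; returns the doc_ids of the LSH candidate set.
print(matcher.find_similar("MinHash LSH similarity matching"))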