re-common 10.0.39-py3-none-any.whl → 10.0.40-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  172. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  173. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  174. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  175. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  176. re_common/v2/baselibrary/utils/basedict.py +37 -37
  177. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  178. re_common/v2/baselibrary/utils/basepika.py +180 -180
  179. re_common/v2/baselibrary/utils/basetime.py +77 -77
  180. re_common/v2/baselibrary/utils/db.py +156 -156
  181. re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
  182. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  183. re_common/v2/baselibrary/utils/mq.py +83 -83
  184. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  185. re_common/v2/baselibrary/utils/string_bool.py +186 -186
  186. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  187. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  188. re_common/v2/baselibrary/utils/stringutils.py +271 -271
  189. re_common/vip/base_step_process.py +11 -11
  190. re_common/vip/baseencodeid.py +90 -90
  191. re_common/vip/changetaskname.py +28 -28
  192. re_common/vip/core_var.py +24 -24
  193. re_common/vip/mmh3Hash.py +89 -89
  194. re_common/vip/proxy/allproxys.py +127 -127
  195. re_common/vip/proxy/allproxys_thread.py +159 -159
  196. re_common/vip/proxy/cnki_proxy.py +153 -153
  197. re_common/vip/proxy/kuaidaili.py +87 -87
  198. re_common/vip/proxy/proxy_all.py +113 -113
  199. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  200. re_common/vip/proxy/wanfang_proxy.py +152 -152
  201. re_common/vip/proxy/wp_proxy_all.py +181 -181
  202. re_common/vip/read_rawid_to_txt.py +91 -91
  203. re_common/vip/title/__init__.py +5 -5
  204. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  205. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  206. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  207. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  208. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  209. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  210. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  211. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  212. re_common/vip/title/transform/__init__.py +10 -10
  213. {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/LICENSE +201 -201
  214. {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/METADATA +24 -16
  215. re_common-10.0.40.dist-info/RECORD +249 -0
  216. {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/WHEEL +1 -1
  217. re_common-10.0.39.dist-info/RECORD +0 -248
  218. {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/top_level.txt +0 -0
re_common/v2/baselibrary/tools/text_matcher.py
@@ -1,326 +1,326 @@
- import pickle
-
- import jieba
- import re
- from typing import List, Dict, Tuple, Set, Optional, Union, Hashable, Protocol
- from datasketch import MinHash, MinHashLSH
-
- from re_common.v2.baselibrary.decorators.utils import deprecated
- from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
-
-
- @deprecated("Please use TextMatcherV2 instead.")
- class TextMatcher(object):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         num_perm: int = 128,
-         is_raw_texts=True,
-         stopwords_path: Optional[str] = None,
-         user_dict_path: Optional[str] = None,
-     ):
-         """
-         Initialize the text matcher.
-
-         Args:
-             threshold: LSH similarity threshold
-             num_perm: number of MinHash permutations
-             is_raw_texts: whether to keep the original texts in memory
-             stopwords_path: path to a stopwords file
-             user_dict_path: path to a user-defined jieba dictionary
-         """
-         self.threshold = threshold
-         self.num_perm = num_perm
-         self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
-         # self.minhashes: Dict[str, MinHash] = {}
-         self.raw_texts: Dict[str, str] = {}
-         self.is_raw_texts = is_raw_texts
-         self.doc_counter = 0
-
-         # Load stopwords
-         self.stopwords: Set[str] = set()
-         if stopwords_path:
-             self.load_stopwords(stopwords_path)
-
-         # Load the user dictionary
-         if user_dict_path:
-             jieba.load_userdict(user_dict_path)
-
-     def load_stopwords(self, stopwords_path: str) -> None:
-         """Load stopwords."""
-         with open(stopwords_path, "r", encoding="utf-8") as f:
-             self.stopwords = set(line.strip() for line in f)
-
-     def preprocess_text(self, text: str) -> str:
-         """
-         Preprocess text.
-         """
-         # Convert to lowercase
-         text = text.lower()
-         # Remove special characters
-         text = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)
-         # Collapse extra whitespace
-         text = re.sub(r"\s+", " ", text).strip()
-         return text
-
-     def tokenize(self, text: str) -> List[str]:
-         """
-         Tokenize and remove stopwords.
-         """
-         words = jieba.lcut(text)
-         one_char_size = len([i for i in words if len(i) == 1])
-         all_size = len(words)
-         if all_size != 0 and one_char_size / all_size > 0.6:
-             words = [i for i in text.split() if i.strip()]
-
-         # Filter out stopwords and empty strings
-         words = [w for w in words if w not in self.stopwords and w.strip()]
-         return words
-
-     def create_minhash(self, words: List[str]) -> MinHash:
-         """
-         Create a MinHash for the tokenized words.
-         """
-         minhash = MinHash(num_perm=self.num_perm)
-         for word in words:
-             minhash.update(word.encode("utf-8"))
-         return minhash
-
-     def add_document(self, text: str, doc_id: Optional[str] = None) -> str:
-         """
-         Add a document to the index.
-
-         Args:
-             text: document text
-             doc_id: document ID (optional)
-
-         Returns:
-             doc_id: document ID
-         """
-         if doc_id is None:
-             doc_id = f"doc_{self.doc_counter}"
-             self.doc_counter += 1
-
-         # Preprocess and tokenize
-         processed_text = self.preprocess_text(text)
-         words = self.tokenize(processed_text)
-
-         # Create the MinHash
-         minhash = self.create_minhash(words)
-         if self.is_raw_texts:
-             # Store the original text
-             self.raw_texts[doc_id] = text
-         # self.minhashes[doc_id] = minhash
-
-         # Add to the LSH index
-         self.lsh.insert(doc_id, minhash)
-
-         return doc_id
-
-     def batch_add_documents(self, texts: Dict[str, str]) -> None:
-         """
-         Add documents in batch.
-
-         Args:
-             texts: a {doc_id: text} dict
-         """
-         for doc_id, text in texts.items():
-             self.add_document(text, doc_id)
-
-     def create_query_minhash(self, query: str):
-         # Preprocess the query text
-         processed_query = self.preprocess_text(query)
-         query_words = self.tokenize(processed_query)
-         # print(query_words)
-         query_minhash = self.create_minhash(query_words)
-         return query_minhash
-
-     def find_similar(self, query_minhash: MinHash, return_similarities: bool = False) -> Union[
-             List[str], List[Tuple[str, float]]]:
-         """
-         Find similar documents.
-
-         Args:
-             query_minhash: MinHash of the query text
-             return_similarities: whether to return similarity scores
-
-         Returns:
-             [(doc_id, similarity), ...] if return_similarities is True,
-             otherwise [doc_id, ...]
-         """
-         # Query the LSH index for candidates
-         similar_docs = self.lsh.query(query_minhash)
-
-         # if return_similarities:
-         #     # Compute exact Jaccard similarities
-         #     results = []
-         #     for doc_id in similar_docs:
-         #         similarity = query_minhash.jaccard(self.minhashes[doc_id])
-         #         results.append((doc_id, similarity))
-         #     # Sort by similarity in descending order
-         #     return sorted(results, key=lambda x: x[1], reverse=True)
-
-         return similar_docs
-
-     def get_text(self, doc_id: str) -> Optional[str]:
-         """Get the original text."""
-         if self.is_raw_texts:
-             return self.raw_texts.get(doc_id)
-         raise Exception("Raw text storage is not enabled")
-
-     def remove_document(self, doc_id: str) -> bool:
-         """
-         Remove a document.
-
-         Returns:
-             bool: whether the removal succeeded
-         """
-         # if doc_id not in self.minhashes:
-         #     return False
-
-         self.lsh.remove(doc_id)
-         # del self.minhashes[doc_id]
-         if self.is_raw_texts:
-             del self.raw_texts[doc_id]
-         return True
-
-     def clear(self) -> None:
-         """Clear all data."""
-         self.lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
-         # self.minhashes.clear()
-         self.raw_texts.clear()
-         self.doc_counter = 0
-
-
- # A protocol describing the required "duck-like" tokenizer behaviour
- class TokenizeDuckLike(Protocol):
-     def get_words(self, text) -> List:
-         pass
-
-
- class JiebaTokenize(object):
-
-     def __init__(self, stopwords=None):
-         self.stopwords = stopwords
-
-     def get_words(self, text) -> List:
-         stopwords = self.stopwords if self.stopwords is not None else []
-         words = jieba.lcut(text)
-
-         # Count single-character tokens to guard against jieba splitting
-         # unknown words into individual characters.
-
-         # A named function is used instead of inlining the test in the
-         # comprehension because in some Spark contexts the comprehension's
-         # if-condition is not short-circuited and is treated as a single operator.
-         def is_singel_en(i):
-             if len(i) == 1 and not is_single_cjk_char(i):
-                 return True
-             return False
-
-         one_char_size = len([i for i in words if is_singel_en(i)])
-         all_size = len(words)
-         # If the proportion of single characters is too high, fall back to splitting on whitespace
-         if all_size != 0 and one_char_size / all_size > 0.6:
-             words = [i for i in text.split() if i.strip()]
-
-         # Filter out stopwords and empty strings
-         words = [w for w in words if w not in stopwords and w.strip()]
-         return words
-
-
- class TextMatcherV2(object):
-
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         num_perm: int = 128,
-         tdk: TokenizeDuckLike = None
-     ):
-         """
-         Initialize the text matcher.
-
-         Args:
-             threshold: LSH similarity threshold
-             num_perm: number of MinHash permutations
-             tdk: tokenizer implementing the TokenizeDuckLike protocol
-         """
-         self.threshold = threshold
-         self.num_perm = num_perm
-         self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
-         self.tdk = tdk
-
-     def add_document(self, doc_id: str, minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None):
-         if isinstance(minhash, str):
-             minhash = self.str_to_minihash(minhash, tdk)
-
-         self.lsh.insert(doc_id, minhash)
-
-     def batch_add_documents(self, betch_data: Union[list, dict], tdk: TokenizeDuckLike = None):
-         def _add_document(minhash_or_str, tdk):
-             if isinstance(minhash_or_str, str):
-                 minhash_or_str = self.str_to_minihash(minhash_or_str, tdk)
-             self.add_document(docid, minhash_or_str, tdk)
-
-         if isinstance(betch_data, list):
-             # Each item must be an unpackable (doc_id, minhash_or_str) tuple or list
-             for docid, minhash_or_str in betch_data:
-                 _add_document(minhash_or_str, tdk)
-         elif isinstance(betch_data, dict):
-             for docid, minhash_or_str in betch_data.items():
-                 _add_document(minhash_or_str, tdk)
-         else:
-             raise Exception("Unsupported data type")
-
-     def find_similar(self, query_minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None) -> List[Hashable]:
-         # Query the LSH index for candidates
-         if isinstance(query_minhash, str):
-             query_minhash = self.str_to_minihash(query_minhash, tdk)
-         similar_docs = self.lsh.query(query_minhash)
-         return similar_docs
-
-     def create_minhash(self, words: List[str], num_perm=None) -> MinHash:
-         """
-         Create a MinHash for the tokenized words.
-         """
-         if num_perm is None:
-             num_perm = self.num_perm
-         minhash = MinHash(num_perm=num_perm)
-         for word in words:
-             minhash.update(word.encode("utf-8"))
-         return minhash
-
-     def create_words(self, text: str, tdk: TokenizeDuckLike = None):
-         if tdk is None:
-             tdk = self.tdk
-         worlds = tdk.get_words(text)
-         return worlds
-
-     def str_to_minihash(self, text: str, tdk: TokenizeDuckLike = None):
-         if tdk is None:
-             tdk = self.tdk
-         words = self.create_words(text, tdk)
-         minhash = self.create_minhash(words, self.num_perm)
-         return minhash
-
-     def minhash_dumps(self, minhash) -> bytes:
-         """
-         Serialize a MinHash.
-         """
-         serialized_minhash = pickle.dumps(minhash)
-         return serialized_minhash
-
-     def minhash_loads(self, serialized_minhash) -> MinHash:
-         """
-         Deserialize a MinHash.
-         """
-         minhash = pickle.loads(serialized_minhash)
-         return minhash
-
-     def merge_other_minhashlsh(self, other_minhashlsh: MinHashLSH):
-         """
-         Merge an LSH index created elsewhere into this one.
-         """
-         self.lsh.merge(other_minhashlsh)
+ import pickle
+
+ import jieba
+ import re
+ from typing import List, Dict, Tuple, Set, Optional, Union, Hashable, Protocol
+ from datasketch import MinHash, MinHashLSH
+
+ from re_common.v2.baselibrary.decorators.utils import deprecated
+ from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
+
+
+ @deprecated("Please use TextMatcherV2 instead.")
+ class TextMatcher(object):
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         num_perm: int = 128,
+         is_raw_texts=True,
+         stopwords_path: Optional[str] = None,
+         user_dict_path: Optional[str] = None,
+     ):
+         """
+         Initialize the text matcher.
+
+         Args:
+             threshold: LSH similarity threshold
+             num_perm: number of MinHash permutations
+             is_raw_texts: whether to keep the original texts in memory
+             stopwords_path: path to a stopwords file
+             user_dict_path: path to a user-defined jieba dictionary
+         """
+         self.threshold = threshold
+         self.num_perm = num_perm
+         self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
+         # self.minhashes: Dict[str, MinHash] = {}
+         self.raw_texts: Dict[str, str] = {}
+         self.is_raw_texts = is_raw_texts
+         self.doc_counter = 0
+
+         # Load stopwords
+         self.stopwords: Set[str] = set()
+         if stopwords_path:
+             self.load_stopwords(stopwords_path)
+
+         # Load the user dictionary
+         if user_dict_path:
+             jieba.load_userdict(user_dict_path)
+
+     def load_stopwords(self, stopwords_path: str) -> None:
+         """Load stopwords."""
+         with open(stopwords_path, "r", encoding="utf-8") as f:
+             self.stopwords = set(line.strip() for line in f)
+
+     def preprocess_text(self, text: str) -> str:
+         """
+         Preprocess text.
+         """
+         # Convert to lowercase
+         text = text.lower()
+         # Remove special characters
+         text = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)
+         # Collapse extra whitespace
+         text = re.sub(r"\s+", " ", text).strip()
+         return text
+
+     def tokenize(self, text: str) -> List[str]:
+         """
+         Tokenize and remove stopwords.
+         """
+         words = jieba.lcut(text)
+         one_char_size = len([i for i in words if len(i) == 1])
+         all_size = len(words)
+         if all_size != 0 and one_char_size / all_size > 0.6:
+             words = [i for i in text.split() if i.strip()]
+
+         # Filter out stopwords and empty strings
+         words = [w for w in words if w not in self.stopwords and w.strip()]
+         return words
+
+     def create_minhash(self, words: List[str]) -> MinHash:
+         """
+         Create a MinHash for the tokenized words.
+         """
+         minhash = MinHash(num_perm=self.num_perm)
+         for word in words:
+             minhash.update(word.encode("utf-8"))
+         return minhash
+
+     def add_document(self, text: str, doc_id: Optional[str] = None) -> str:
+         """
+         Add a document to the index.
+
+         Args:
+             text: document text
+             doc_id: document ID (optional)
+
+         Returns:
+             doc_id: document ID
+         """
+         if doc_id is None:
+             doc_id = f"doc_{self.doc_counter}"
+             self.doc_counter += 1
+
+         # Preprocess and tokenize
+         processed_text = self.preprocess_text(text)
+         words = self.tokenize(processed_text)
+
+         # Create the MinHash
+         minhash = self.create_minhash(words)
+         if self.is_raw_texts:
+             # Store the original text
+             self.raw_texts[doc_id] = text
+         # self.minhashes[doc_id] = minhash
+
+         # Add to the LSH index
+         self.lsh.insert(doc_id, minhash)
+
+         return doc_id
+
+     def batch_add_documents(self, texts: Dict[str, str]) -> None:
+         """
+         Add documents in batch.
+
+         Args:
+             texts: a {doc_id: text} dict
+         """
+         for doc_id, text in texts.items():
+             self.add_document(text, doc_id)
+
+     def create_query_minhash(self, query: str):
+         # Preprocess the query text
+         processed_query = self.preprocess_text(query)
+         query_words = self.tokenize(processed_query)
+         # print(query_words)
+         query_minhash = self.create_minhash(query_words)
+         return query_minhash
+
+     def find_similar(self, query_minhash: MinHash, return_similarities: bool = False) -> Union[
+             List[str], List[Tuple[str, float]]]:
+         """
+         Find similar documents.
+
+         Args:
+             query_minhash: MinHash of the query text
+             return_similarities: whether to return similarity scores
+
+         Returns:
+             [(doc_id, similarity), ...] if return_similarities is True,
+             otherwise [doc_id, ...]
+         """
+         # Query the LSH index for candidates
+         similar_docs = self.lsh.query(query_minhash)
+
+         # if return_similarities:
+         #     # Compute exact Jaccard similarities
+         #     results = []
+         #     for doc_id in similar_docs:
+         #         similarity = query_minhash.jaccard(self.minhashes[doc_id])
+         #         results.append((doc_id, similarity))
+         #     # Sort by similarity in descending order
+         #     return sorted(results, key=lambda x: x[1], reverse=True)
+
+         return similar_docs
+
+     def get_text(self, doc_id: str) -> Optional[str]:
+         """Get the original text."""
+         if self.is_raw_texts:
+             return self.raw_texts.get(doc_id)
+         raise Exception("Raw text storage is not enabled")
+
+     def remove_document(self, doc_id: str) -> bool:
+         """
+         Remove a document.
+
+         Returns:
+             bool: whether the removal succeeded
+         """
+         # if doc_id not in self.minhashes:
+         #     return False
+
+         self.lsh.remove(doc_id)
+         # del self.minhashes[doc_id]
+         if self.is_raw_texts:
+             del self.raw_texts[doc_id]
+         return True
+
+     def clear(self) -> None:
+         """Clear all data."""
+         self.lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
+         # self.minhashes.clear()
+         self.raw_texts.clear()
+         self.doc_counter = 0
+
+
+ # A protocol describing the required "duck-like" tokenizer behaviour
+ class TokenizeDuckLike(Protocol):
+     def get_words(self, text) -> List:
+         pass
+
+
+ class JiebaTokenize(object):
+
+     def __init__(self, stopwords=None):
+         self.stopwords = stopwords
+
+     def get_words(self, text) -> List:
+         stopwords = self.stopwords if self.stopwords is not None else []
+         words = jieba.lcut(text)
+
+         # Count single-character tokens to guard against jieba splitting
+         # unknown words into individual characters.
+
+         # A named function is used instead of inlining the test in the
+         # comprehension because in some Spark contexts the comprehension's
+         # if-condition is not short-circuited and is treated as a single operator.
+         def is_singel_en(i):
+             if len(i) == 1 and not is_single_cjk_char(i):
+                 return True
+             return False
+
+         one_char_size = len([i for i in words if is_singel_en(i)])
+         all_size = len(words)
+         # If the proportion of single characters is too high, fall back to splitting on whitespace
+         if all_size != 0 and one_char_size / all_size > 0.6:
+             words = [i for i in text.split() if i.strip()]
+
+         # Filter out stopwords and empty strings
+         words = [w for w in words if w not in stopwords and w.strip()]
+         return words
+
+
+ class TextMatcherV2(object):
+
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         num_perm: int = 128,
+         tdk: TokenizeDuckLike = None
+     ):
+         """
+         Initialize the text matcher.
+
+         Args:
+             threshold: LSH similarity threshold
+             num_perm: number of MinHash permutations
+             tdk: tokenizer implementing the TokenizeDuckLike protocol
+         """
+         self.threshold = threshold
+         self.num_perm = num_perm
+         self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
+         self.tdk = tdk
+
+     def add_document(self, doc_id: str, minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None):
+         if isinstance(minhash, str):
+             minhash = self.str_to_minihash(minhash, tdk)
+
+         self.lsh.insert(doc_id, minhash)
+
+     def batch_add_documents(self, betch_data: Union[list, dict], tdk: TokenizeDuckLike = None):
+         def _add_document(minhash_or_str, tdk):
+             if isinstance(minhash_or_str, str):
+                 minhash_or_str = self.str_to_minihash(minhash_or_str, tdk)
+             self.add_document(docid, minhash_or_str, tdk)
+
+         if isinstance(betch_data, list):
+             # Each item must be an unpackable (doc_id, minhash_or_str) tuple or list
+             for docid, minhash_or_str in betch_data:
+                 _add_document(minhash_or_str, tdk)
+         elif isinstance(betch_data, dict):
+             for docid, minhash_or_str in betch_data.items():
+                 _add_document(minhash_or_str, tdk)
+         else:
+             raise Exception("Unsupported data type")
+
+     def find_similar(self, query_minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None) -> List[Hashable]:
+         # Query the LSH index for candidates
+         if isinstance(query_minhash, str):
+             query_minhash = self.str_to_minihash(query_minhash, tdk)
+         similar_docs = self.lsh.query(query_minhash)
+         return similar_docs
+
+     def create_minhash(self, words: List[str], num_perm=None) -> MinHash:
+         """
+         Create a MinHash for the tokenized words.
+         """
+         if num_perm is None:
+             num_perm = self.num_perm
+         minhash = MinHash(num_perm=num_perm)
+         for word in words:
+             minhash.update(word.encode("utf-8"))
+         return minhash
+
+     def create_words(self, text: str, tdk: TokenizeDuckLike = None):
+         if tdk is None:
+             tdk = self.tdk
+         worlds = tdk.get_words(text)
+         return worlds
+
+     def str_to_minihash(self, text: str, tdk: TokenizeDuckLike = None):
+         if tdk is None:
+             tdk = self.tdk
+         words = self.create_words(text, tdk)
+         minhash = self.create_minhash(words, self.num_perm)
+         return minhash
+
+     def minhash_dumps(self, minhash) -> bytes:
+         """
+         Serialize a MinHash.
+         """
+         serialized_minhash = pickle.dumps(minhash)
+         return serialized_minhash
+
+     def minhash_loads(self, serialized_minhash) -> MinHash:
+         """
+         Deserialize a MinHash.
+         """
+         minhash = pickle.loads(serialized_minhash)
+         return minhash
+
+     def merge_other_minhashlsh(self, other_minhashlsh: MinHashLSH):
+         """
+         Merge an LSH index created elsewhere into this one.
+         """
+         self.lsh.merge(other_minhashlsh)
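
For readers skimming this diff, the short sketch below illustrates how the TextMatcherV2 / JiebaTokenize API shown above can be driven: texts are tokenized, turned into MinHash signatures, indexed in the LSH structure, and then queried for near-duplicates. It is not part of the package; the document IDs and sample strings are made up, and it assumes jieba and datasketch are installed.

# Illustrative usage sketch only -- not shipped with re_common.
from re_common.v2.baselibrary.tools.text_matcher import JiebaTokenize, TextMatcherV2

# Tokenizer with no explicit stopword list; TextMatcherV2 delegates all
# tokenization to it through the TokenizeDuckLike protocol.
matcher = TextMatcherV2(threshold=0.5, num_perm=128, tdk=JiebaTokenize())

# batch_add_documents accepts a {doc_id: text_or_minhash} dict or a list of
# (doc_id, text_or_minhash) pairs; plain strings are hashed internally.
matcher.batch_add_documents({
    "doc_1": "minhash based large scale text deduplication",
    "doc_2": "large scale text deduplication with minhash",
    "doc_3": "an unrelated note about database connection pooling",
})

# find_similar also accepts a raw string and converts it with the same tokenizer.
candidates = matcher.find_similar("minhash based large scale text deduplication")
print(candidates)  # expected to contain "doc_1", and likely "doc_2"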