re-common 10.0.37__py3-none-any.whl → 10.0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217) hide show
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  172. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  173. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  174. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  175. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  176. re_common/v2/baselibrary/utils/basedict.py +37 -37
  177. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  178. re_common/v2/baselibrary/utils/basepika.py +180 -180
  179. re_common/v2/baselibrary/utils/basetime.py +77 -77
  180. re_common/v2/baselibrary/utils/db.py +156 -156
  181. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  182. re_common/v2/baselibrary/utils/mq.py +83 -83
  183. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  184. re_common/v2/baselibrary/utils/string_bool.py +186 -186
  185. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  186. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  187. re_common/v2/baselibrary/utils/stringutils.py +271 -278
  188. re_common/vip/base_step_process.py +11 -11
  189. re_common/vip/baseencodeid.py +90 -90
  190. re_common/vip/changetaskname.py +28 -28
  191. re_common/vip/core_var.py +24 -24
  192. re_common/vip/mmh3Hash.py +89 -89
  193. re_common/vip/proxy/allproxys.py +127 -127
  194. re_common/vip/proxy/allproxys_thread.py +159 -159
  195. re_common/vip/proxy/cnki_proxy.py +153 -153
  196. re_common/vip/proxy/kuaidaili.py +87 -87
  197. re_common/vip/proxy/proxy_all.py +113 -113
  198. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  199. re_common/vip/proxy/wanfang_proxy.py +152 -152
  200. re_common/vip/proxy/wp_proxy_all.py +181 -181
  201. re_common/vip/read_rawid_to_txt.py +91 -91
  202. re_common/vip/title/__init__.py +5 -5
  203. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  204. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  205. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  206. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  207. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  208. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  209. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  210. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  211. re_common/vip/title/transform/__init__.py +10 -10
  212. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/LICENSE +201 -201
  213. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/METADATA +16 -16
  214. re_common-10.0.39.dist-info/RECORD +248 -0
  215. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/WHEEL +1 -1
  216. re_common-10.0.37.dist-info/RECORD +0 -248
  217. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/top_level.txt +0 -0
@@ -1,95 +1,95 @@
1
- import sqlite3
2
- import requests
3
- import pandas as pd
4
- import os
5
- import json
6
- import traceback
7
-
8
-
9
- # c1d3a814-1a02-4bbd-b5c2-f756fef92cb8: b层机器人消息群-非聊天 的 pythonspark
10
- # 013547da-3d78-4a7f-b4a7-e668b192c293: b层机器人消息群-非聊天 的 数仓B层服务端部署通知
11
-
12
- # 发送消息到企业微信机器人
13
- # vx_key: string类型,自己的企业微信机器人的key
14
- # s:string类型,要发送的消息
15
- def send_vx(vx_key, s, i=0):
16
- vx_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=' + vx_key
17
- headers = {"Content-Type": "text/plain"}
18
- data = {
19
- "msgtype": "text",
20
- "text": {
21
- "content": s,
22
- }
23
- }
24
- if i > 3:
25
- raise Exception(str(traceback.format_exc()))
26
- try:
27
- requests.post(url=vx_url, headers=headers, json=data, timeout=30)
28
- except:
29
- i = i + 1
30
- send_vx(vx_key, str(traceback.format_exc()), i)
31
-
32
-
33
- # 发送文件到企业微信机器人
34
- # vx_key: string类型,自己的企业微信机器人的key
35
- # file_path: string类型,文件地址
36
- def post_file(vx_key, file_path):
37
- id_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/upload_media?key=' + vx_key + '&type=file'
38
- wx_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=' + vx_key
39
- data = {'file': open(file_path, 'rb')}
40
- response = requests.post(url=id_url, files=data)
41
- json_res = response.json()
42
- media_id = json_res['media_id']
43
- data = {"msgtype": "file",
44
- "file": {"media_id": media_id}
45
- }
46
- try:
47
- requests.post(url=wx_url, json=data)
48
- except:
49
- send_vx(send_vx, str(traceback.format_exc()))
50
-
51
-
52
- # data: dataframe|string|dict|list|tuple|array
53
- # file_name: 带后缀的完整文件名
54
- # file_type: 文件类型,包括csv、excel、txt、json、sql
55
- def file_to_vx(vx_key, data, file_name):
56
- file_type = file_name.split('.')[-1]
57
- if file_type == "xls" or file_type == "xlsx":
58
- file_type = "excel"
59
- current_dir = os.getcwd()
60
- temp_dir = os.path.join(os.getcwd(), "tmp")
61
- if os.path.exists(temp_dir):
62
- pass
63
- else:
64
- os.makedirs(temp_dir)
65
- file_path = current_dir + "/" + file_name
66
- try:
67
- if isinstance(data, pd.DataFrame) and file_type != "txt":
68
- if file_type == "db3":
69
- conn = sqlite3.connect(file_path)
70
- data.to_sql('base_table', conn, if_exists='replace', index=False)
71
- post_file(vx_key, file_path)
72
- else:
73
- code_str = "data.to_" + file_type + "(file_path,index=False)"
74
- eval(code_str)
75
- post_file(vx_key, file_path)
76
- os.system('rm ' + file_path + '')
77
- else:
78
- if isinstance(data, dict):
79
- data_str = json.dumps(data, ensure_ascii=False)
80
- elif isinstance(data, list):
81
- data_str = ""
82
- for i in data:
83
- if isinstance(i, dict):
84
- data_str = data_str + json.dumps(i, ensure_ascii=False) + "\n"
85
- else:
86
- data_str = data_str + str(i) + "\n"
87
- else:
88
- data_str = str(data)
89
- print(data_str[:100])
90
- with open(file_path, 'w', encoding='utf-8') as f:
91
- f.write(data_str)
92
- post_file(vx_key, file_path)
93
- except:
94
- send_vx(vx_key, str(traceback.format_exc()))
95
- os.system('rm -r' + temp_dir + '')
1
+ import sqlite3
2
+ import requests
3
+ import pandas as pd
4
+ import os
5
+ import json
6
+ import traceback
7
+
8
+
9
+ # c1d3a814-1a02-4bbd-b5c2-f756fef92cb8: b层机器人消息群-非聊天 的 pythonspark
10
+ # 013547da-3d78-4a7f-b4a7-e668b192c293: b层机器人消息群-非聊天 的 数仓B层服务端部署通知
11
+
12
def send_vx(vx_key, s, i=0):
    """Send a text message to a 企业微信 (WeCom) robot webhook.

    Args:
        vx_key: Key of the caller's own robot webhook (string).
        s: Message text to send (string).
        i: Number of attempts already consumed. Callers normally leave it at 0.

    Raises:
        Exception: Once ``i`` exceeds 3, i.e. after four failed attempts when
            starting from 0. The message is the formatted traceback of the
            most recent failure, and the failure itself is chained as the cause.
    """
    vx_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=' + vx_key
    headers = {"Content-Type": "text/plain"}
    if i > 3:
        # The caller handed over an exhausted budget: fail before any request.
        raise Exception(str(traceback.format_exc()))

    content = s
    attempt = i
    while True:
        data = {
            "msgtype": "text",
            "text": {
                "content": content,
            }
        }
        try:
            requests.post(url=vx_url, headers=headers, json=data, timeout=30)
            return
        except Exception as err:
            # `Exception`, not a bare `except:` -- Ctrl-C and SystemExit must
            # stop the process instead of silently triggering another attempt.
            attempt = attempt + 1
            # NOTE(review): as before, every retry sends the traceback of the
            # failed attempt rather than the original message `s`; confirm
            # that this is the intended notification.
            content = str(traceback.format_exc())
            if attempt > 3:
                raise Exception(content) from err
31
+
32
+
33
def post_file(vx_key, file_path):
    """Upload a local file to 企业微信 (WeCom) and post it through the robot webhook.

    The file is first uploaded to the ``upload_media`` endpoint; the returned
    ``media_id`` is then sent as a ``file`` message.

    Args:
        vx_key: Key of the caller's own robot webhook (string).
        file_path: Path of the file to send (string).

    Raises:
        OSError: If ``file_path`` cannot be opened.
        KeyError: If the upload response carries no ``media_id``.
        Exception: Whatever the upload request itself raises; only the final
            "send" request is guarded.
    """
    id_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/upload_media?key=' + vx_key + '&type=file'
    wx_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=' + vx_key

    # Keep the handle open only for the duration of the upload so it is always
    # closed, even when the request raises.
    with open(file_path, 'rb') as file_handle:
        # timeout matches send_vx so a stalled connection cannot hang forever.
        response = requests.post(url=id_url, files={'file': file_handle}, timeout=30)
    media_id = response.json()['media_id']

    data = {"msgtype": "file",
            "file": {"media_id": media_id}
            }
    try:
        requests.post(url=wx_url, json=data, timeout=30)
    except Exception:
        # Report the failure to the same robot. The key must be passed here:
        # passing the send_vx function itself made the URL concatenation fail.
        send_vx(vx_key, str(traceback.format_exc()))
50
+
51
+
52
def file_to_vx(vx_key, data, file_name):
    """Serialise ``data`` into a temporary file and post it to the robot webhook.

    The file type is taken from the extension of ``file_name`` (``xls`` and
    ``xlsx`` are treated as ``excel``):

    * A ``pandas.DataFrame`` with extension ``db3`` is written to an SQLite
      file as table ``base_table``.
    * A ``pandas.DataFrame`` with any other extension except ``txt`` is written
      with ``DataFrame.to_<extension>(path, index=False)``.
    * Everything else is written as UTF-8 text: a dict as one JSON document,
      a list as one line per item (dict items as JSON), any other value via
      ``str()``.

    Any failure while writing or posting is reported to the robot as a
    traceback message instead of being raised.

    Args:
        vx_key: Key of the caller's own robot webhook (string).
        data: DataFrame, string, dict, list, tuple or array to send.
        file_name: Complete file name including its extension. Only the base
            name is used for the temporary file.
    """
    import tempfile

    file_type = file_name.split('.')[-1]
    if file_type in ("xls", "xlsx"):
        file_type = "excel"

    # A private temporary directory replaces the shared "<cwd>/tmp" folder:
    # the old cleanup command ('rm -r' + path, without a space) never removed
    # anything, and the file itself was written to the working directory and
    # left behind on most paths. The context manager removes everything.
    with tempfile.TemporaryDirectory() as temp_dir:
        file_path = os.path.join(temp_dir, os.path.basename(file_name))
        try:
            if isinstance(data, pd.DataFrame) and file_type != "txt":
                if file_type == "db3":
                    conn = sqlite3.connect(file_path)
                    try:
                        data.to_sql('base_table', conn, if_exists='replace', index=False)
                    finally:
                        # Close before upload/cleanup so the file is complete
                        # and no handle is leaked.
                        conn.close()
                else:
                    # Attribute lookup instead of eval(): the same writer is
                    # called, but text from the file name is never executed.
                    writer = getattr(data, "to_" + file_type)
                    writer(file_path, index=False)
            else:
                if isinstance(data, dict):
                    data_str = json.dumps(data, ensure_ascii=False)
                elif isinstance(data, list):
                    data_str = "".join(
                        (json.dumps(item, ensure_ascii=False) if isinstance(item, dict) else str(item)) + "\n"
                        for item in data
                    )
                else:
                    data_str = str(data)
                # Preview of the payload, kept from the original implementation.
                print(data_str[:100])
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(data_str)
            post_file(vx_key, file_path)
        except Exception:
            # Best effort: tell the robot what went wrong rather than raising.
            send_vx(vx_key, str(traceback.format_exc()))
@@ -1,76 +1,76 @@
1
- import pickle
2
-
3
- import ahocorasick
4
-
5
-
6
- class ACTool(object):
7
-
8
- def __init__(self):
9
- self.automaton = ahocorasick.Automaton()
10
-
11
- def add_word(self, key, value, overwrite=True) -> bool:
12
- """
13
- 为 AC 机添加数据,默认情况下 key重复直接覆盖
14
- :param key: 要添加的关键字
15
- :param value: 对应的值
16
- :param overwrite: 是否覆盖已有的 key,默认为 True
17
- :return: 是否成功添加或覆盖
18
- """
19
- if key in self.automaton: # 检查 key 是否已存在
20
- if overwrite: # 如果允许覆盖
21
- self.automaton.add_word(key, value)
22
- return True
23
- else: # 不允许覆盖,跳过
24
- return False
25
- else: # key 不存在,直接添加
26
- self.automaton.add_word(key, value)
27
- return True
28
-
29
- def is_exists_key(self, key) -> bool:
30
- # 是否存在key
31
- if self.automaton.exists(key):
32
- return True
33
- else:
34
- return False
35
-
36
- def make_automaton(self):
37
- """
38
- 添加完词后需要构建
39
- """
40
- self.automaton.make_automaton()
41
-
42
- def iter(self, key):
43
- """
44
- 结果为可迭代对象 可通过list 转换 [(end_index, value)]
45
- tool.add_word("he", "word1")
46
- tool.add_word("hello", "word2")
47
-
48
- # 在字符串中查找匹配
49
- input_string = "hello world"
50
- matches = list(tool.automaton.iter(input_string))
51
- print(matches) # [(1, 'word1'), (4, 'word2')]
52
-
53
- (1, 'word1'):
54
- end_index = 1: 表示匹配的关键字 "he" 在 input_string = "hello world" 中的结束位置是索引 1(即字符串 "he" 的最后一个字符 'e' 的位置)。
55
- "hello world" 的索引:h(0)e(1)l(2)l(3)o(4) (5)w(6)o(7)r(8)l(9)d(10)。
56
- value = 'word1': 表示匹配的关键字 "he" 对应的值是 "word1"。
57
- (4, 'word2'):
58
- end_index = 4: 表示匹配的关键字 "hello" 在 input_string = "hello world" 中的结束位置是索引 4(即字符串 "hello" 的最后一个字符 'o' 的位置)。
59
- value = 'word2': 表示匹配的关键字 "hello" 对应的值是 "word2"。
60
-
61
- 注意: 结果只会返回 value 不会返回 key,如果需要key 请将key 组合到结果中
62
- """
63
-
64
- result_iter = self.automaton.iter(key) # ahocorasick.AutomatonSearchIter
65
- return result_iter
66
- def save(self,local_temp_path):
67
- """
68
- 将构建好的ac自动机保存到本地
69
- """
70
- self.automaton.save(local_temp_path,pickle.dumps)
71
-
72
- def load(self,local_temp_path):
73
- """
74
- 加载已经构建好的ac自动机
75
- """
1
+ import pickle
2
+
3
+ import ahocorasick
4
+
5
+
6
class ACTool(object):
    """Small convenience wrapper around an ``ahocorasick.Automaton``."""

    def __init__(self):
        self.automaton = ahocorasick.Automaton()

    def add_word(self, key, value, overwrite=True) -> bool:
        """Add a keyword to the automaton; duplicates are overwritten by default.

        :param key: keyword to add
        :param value: value associated with the keyword
        :param overwrite: whether an existing key may be replaced, default True
        :return: True when the word was added or overwritten, False when an
            existing key was left untouched
        """
        already_present = key in self.automaton
        if already_present and not overwrite:
            # Existing key and overwriting is not allowed: skip it.
            return False
        self.automaton.add_word(key, value)
        return True

    def is_exists_key(self, key) -> bool:
        """Return True when ``key`` is stored in the automaton."""
        return True if self.automaton.exists(key) else False

    def make_automaton(self):
        """Build the automaton; call this after all words have been added."""
        self.automaton.make_automaton()

    def iter(self, key):
        """Search ``key`` for stored words.

        The result is an iterable that can be turned into a list of
        ``(end_index, value)`` pairs::

            tool.add_word("he", "word1")
            tool.add_word("hello", "word2")

            # look for matches inside a string
            input_string = "hello world"
            matches = list(tool.automaton.iter(input_string))
            print(matches)  # [(1, 'word1'), (4, 'word2')]

        ``(1, 'word1')``: the keyword "he" ends at index 1 of "hello world"
        (the position of its last character 'e') and its value is "word1".
        Indexes of "hello world": h(0)e(1)l(2)l(3)o(4) (5)w(6)o(7)r(8)l(9)d(10).

        ``(4, 'word2')``: the keyword "hello" ends at index 4 (the position of
        its last character 'o') and its value is "word2".

        Note: only the value is returned, never the key. If the key is needed,
        store it as part of the value.
        """
        # The returned object is an ahocorasick.AutomatonSearchIter.
        return self.automaton.iter(key)

    def save(self, local_temp_path):
        """Save the built automaton to a local file."""
        self.automaton.save(local_temp_path, pickle.dumps)

    def load(self, local_temp_path):
        """Load a previously built automaton from a local file."""
        # NOTE(review): values are restored with pickle.loads, so only load
        # files from a trusted source.
        self.automaton = ahocorasick.load(local_temp_path, pickle.loads)
@@ -1,35 +1,35 @@
1
- import asyncio
2
- from asyncio import Semaphore
3
- from typing import Awaitable, List, Iterable, Callable
4
-
5
-
6
- class AsyncTaskPool:
7
- def __init__(self, max_workers: int = 10):
8
- """
9
-
10
- Args:
11
- max_workers: 任务最大并发数
12
- """
13
- self.semaphore = Semaphore(max_workers)
14
-
15
- async def _run_task(self, task: Awaitable):
16
- async with self.semaphore:
17
- return await task
18
-
19
- async def run(self, tasks: List[Awaitable]):
20
- return await asyncio.gather(*[self._run_task(task) for task in tasks])
21
-
22
- async def map(self, fn: Callable[..., Awaitable], *iterables: Iterable):
23
- tasks = [fn(*args) for args in zip(*iterables)]
24
- return await self.run(tasks)
25
-
26
-
27
- if __name__ == "__main__":
28
-
29
- async def test(x, y):
30
- await asyncio.sleep(1)
31
- print(x, y)
32
- return x + y
33
-
34
- result = asyncio.run(AsyncTaskPool(2).map(test, [1, 2, 3, 4], [5, 6, 7, 8]))
35
- print(result)
1
+ import asyncio
2
+ from asyncio import Semaphore
3
+ from typing import Awaitable, List, Iterable, Callable
4
+
5
+
6
class AsyncTaskPool:
    """Await many awaitables while capping how many are in progress at once."""

    def __init__(self, max_workers: int = 10):
        """
        Args:
            max_workers: Maximum number of tasks awaited concurrently; must be
                at least 1.

        Raises:
            ValueError: If ``max_workers`` is smaller than 1.
        """
        if max_workers < 1:
            # A semaphore of 0 can never be acquired, so every task would wait
            # forever; fail loudly instead of hanging in run().
            raise ValueError("max_workers must be >= 1")
        self.semaphore = Semaphore(max_workers)

    async def _run_task(self, task: Awaitable):
        """Await ``task`` once a semaphore slot is free and return its result."""
        async with self.semaphore:
            return await task

    async def run(self, tasks: List[Awaitable]):
        """Await all ``tasks`` and return their results in input order."""
        return await asyncio.gather(*[self._run_task(task) for task in tasks])

    async def map(self, fn: Callable[..., Awaitable], *iterables: Iterable):
        """Call ``fn`` on the zipped ``iterables`` and await every result.

        Like the built-in ``zip``, iteration stops at the shortest iterable.
        """
        tasks = [fn(*args) for args in zip(*iterables)]
        return await self.run(tasks)
25
+
26
+
27
if __name__ == "__main__":

    async def test(x, y):
        # Stand-in for real I/O: wait a second, report the pair, return the sum.
        await asyncio.sleep(1)
        print(x, y)
        return x + y

    # Four jobs through a pool of two workers.
    left_operands = [1, 2, 3, 4]
    right_operands = [5, 6, 7, 8]
    result = asyncio.run(AsyncTaskPool(2).map(test, left_operands, right_operands))
    print(result)
@@ -1,53 +1,53 @@
1
- from abc import ABC, abstractmethod
2
- from typing import List, Generator
3
-
4
-
5
- class BaseFileReader(ABC):
6
-
7
- def __init__(self, batch_size: int = 10000):
8
- self.batch_size = batch_size
9
- self.read_model = 1
10
-
11
- @abstractmethod
12
- def list_files(self, path: str) -> List[str]:
13
- """列出路径下所有目标文件"""
14
- pass
15
-
16
- @abstractmethod
17
- def count_lines(self, file_path: str) -> int:
18
- """统计文件行数"""
19
- pass
20
-
21
- @abstractmethod
22
- def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
23
- """读取文件内容,返回批量数据"""
24
- pass
25
-
26
- @abstractmethod
27
- def read_all(self, file_path: str) -> List[List[str]]:
28
- """读取整个文件,默认按1000行分批"""
29
- return [line for line in self.read_lines(file_path)]
30
-
31
- def read_select(self, file_path: str) -> Generator[List[str], None, None]:
32
- if self.read_model == 1:
33
- for batch_data in self.read_lines(file_path):
34
- yield batch_data
35
- elif self.read_model == 2:
36
- for batch_data in self.read_all(file_path):
37
- yield batch_data
38
- else:
39
- raise Exception("模式选择错误")
40
-
41
-
42
- class BaseFileWriter(ABC):
43
-
44
- def __init__(self, file_path: str, compress: bool = True, overwrite: bool = True, encoding: str = "utf-8"):
45
- self.file_path = file_path
46
- self.compress = compress
47
- self.encoding = encoding
48
- self.overwrite = overwrite
49
-
50
- @abstractmethod
51
- def write_lines(self, lines: List[str], file_path: str):
52
- """写入多行文本到文件,支持压缩"""
53
- pass
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Generator
3
+
4
+
5
class BaseFileReader(ABC):
    """Abstract base class for readers that deliver file content in batches."""

    def __init__(self, batch_size: int = 10000):
        # read_model picks the strategy used by read_select:
        # 1 -> read_lines, 2 -> read_all.
        self.read_model = 1
        # Stored for subclasses; this base class does not use it itself.
        self.batch_size = batch_size

    @abstractmethod
    def list_files(self, path: str) -> List[str]:
        """List all target files under ``path``."""

    @abstractmethod
    def count_lines(self, file_path: str) -> int:
        """Count the lines of a file."""

    @abstractmethod
    def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
        """Read the file content and yield it batch by batch."""

    @abstractmethod
    def read_all(self, file_path: str) -> List[List[str]]:
        """Read the whole file and return every batch from ``read_lines``.

        The method is abstract, so subclasses must override it; this body is
        available to them through ``super().read_all(file_path)``.
        """
        return list(self.read_lines(file_path))

    def read_select(self, file_path: str) -> Generator[List[str], None, None]:
        """Yield batches using the strategy selected by ``self.read_model``.

        Raises:
            Exception: If ``read_model`` is neither 1 nor 2. Because this is a
                generator, the error surfaces on the first iteration.
        """
        mode = self.read_model
        if mode == 1:
            yield from self.read_lines(file_path)
        elif mode == 2:
            yield from self.read_all(file_path)
        else:
            raise Exception("模式选择错误")
40
+
41
+
42
class BaseFileWriter(ABC):
    """Abstract base class for writers that put lines of text into a file."""

    def __init__(self, file_path: str, compress: bool = True, overwrite: bool = True, encoding: str = "utf-8"):
        # Plain settings holder: the values are only stored here and are left
        # for subclasses to interpret.
        self.file_path, self.compress = file_path, compress
        self.encoding, self.overwrite = encoding, overwrite

    @abstractmethod
    def write_lines(self, lines: List[str], file_path: str):
        """Write several lines of text to a file, with compression support."""