re-common 10.0.37__py3-none-any.whl → 10.0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217) hide show
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  172. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  173. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  174. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  175. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  176. re_common/v2/baselibrary/utils/basedict.py +37 -37
  177. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  178. re_common/v2/baselibrary/utils/basepika.py +180 -180
  179. re_common/v2/baselibrary/utils/basetime.py +77 -77
  180. re_common/v2/baselibrary/utils/db.py +156 -156
  181. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  182. re_common/v2/baselibrary/utils/mq.py +83 -83
  183. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  184. re_common/v2/baselibrary/utils/string_bool.py +186 -186
  185. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  186. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  187. re_common/v2/baselibrary/utils/stringutils.py +271 -278
  188. re_common/vip/base_step_process.py +11 -11
  189. re_common/vip/baseencodeid.py +90 -90
  190. re_common/vip/changetaskname.py +28 -28
  191. re_common/vip/core_var.py +24 -24
  192. re_common/vip/mmh3Hash.py +89 -89
  193. re_common/vip/proxy/allproxys.py +127 -127
  194. re_common/vip/proxy/allproxys_thread.py +159 -159
  195. re_common/vip/proxy/cnki_proxy.py +153 -153
  196. re_common/vip/proxy/kuaidaili.py +87 -87
  197. re_common/vip/proxy/proxy_all.py +113 -113
  198. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  199. re_common/vip/proxy/wanfang_proxy.py +152 -152
  200. re_common/vip/proxy/wp_proxy_all.py +181 -181
  201. re_common/vip/read_rawid_to_txt.py +91 -91
  202. re_common/vip/title/__init__.py +5 -5
  203. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  204. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  205. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  206. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  207. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  208. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  209. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  210. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  211. re_common/vip/title/transform/__init__.py +10 -10
  212. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/LICENSE +201 -201
  213. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/METADATA +16 -16
  214. re_common-10.0.39.dist-info/RECORD +248 -0
  215. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/WHEEL +1 -1
  216. re_common-10.0.37.dist-info/RECORD +0 -248
  217. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/top_level.txt +0 -0
@@ -1,116 +1,116 @@
1
- import base64
2
- import hashlib
3
- import os
4
-
5
- from re_common.v2.baselibrary.business_utils.baseencodeid import BaseLngid
6
-
7
- import os
8
- import base64
9
- import hashlib
10
-
11
- """
12
- DOI-文件路径 转换工具
13
-
14
- 设计目标:
15
- 1. 将任意DOI字符串转换为可逆、稳定的文件路径
16
- 2. 提供高效的目录分散方案(65,536个子目录)
17
- 3. 支持带文件扩展名的存储
18
- 4. 完全可逆转换
19
-
20
- 工作原理:
21
- 1. DOI编码:
22
- - 使用URL安全的Base64编码(RFC 3548)
23
- - 移除Base64填充的'='字符
24
- - 文件名长度 ≈ 原始DOI长度 × 4/3
25
-
26
- 2. 目录分散:
27
- - 使用MD5哈希创建两级目录结构
28
- - 目录层级:/MD5[0:2]/MD5[2:4]/
29
- - 支持65,536个目录(256×256),每目录约1,525个文件(假设10亿文件)
30
-
31
- 3. 扩展名处理:
32
- - 保持原始扩展名不变
33
- - 解码时自动忽略扩展名
34
-
35
- 典型转换示例:
36
- DOI: "10.1000/xyz123" -> 路径: "a1/b2/QTMuMTAwMC94eXoxMjM.pdf"
37
- 路径: "a1/b2/QTMuMTAwMC94eXoxMjM.pdf" -> DOI: "10.1000/xyz123"
38
- """
39
-
40
- base_lngid = BaseLngid()
41
-
42
-
43
- # 以后需要启用
44
- def doi_to_path(doi: str, ext: str = "") -> str:
45
- """
46
- 将 DOI 转换为可逆的存储路径:
47
- 1. 对 DOI 进行 URL 安全的 Base64 编码(可逆)
48
- 2. 生成 DOI 的 MD5 哈希用于目录分散
49
- 3. 目录结构:MD5前2字符/次2字符/
50
- 4. 文件名:Base64编码的DOI + 扩展名
51
-
52
- Args:
53
- doi: 文件 DOI 标识符
54
- ext: 文件扩展名(如 '.pdf')
55
-
56
- Returns:
57
- 相对文件路径(如 'a1/b2/QTMuMTAwMC94eXoxMjM=.pdf')
58
- """
59
- # URL安全的Base64编码(可逆)
60
- doi_b64 = base64.urlsafe_b64encode(doi.encode("utf-8")).decode("ascii").rstrip("=")
61
-
62
- # 生成MD5哈希用于目录分配
63
- hash_md5 = hashlib.md5(doi.encode("utf-8")).hexdigest()
64
- dir_level1 = hash_md5[0:2]
65
- dir_level2 = hash_md5[2:4]
66
-
67
- return os.path.join(dir_level1, dir_level2, f"{doi_b64}{ext}")
68
-
69
-
70
- # 以后需要启用
71
- def path_to_doi(path: str) -> str:
72
- """
73
- 从文件路径反推原始DOI
74
- Args:
75
- path: 文件路径(如 'a1/b2/QTMuMTAwMC94eXoxMjM=.pdf')
76
-
77
- Returns:
78
- 原始DOI字符串
79
- """
80
- # 提取文件名并移除扩展名
81
- filename = os.path.basename(path)
82
- base_name = os.path.splitext(filename)[0]
83
-
84
- # 补齐Base64填充字符
85
- padding = 4 - (len(base_name) % 4)
86
- if padding != 4: # 不需要补齐
87
- base_name += "=" * padding
88
-
89
- # Base64解码还原DOI
90
- return base64.urlsafe_b64decode(base_name.encode("ascii")).decode("utf-8")
91
-
92
-
93
- def doi_to_dir(doi):
94
- """生成文件的存储路径和可解码的文件名
95
-
96
- Args:
97
- doi (str): 文件的唯一DOI标识
98
-
99
- Returns:
100
- str: 文件相对路径,如 "ab/cd/Base64EncodedFileName"
101
- """
102
- # 计算DOI的MD5哈希
103
- hash_md5 = hashlib.md5(doi.encode('utf-8')).hexdigest().lower()
104
-
105
- # 提取目录层级:前2位作为一级目录,3-4位作为二级目录
106
- first_dir = hash_md5[0:2].upper()
107
- second_dir = hash_md5[2:4].upper()
108
-
109
- return first_dir + "/" + second_dir
110
-
111
-
112
- def get_doi_path(doi, case_insensitive=False):
113
- # 目前使用
114
- dir_path = doi_to_dir(doi)
115
- file_name = base_lngid.getDoiid(doi, case_insensitive=case_insensitive) + ".pdf"
116
- return dir_path + "/" + file_name
1
+ import base64
2
+ import hashlib
3
+ import os
4
+
5
+ from re_common.v2.baselibrary.business_utils.baseencodeid import BaseLngid
6
+
7
+ import os
8
+ import base64
9
+ import hashlib
10
+
11
+ """
12
+ DOI-文件路径 转换工具
13
+
14
+ 设计目标:
15
+ 1. 将任意DOI字符串转换为可逆、稳定的文件路径
16
+ 2. 提供高效的目录分散方案(65,536个子目录)
17
+ 3. 支持带文件扩展名的存储
18
+ 4. 完全可逆转换
19
+
20
+ 工作原理:
21
+ 1. DOI编码:
22
+ - 使用URL安全的Base64编码(RFC 3548)
23
+ - 移除Base64填充的'='字符
24
+ - 文件名长度 ≈ 原始DOI长度 × 4/3
25
+
26
+ 2. 目录分散:
27
+ - 使用MD5哈希创建两级目录结构
28
+ - 目录层级:/MD5[0:2]/MD5[2:4]/
29
+ - 支持65,536个目录(256×256),每目录约1,525个文件(假设10亿文件)
30
+
31
+ 3. 扩展名处理:
32
+ - 保持原始扩展名不变
33
+ - 解码时自动忽略扩展名
34
+
35
+ 典型转换示例:
36
+ DOI: "10.1000/xyz123" -> 路径: "a1/b2/QTMuMTAwMC94eXoxMjM.pdf"
37
+ 路径: "a1/b2/QTMuMTAwMC94eXoxMjM.pdf" -> DOI: "10.1000/xyz123"
38
+ """
39
+
40
+ base_lngid = BaseLngid()
41
+
42
+
43
+ # 以后需要启用
44
def doi_to_path(doi: str, ext: str = "") -> str:
    """
    Convert a DOI into a reversible, platform-independent relative storage path.

    Steps:
        1. Encode the DOI with URL-safe Base64 and strip the trailing '='
           padding (``path_to_doi`` restores the padding when decoding).
        2. Hash the DOI with MD5 and use the hex digest for directory sharding.
        3. Directory layout: first two hex characters / next two hex characters.
        4. File name: Base64-encoded DOI followed by ``ext``.

    Args:
        doi: DOI identifier of the file.
        ext: File extension appended verbatim, including its leading dot
            (e.g. '.pdf'). Defaults to no extension.

    Returns:
        Relative path of the form '<md5[0:2]>/<md5[2:4]>/<base64><ext>'.
        For example, doi "abc" with ext ".pdf" gives '90/01/YWJj.pdf'.
    """
    raw = doi.encode("utf-8")

    # URL-safe Base64 (reversible); padding is dropped to keep names short.
    doi_b64 = base64.urlsafe_b64encode(raw).decode("ascii").rstrip("=")

    # hexdigest() is lower-case hex, so the shard directories are lower-case.
    digest = hashlib.md5(raw).hexdigest()

    # Always join with "/" rather than os.path.join: the result must be the
    # same string on every OS so a path written on Windows can be decoded on
    # POSIX (and vice versa), matching the separator used by doi_to_dir().
    return f"{digest[0:2]}/{digest[2:4]}/{doi_b64}{ext}"
68
+
69
+
70
+ # 以后需要启用
71
def path_to_doi(path: str) -> str:
    """
    Recover the original DOI from a storage path.

    Only the final path component is used; the directory part is ignored.

    Args:
        path: File path whose last component is the URL-safe Base64 encoded
            DOI, optionally followed by an extension, e.g.
            'a1/b2/MTAuMTAwMC94eXoxMjM.pdf'. Both '/' and '\\' are accepted
            as separators, and the Base64 '=' padding may be present or absent.

    Returns:
        The original DOI string.

    Raises:
        binascii.Error: If the file name is not valid Base64.
        UnicodeError: If the file name is not ASCII or the decoded bytes are
            not valid UTF-8.
    """
    # Split on either separator so a path written on one OS decodes on another
    # (os.path.basename only understands the current platform's separators).
    filename = path.replace("\\", "/").rsplit("/", 1)[-1]

    # The URL-safe Base64 alphabet never contains '.', so everything from the
    # first dot onwards is extension. This also handles multi-part suffixes
    # such as '.tar.gz', which os.path.splitext would only partly remove.
    encoded = filename.partition(".")[0]

    # Restore the padding up to a multiple of four characters.
    encoded += "=" * (-len(encoded) % 4)

    return base64.urlsafe_b64decode(encoded.encode("ascii")).decode("utf-8")
91
+
92
+
93
def doi_to_dir(doi: str) -> str:
    """Return the two-level shard directory for a DOI.

    The directory is derived from the MD5 hex digest of the UTF-8 encoded DOI:
    characters 1-2 form the first level and characters 3-4 the second level,
    both in upper case. No file name is included in the result.

    Args:
        doi (str): Unique DOI identifier of the file.

    Returns:
        str: Relative directory joined with "/", e.g. "90/01" for doi "abc".
    """
    # Upper-case once; the original lower-cased the digest and then upper-cased
    # each slice, which yields the same characters.
    digest = hashlib.md5(doi.encode("utf-8")).hexdigest().upper()
    return f"{digest[0:2]}/{digest[2:4]}"
110
+
111
+
112
def get_doi_path(doi, case_insensitive=False):
    """Build the relative PDF path for a DOI (the variant currently in use).

    The directory comes from ``doi_to_dir(doi)``; the file name is whatever
    ``base_lngid.getDoiid`` returns for the DOI with ".pdf" appended.

    Args:
        doi: DOI identifier of the file.
        case_insensitive: Forwarded unchanged to ``base_lngid.getDoiid``.
            NOTE(review): its exact effect is defined by BaseLngid - confirm there.

    Returns:
        "<directory>/<doi id>.pdf".
    """
    storage_dir = doi_to_dir(doi)
    pdf_name = base_lngid.getDoiid(doi, case_insensitive=case_insensitive) + ".pdf"
    return "/".join((storage_dir, pdf_name))
@@ -1,6 +1,6 @@
1
- def assign_group_id(rows: list, sub_db_order: list):
2
- subdb_keyid_map = {row.sub_db_id: row.keyid for row in rows}
3
- for sub_db_id in sub_db_order:
4
- if keyid := subdb_keyid_map.get(sub_db_id):
5
- return keyid, len(rows), rows
6
- return rows[0].keyid, len(rows), rows
1
def assign_group_id(rows: list, sub_db_order: list):
    """Pick the group id for a set of rows according to a sub-database priority.

    Args:
        rows: Non-empty list of objects exposing ``sub_db_id`` and ``keyid``.
        sub_db_order: ``sub_db_id`` values in descending priority.

    Returns:
        tuple: ``(keyid, len(rows), rows)``. ``keyid`` belongs to the first
        entry of ``sub_db_order`` that has a truthy keyid among ``rows``; when
        none matches, the keyid of ``rows[0]`` is used.

    Raises:
        IndexError: If ``rows`` is empty (there is no row to fall back to).
    """
    if not rows:
        # Same exception type the original raised from rows[0], but raised up
        # front with a message that names the actual problem.
        raise IndexError("assign_group_id() requires at least one row, got an empty list")

    # When several rows share a sub_db_id, the last one wins.
    subdb_keyid_map = {row.sub_db_id: row.keyid for row in rows}
    for sub_db_id in sub_db_order:
        # Truthiness check: a missing entry and an empty/zero keyid are both skipped.
        if keyid := subdb_keyid_map.get(sub_db_id):
            return keyid, len(rows), rows
    return rows[0].keyid, len(rows), rows
@@ -1,59 +1,59 @@
1
- import warnings
2
- import functools
3
-
4
- # 全局集合,用于记录已警告的函数或类
5
- _warned_once = set()
6
-
7
-
8
- def deprecated(message=None):
9
- """
10
- 装饰器:标记函数或类为已废弃,整个进程只发出一次警告。
11
-
12
- Args:
13
- message (str): 自定义警告信息,默认为 None。
14
- """
15
-
16
- def decorator(obj):
17
- # 如果是函数
18
- if isinstance(obj, type(lambda: None)):
19
- @functools.wraps(obj)
20
- def wrapper(*args, **kwargs):
21
- obj_id = id(obj) # 使用对象的内存地址作为唯一标识
22
- if obj_id not in _warned_once:
23
- default_msg = f"函数 {obj.__name__} 已不建议使用。"
24
- warn_msg = f"{default_msg} {message}" if message else default_msg
25
- warnings.warn(
26
- warn_msg,
27
- category=DeprecationWarning,
28
- stacklevel=2
29
- )
30
- _warned_once.add(obj_id) # 记录已警告
31
- return obj(*args, **kwargs)
32
-
33
- return wrapper
34
-
35
- # 如果是类
36
- elif isinstance(obj, type):
37
- orig_init = obj.__init__
38
-
39
- @functools.wraps(orig_init)
40
- def new_init(self, *args, **kwargs):
41
- obj_id = id(obj)
42
- if obj_id not in _warned_once:
43
- default_msg = f"类 {obj.__name__} 已不建议使用。"
44
- warn_msg = f"{default_msg} {message}" if message else default_msg
45
- warnings.warn(
46
- warn_msg,
47
- category=DeprecationWarning,
48
- stacklevel=2
49
- )
50
- _warned_once.add(obj_id) # 记录已警告
51
- orig_init(self, *args, **kwargs)
52
-
53
- obj.__init__ = new_init
54
- return obj
55
-
56
- else:
57
- raise TypeError("此装饰器仅适用于函数和类")
58
-
59
- return decorator
1
+ import warnings
2
+ import functools
3
+
4
# Process-wide registry holding id() of every function/class that has already
# emitted its deprecation warning.
_warned_once = set()


def _emit_deprecation_once(obj, label, message, stacklevel):
    """Warn that *obj* is deprecated, unless this process already warned about it.

    Args:
        obj: The decorated function or class; its id() is the registry key.
        label: Word placed before the object's name in the warning text.
        message (str | None): Optional text appended to the default warning.
        stacklevel (int): Passed to ``warnings.warn``, counted from this helper.
    """
    obj_id = id(obj)
    if obj_id in _warned_once:
        return
    default_msg = f"{label} {obj.__name__} 已不建议使用。"
    warn_msg = f"{default_msg} {message}" if message else default_msg
    warnings.warn(warn_msg, category=DeprecationWarning, stacklevel=stacklevel)
    # Recorded only after warn() returns, so if the warning is turned into an
    # exception by the active filters the object is not marked as handled.
    _warned_once.add(obj_id)


def deprecated(message=None):
    """
    Decorator marking a function or class as deprecated; each decorated object
    emits its DeprecationWarning at most once per process.

    Functions are wrapped; classes are returned as the same object with their
    ``__init__`` replaced by a warning wrapper.

    Args:
        message (str): Custom text appended to the default warning, default None.

    Raises:
        TypeError: At decoration time, if the target is neither a plain Python
            function nor a class.
    """
    from types import FunctionType

    def decorator(obj):
        # Plain Python function
        if isinstance(obj, FunctionType):
            @functools.wraps(obj)
            def wrapper(*args, **kwargs):
                # stacklevel=3: helper -> wrapper -> caller of the deprecated function
                _emit_deprecation_once(obj, "函数", message, stacklevel=3)
                return obj(*args, **kwargs)

            return wrapper

        # Class: patch __init__ in place so the class object itself is preserved
        if isinstance(obj, type):
            orig_init = obj.__init__

            @functools.wraps(orig_init)
            def new_init(self, *args, **kwargs):
                # stacklevel=3: helper -> new_init -> code instantiating the class
                _emit_deprecation_once(obj, "类", message, stacklevel=3)
                orig_init(self, *args, **kwargs)

            obj.__init__ = new_init
            return obj

        raise TypeError("此装饰器仅适用于函数和类")

    return decorator
@@ -1,105 +1,105 @@
1
- import datetime
2
- import gzip
3
- import io
4
- import time
5
-
6
- import joblib
7
- from hdfs import InsecureClient
8
-
9
- from re_common.v2.baselibrary.helpers.search_packge.fit_text_match import FitTextMatcher
10
-
11
- if __name__ == '__main__':
12
- matcher = FitTextMatcher(
13
- algorithm='auto', # 对于小数据集,暴力搜索足够快 brute
14
- n_jobs=2 # 使用所有CPU核心
15
-
16
- )
17
-
18
- client = InsecureClient("http://VIP-DC-MASTER-2:9870", user="xujiang")
19
-
20
- lists = []
21
- for i in ["vectorizer", "nn_model", "corpus", "idx"]:
22
- fit_file_path = f"/b_task_data/class_smi/fit_file/t_23600_{i}.joblib.gz"
23
- with client.read(fit_file_path) as reader:
24
- tp = io.BytesIO(reader.read())
25
- tp.seek(0)
26
- lists.append(tp)
27
-
28
- with gzip.GzipFile(fileobj=lists[2], mode='rb') as gz:
29
- matcher.corpus = joblib.load(gz)
30
-
31
- with gzip.GzipFile(fileobj=lists[3], mode='rb') as gz:
32
- matcher.idx = joblib.load(gz)
33
- matcher.corpus_size = max(len(matcher.corpus), len(matcher.idx))
34
- print(f"加载bytes完成,共 {matcher.corpus_size} 篇文献")
35
-
36
- matcher.fit(matcher.corpus)
37
-
38
- print(matcher.nn._fit_method)
39
-
40
- print("fit 训练完成")
41
-
42
- count = 0
43
- bacth_list = []
44
- n = min(100, matcher.corpus_size)
45
- for i in matcher.corpus:
46
- count = count + 1
47
- bacth_list.append(i)
48
- if count % 10000 == 0:
49
- t1 = time.time()
50
- index, similarities = matcher.batch_search(bacth_list, n=n)
51
- for rank, (idxs, sims) in enumerate(zip(index, similarities)):
52
- print({"keyid": matcher.idx[rank],
53
- "search_list": [(matcher.idx[idx], sim) for idx, sim in zip(idxs, sims)]})
54
-
55
- t2 = time.time()
56
- now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
57
- print(now, count, t2 - t1)
58
- bacth_list.clear()
59
-
60
-
61
-
62
-
63
-
64
-
65
-
66
- # with open("t_8220_corpus.joblib","rb") as f:
67
- # buf = io.BytesIO(f.read())
68
- # buf.seek(0)
69
- # corpus = buf
70
- #
71
- # matcher.corpus = joblib.load(corpus)
72
- # print(len(matcher.corpus))
73
-
74
- # matcher.load_bytes(vec, nn, corpus)
75
-
76
- # with open(r"C:\Users\Administrator\Desktop\update\part-02440\part-02440_1", "r", encoding="utf-8") as f:
77
- # lists = [line.strip() for line in f if line]
78
- #
79
- # matcher.fit(lists)
80
-
81
- # matcher.load("./","test")
82
-
83
- # query = r"herbdrug interaction in the protective effect of alpinia officinarum against gastric injury induced by indomethacin based on pharmacokinetic tissue distribution and excretion studies in rats"
84
- # result = matcher.search(query, n=100)
85
- # print("query", query)
86
- # for rank, (idx, sim) in enumerate(result):
87
- # print(f"\nTop {rank + 1} [相似度: {sim:.4f}]:")
88
- # print(f"文献 #{idx}: {lists[idx]}")
89
- # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
90
- # time.sleep(100)
91
-
92
- # for query in lists[:1000]:
93
- # indices, similarities = matcher.search(query, n=100)
94
- # print("query", query)
95
- # for rank, (idx, sim) in enumerate(zip(indices, similarities)):
96
- # print(f"\nTop {rank + 1} [相似度: {sim:.4f}]:")
97
- # print(f"文献 #{idx}: {lists[idx]}")
98
- # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
99
- # time.sleep(100)
100
- #
101
- # idx,sim = matcher.batch_search(lists[:1000], n=100)
102
- # for rank, (idxs, sims) in enumerate(zip(idx,sim)):
103
- # tp = (lists[rank],[(lists[idx], sim) for idx,sim in zip(idxs,sims)])
104
- # print(tp)
105
- # time.sleep(100)
1
+ import datetime
2
+ import gzip
3
+ import io
4
+ import time
5
+
6
+ import joblib
7
+ from hdfs import InsecureClient
8
+
9
+ from re_common.v2.baselibrary.helpers.search_packge.fit_text_match import FitTextMatcher
10
+
11
if __name__ == '__main__':
    # Manual smoke test: load a corpus and its key ids from HDFS, fit the
    # matcher, then run batched self-similarity searches over the whole corpus.
    BATCH_SIZE = 10000

    matcher = FitTextMatcher(
        algorithm='auto',  # original note: brute-force ('brute') is fast enough for small data sets
        n_jobs=2  # parallel job count passed to the matcher
    )

    client = InsecureClient("http://VIP-DC-MASTER-2:9870", user="xujiang")

    def load_artifact(name):
        """Download one gzip-compressed joblib artifact from HDFS and deserialize it."""
        fit_file_path = f"/b_task_data/class_smi/fit_file/t_23600_{name}.joblib.gz"
        with client.read(fit_file_path) as reader:
            buffer = io.BytesIO(reader.read())
        # NOTE: joblib.load unpickles its input; only point this at trusted files.
        with gzip.GzipFile(fileobj=buffer, mode='rb') as gz:
            return joblib.load(gz)

    # Only the corpus and its key ids are needed: the model is re-fitted below,
    # so the stored "vectorizer" and "nn_model" artifacts are not downloaded.
    matcher.corpus = load_artifact("corpus")
    matcher.idx = load_artifact("idx")
    matcher.corpus_size = max(len(matcher.corpus), len(matcher.idx))
    print(f"加载bytes完成,共 {matcher.corpus_size} 篇文献")

    matcher.fit(matcher.corpus)

    print(matcher.nn._fit_method)

    print("fit 训练完成")

    n = min(100, matcher.corpus_size)

    def search_batch(batch, start):
        """Search one batch and print its results; *start* is the corpus offset of batch[0]."""
        t1 = time.time()
        index, similarities = matcher.batch_search(batch, n=n)
        for rank, (idxs, sims) in enumerate(zip(index, similarities)):
            # rank is the position inside this batch, so the query's own key id
            # lives at start + rank in matcher.idx.
            print({"keyid": matcher.idx[start + rank],
                   "search_list": [(matcher.idx[idx], sim) for idx, sim in zip(idxs, sims)]})

        t2 = time.time()
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
        print(now, start + len(batch), t2 - t1)

    count = 0
    batch_list = []
    for doc in matcher.corpus:
        count += 1
        batch_list.append(doc)
        if count % BATCH_SIZE == 0:
            search_batch(batch_list, count - len(batch_list))
            batch_list.clear()

    # Flush the final partial batch (corpus size is not always a multiple of BATCH_SIZE).
    if batch_list:
        search_batch(batch_list, count - len(batch_list))
        batch_list.clear()
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+ # with open("t_8220_corpus.joblib","rb") as f:
67
+ # buf = io.BytesIO(f.read())
68
+ # buf.seek(0)
69
+ # corpus = buf
70
+ #
71
+ # matcher.corpus = joblib.load(corpus)
72
+ # print(len(matcher.corpus))
73
+
74
+ # matcher.load_bytes(vec, nn, corpus)
75
+
76
+ # with open(r"C:\Users\Administrator\Desktop\update\part-02440\part-02440_1", "r", encoding="utf-8") as f:
77
+ # lists = [line.strip() for line in f if line]
78
+ #
79
+ # matcher.fit(lists)
80
+
81
+ # matcher.load("./","test")
82
+
83
+ # query = r"herbdrug interaction in the protective effect of alpinia officinarum against gastric injury induced by indomethacin based on pharmacokinetic tissue distribution and excretion studies in rats"
84
+ # result = matcher.search(query, n=100)
85
+ # print("query", query)
86
+ # for rank, (idx, sim) in enumerate(result):
87
+ # print(f"\nTop {rank + 1} [相似度: {sim:.4f}]:")
88
+ # print(f"文献 #{idx}: {lists[idx]}")
89
+ # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
90
+ # time.sleep(100)
91
+
92
+ # for query in lists[:1000]:
93
+ # indices, similarities = matcher.search(query, n=100)
94
+ # print("query", query)
95
+ # for rank, (idx, sim) in enumerate(zip(indices, similarities)):
96
+ # print(f"\nTop {rank + 1} [相似度: {sim:.4f}]:")
97
+ # print(f"文献 #{idx}: {lists[idx]}")
98
+ # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
99
+ # time.sleep(100)
100
+ #
101
+ # idx,sim = matcher.batch_search(lists[:1000], n=100)
102
+ # for rank, (idxs, sims) in enumerate(zip(idx,sim)):
103
+ # tp = (lists[rank],[(lists[idx], sim) for idx,sim in zip(idxs,sims)])
104
+ # print(tp)
105
+ # time.sleep(100)