re-common 10.0.37__py3-none-any.whl → 10.0.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
- re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
- re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
- re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
- re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
- re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/concurrency.py +35 -35
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
- re_common/v2/baselibrary/tools/dict_tools.py +44 -44
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
- re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
- re_common/v2/baselibrary/tools/list_tools.py +69 -69
- re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
- re_common/v2/baselibrary/utils/author_smi.py +361 -361
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +163 -163
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +77 -77
- re_common/v2/baselibrary/utils/db.py +156 -156
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +186 -186
- re_common/v2/baselibrary/utils/string_clear.py +246 -246
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +271 -278
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/LICENSE +201 -201
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/METADATA +16 -16
- re_common-10.0.39.dist-info/RECORD +248 -0
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/WHEEL +1 -1
- re_common-10.0.37.dist-info/RECORD +0 -248
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/top_level.txt +0 -0
|
@@ -1,116 +1,116 @@
|
|
|
1
|
-
import base64
|
|
2
|
-
import hashlib
|
|
3
|
-
import os
|
|
4
|
-
|
|
5
|
-
from re_common.v2.baselibrary.business_utils.baseencodeid import BaseLngid
|
|
6
|
-
|
|
7
|
-
import os
|
|
8
|
-
import base64
|
|
9
|
-
import hashlib
|
|
10
|
-
|
|
11
|
-
"""
|
|
12
|
-
DOI-文件路径 转换工具
|
|
13
|
-
|
|
14
|
-
设计目标:
|
|
15
|
-
1. 将任意DOI字符串转换为可逆、稳定的文件路径
|
|
16
|
-
2. 提供高效的目录分散方案(65,536个子目录)
|
|
17
|
-
3. 支持带文件扩展名的存储
|
|
18
|
-
4. 完全可逆转换
|
|
19
|
-
|
|
20
|
-
工作原理:
|
|
21
|
-
1. DOI编码:
|
|
22
|
-
- 使用URL安全的Base64编码(RFC 3548)
|
|
23
|
-
- 移除Base64填充的'='字符
|
|
24
|
-
- 文件名长度 ≈ 原始DOI长度 × 4/3
|
|
25
|
-
|
|
26
|
-
2. 目录分散:
|
|
27
|
-
- 使用MD5哈希创建两级目录结构
|
|
28
|
-
- 目录层级:/MD5[0:2]/MD5[2:4]/
|
|
29
|
-
- 支持65,536个目录(256×256),每目录约1,525个文件(假设10亿文件)
|
|
30
|
-
|
|
31
|
-
3. 扩展名处理:
|
|
32
|
-
- 保持原始扩展名不变
|
|
33
|
-
- 解码时自动忽略扩展名
|
|
34
|
-
|
|
35
|
-
典型转换示例:
|
|
36
|
-
DOI: "10.1000/xyz123" -> 路径: "a1/b2/QTMuMTAwMC94eXoxMjM.pdf"
|
|
37
|
-
路径: "a1/b2/QTMuMTAwMC94eXoxMjM.pdf" -> DOI: "10.1000/xyz123"
|
|
38
|
-
"""
|
|
39
|
-
|
|
40
|
-
base_lngid = BaseLngid()
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
# 以后需要启用
|
|
44
|
-
def doi_to_path(doi: str, ext: str = "") -> str:
|
|
45
|
-
"""
|
|
46
|
-
将 DOI 转换为可逆的存储路径:
|
|
47
|
-
1. 对 DOI 进行 URL 安全的 Base64 编码(可逆)
|
|
48
|
-
2. 生成 DOI 的 MD5 哈希用于目录分散
|
|
49
|
-
3. 目录结构:MD5前2字符/次2字符/
|
|
50
|
-
4. 文件名:Base64编码的DOI + 扩展名
|
|
51
|
-
|
|
52
|
-
Args:
|
|
53
|
-
doi: 文件 DOI 标识符
|
|
54
|
-
ext: 文件扩展名(如 '.pdf')
|
|
55
|
-
|
|
56
|
-
Returns:
|
|
57
|
-
相对文件路径(如 'a1/b2/QTMuMTAwMC94eXoxMjM=.pdf')
|
|
58
|
-
"""
|
|
59
|
-
# URL安全的Base64编码(可逆)
|
|
60
|
-
doi_b64 = base64.urlsafe_b64encode(doi.encode("utf-8")).decode("ascii").rstrip("=")
|
|
61
|
-
|
|
62
|
-
# 生成MD5哈希用于目录分配
|
|
63
|
-
hash_md5 = hashlib.md5(doi.encode("utf-8")).hexdigest()
|
|
64
|
-
dir_level1 = hash_md5[0:2]
|
|
65
|
-
dir_level2 = hash_md5[2:4]
|
|
66
|
-
|
|
67
|
-
return os.path.join(dir_level1, dir_level2, f"{doi_b64}{ext}")
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
# 以后需要启用
|
|
71
|
-
def path_to_doi(path: str) -> str:
|
|
72
|
-
"""
|
|
73
|
-
从文件路径反推原始DOI
|
|
74
|
-
Args:
|
|
75
|
-
path: 文件路径(如 'a1/b2/QTMuMTAwMC94eXoxMjM=.pdf')
|
|
76
|
-
|
|
77
|
-
Returns:
|
|
78
|
-
原始DOI字符串
|
|
79
|
-
"""
|
|
80
|
-
# 提取文件名并移除扩展名
|
|
81
|
-
filename = os.path.basename(path)
|
|
82
|
-
base_name = os.path.splitext(filename)[0]
|
|
83
|
-
|
|
84
|
-
# 补齐Base64填充字符
|
|
85
|
-
padding = 4 - (len(base_name) % 4)
|
|
86
|
-
if padding != 4: # 不需要补齐
|
|
87
|
-
base_name += "=" * padding
|
|
88
|
-
|
|
89
|
-
# Base64解码还原DOI
|
|
90
|
-
return base64.urlsafe_b64decode(base_name.encode("ascii")).decode("utf-8")
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def doi_to_dir(doi):
|
|
94
|
-
"""生成文件的存储路径和可解码的文件名
|
|
95
|
-
|
|
96
|
-
Args:
|
|
97
|
-
doi (str): 文件的唯一DOI标识
|
|
98
|
-
|
|
99
|
-
Returns:
|
|
100
|
-
str: 文件相对路径,如 "ab/cd/Base64EncodedFileName"
|
|
101
|
-
"""
|
|
102
|
-
# 计算DOI的MD5哈希
|
|
103
|
-
hash_md5 = hashlib.md5(doi.encode('utf-8')).hexdigest().lower()
|
|
104
|
-
|
|
105
|
-
# 提取目录层级:前2位作为一级目录,3-4位作为二级目录
|
|
106
|
-
first_dir = hash_md5[0:2].upper()
|
|
107
|
-
second_dir = hash_md5[2:4].upper()
|
|
108
|
-
|
|
109
|
-
return first_dir + "/" + second_dir
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def get_doi_path(doi, case_insensitive=False):
|
|
113
|
-
# 目前使用
|
|
114
|
-
dir_path = doi_to_dir(doi)
|
|
115
|
-
file_name = base_lngid.getDoiid(doi, case_insensitive=case_insensitive) + ".pdf"
|
|
116
|
-
return dir_path + "/" + file_name
|
|
1
|
+
import base64
|
|
2
|
+
import hashlib
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from re_common.v2.baselibrary.business_utils.baseencodeid import BaseLngid
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import base64
|
|
9
|
+
import hashlib
|
|
10
|
+
|
|
11
|
+
"""
|
|
12
|
+
DOI-文件路径 转换工具
|
|
13
|
+
|
|
14
|
+
设计目标:
|
|
15
|
+
1. 将任意DOI字符串转换为可逆、稳定的文件路径
|
|
16
|
+
2. 提供高效的目录分散方案(65,536个子目录)
|
|
17
|
+
3. 支持带文件扩展名的存储
|
|
18
|
+
4. 完全可逆转换
|
|
19
|
+
|
|
20
|
+
工作原理:
|
|
21
|
+
1. DOI编码:
|
|
22
|
+
- 使用URL安全的Base64编码(RFC 3548)
|
|
23
|
+
- 移除Base64填充的'='字符
|
|
24
|
+
- 文件名长度 ≈ 原始DOI长度 × 4/3
|
|
25
|
+
|
|
26
|
+
2. 目录分散:
|
|
27
|
+
- 使用MD5哈希创建两级目录结构
|
|
28
|
+
- 目录层级:/MD5[0:2]/MD5[2:4]/
|
|
29
|
+
- 支持65,536个目录(256×256),每目录约1,525个文件(假设10亿文件)
|
|
30
|
+
|
|
31
|
+
3. 扩展名处理:
|
|
32
|
+
- 保持原始扩展名不变
|
|
33
|
+
- 解码时自动忽略扩展名
|
|
34
|
+
|
|
35
|
+
典型转换示例:
|
|
36
|
+
DOI: "10.1000/xyz123" -> 路径: "a1/b2/QTMuMTAwMC94eXoxMjM.pdf"
|
|
37
|
+
路径: "a1/b2/QTMuMTAwMC94eXoxMjM.pdf" -> DOI: "10.1000/xyz123"
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
base_lngid = BaseLngid()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# 以后需要启用
|
|
44
|
+
def doi_to_path(doi: str, ext: str = "") -> str:
    """Map a DOI to a reversible relative storage path.

    The file name is the URL-safe Base64 encoding of the DOI with the
    '=' padding stripped, so the DOI can always be recovered from the
    path.  Two directory levels taken from the DOI's MD5 digest spread
    files over 65,536 (256 x 256) buckets.

    Args:
        doi: DOI identifier of the file.
        ext: optional file extension, e.g. '.pdf'.

    Returns:
        Relative path such as 'a1/b2/QTMuMTAwMC94eXoxMjM.pdf'.
    """
    raw = doi.encode("utf-8")

    # Reversible, filesystem-safe file name (Base64 padding removed).
    encoded_name = base64.urlsafe_b64encode(raw).decode("ascii").rstrip("=")

    # The MD5 digest only drives directory fan-out; it is never decoded.
    digest = hashlib.md5(raw).hexdigest()

    return os.path.join(digest[:2], digest[2:4], f"{encoded_name}{ext}")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# 以后需要启用
|
|
71
|
+
def path_to_doi(path: str) -> str:
    """Recover the original DOI from a path built by ``doi_to_path``.

    Args:
        path: file path such as 'a1/b2/QTMuMTAwMC94eXoxMjM=.pdf'.

    Returns:
        The decoded DOI string.
    """
    # The directory levels come from a hash and carry no information;
    # only the file name without its extension matters.
    stem = os.path.splitext(os.path.basename(path))[0]

    # Restore the Base64 '=' padding that was stripped during encoding.
    remainder = len(stem) % 4
    if remainder:
        stem += "=" * (4 - remainder)

    return base64.urlsafe_b64decode(stem.encode("ascii")).decode("utf-8")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def doi_to_dir(doi):
    """Return the two-level bucket directory for a DOI.

    Args:
        doi (str): unique DOI identifier of the file.

    Returns:
        str: relative directory such as "AB/CD" (upper-case hex pairs
        taken from the DOI's MD5 digest).
    """
    # Lower-case the digest first, then upper-case the slices — kept
    # exactly as-is to match the layout of files already on disk.
    digest = hashlib.md5(doi.encode('utf-8')).hexdigest().lower()
    return "{}/{}".format(digest[0:2].upper(), digest[2:4].upper())
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def get_doi_path(doi, case_insensitive=False):
    """Currently-used mapping: MD5 bucket directory + lngid file name.

    Unlike ``doi_to_path``, the file name comes from the project's
    ``BaseLngid.getDoiid`` encoder rather than Base64, with a fixed
    '.pdf' extension.

    Args:
        doi: DOI identifier of the file.
        case_insensitive: forwarded to ``BaseLngid.getDoiid``.

    Returns:
        Relative path "XX/YY/<doiid>.pdf".
    """
    bucket = doi_to_dir(doi)
    name = base_lngid.getDoiid(doi, case_insensitive=case_insensitive)
    return "{}/{}.pdf".format(bucket, name)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
def assign_group_id(rows: list, sub_db_order: list):
|
|
2
|
-
subdb_keyid_map = {row.sub_db_id: row.keyid for row in rows}
|
|
3
|
-
for sub_db_id in sub_db_order:
|
|
4
|
-
if keyid := subdb_keyid_map.get(sub_db_id):
|
|
5
|
-
return keyid, len(rows), rows
|
|
6
|
-
return rows[0].keyid, len(rows), rows
|
|
1
|
+
def assign_group_id(rows: list, sub_db_order: list):
    """Pick a representative keyid for a group of duplicate rows.

    Walks ``sub_db_order`` and returns the keyid of the first preferred
    sub-database present in ``rows`` (rows with falsy keyids are
    skipped); falls back to the first row's keyid when none match.

    Args:
        rows: row objects exposing ``sub_db_id`` and ``keyid``.
        sub_db_order: sub-database ids in descending priority.

    Returns:
        Tuple of (chosen keyid, group size, original rows).
    """
    by_sub_db = {row.sub_db_id: row.keyid for row in rows}
    chosen = next(
        (by_sub_db[sid] for sid in sub_db_order if by_sub_db.get(sid)),
        rows[0].keyid,
    )
    return chosen, len(rows), rows
|
|
@@ -1,59 +1,59 @@
|
|
|
1
|
-
import warnings
|
|
2
|
-
import functools
|
|
3
|
-
|
|
4
|
-
# 全局集合,用于记录已警告的函数或类
|
|
5
|
-
_warned_once = set()
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def deprecated(message=None):
|
|
9
|
-
"""
|
|
10
|
-
装饰器:标记函数或类为已废弃,整个进程只发出一次警告。
|
|
11
|
-
|
|
12
|
-
Args:
|
|
13
|
-
message (str): 自定义警告信息,默认为 None。
|
|
14
|
-
"""
|
|
15
|
-
|
|
16
|
-
def decorator(obj):
|
|
17
|
-
# 如果是函数
|
|
18
|
-
if isinstance(obj, type(lambda: None)):
|
|
19
|
-
@functools.wraps(obj)
|
|
20
|
-
def wrapper(*args, **kwargs):
|
|
21
|
-
obj_id = id(obj) # 使用对象的内存地址作为唯一标识
|
|
22
|
-
if obj_id not in _warned_once:
|
|
23
|
-
default_msg = f"函数 {obj.__name__} 已不建议使用。"
|
|
24
|
-
warn_msg = f"{default_msg} {message}" if message else default_msg
|
|
25
|
-
warnings.warn(
|
|
26
|
-
warn_msg,
|
|
27
|
-
category=DeprecationWarning,
|
|
28
|
-
stacklevel=2
|
|
29
|
-
)
|
|
30
|
-
_warned_once.add(obj_id) # 记录已警告
|
|
31
|
-
return obj(*args, **kwargs)
|
|
32
|
-
|
|
33
|
-
return wrapper
|
|
34
|
-
|
|
35
|
-
# 如果是类
|
|
36
|
-
elif isinstance(obj, type):
|
|
37
|
-
orig_init = obj.__init__
|
|
38
|
-
|
|
39
|
-
@functools.wraps(orig_init)
|
|
40
|
-
def new_init(self, *args, **kwargs):
|
|
41
|
-
obj_id = id(obj)
|
|
42
|
-
if obj_id not in _warned_once:
|
|
43
|
-
default_msg = f"类 {obj.__name__} 已不建议使用。"
|
|
44
|
-
warn_msg = f"{default_msg} {message}" if message else default_msg
|
|
45
|
-
warnings.warn(
|
|
46
|
-
warn_msg,
|
|
47
|
-
category=DeprecationWarning,
|
|
48
|
-
stacklevel=2
|
|
49
|
-
)
|
|
50
|
-
_warned_once.add(obj_id) # 记录已警告
|
|
51
|
-
orig_init(self, *args, **kwargs)
|
|
52
|
-
|
|
53
|
-
obj.__init__ = new_init
|
|
54
|
-
return obj
|
|
55
|
-
|
|
56
|
-
else:
|
|
57
|
-
raise TypeError("此装饰器仅适用于函数和类")
|
|
58
|
-
|
|
59
|
-
return decorator
|
|
1
|
+
import warnings
|
|
2
|
+
import functools
|
|
3
|
+
|
|
4
|
+
# Module-level registry of object ids that have already emitted their warning.
_warned_once = set()


def deprecated(message=None):
    """Decorator marking a function or class as deprecated.

    Emits a ``DeprecationWarning`` at most once per process for each
    decorated object (tracked by ``id()`` in ``_warned_once``).

    Args:
        message (str): optional extra text appended to the default
            warning message.  Defaults to None.
    """

    def decorator(obj):
        # Plain function: warn (once) right before the first call.
        if isinstance(obj, type(lambda: None)):
            @functools.wraps(obj)
            def wrapper(*args, **kwargs):
                key = id(obj)
                if key not in _warned_once:
                    default_msg = f"函数 {obj.__name__} 已不建议使用。"
                    text = f"{default_msg} {message}" if message else default_msg
                    # stacklevel=2 points the warning at the caller.
                    warnings.warn(text, category=DeprecationWarning, stacklevel=2)
                    _warned_once.add(key)
                return obj(*args, **kwargs)

            return wrapper

        # Class: patch __init__ so the first instantiation warns.
        if isinstance(obj, type):
            orig_init = obj.__init__

            @functools.wraps(orig_init)
            def new_init(self, *args, **kwargs):
                key = id(obj)
                if key not in _warned_once:
                    default_msg = f"类 {obj.__name__} 已不建议使用。"
                    text = f"{default_msg} {message}" if message else default_msg
                    warnings.warn(text, category=DeprecationWarning, stacklevel=2)
                    _warned_once.add(key)
                orig_init(self, *args, **kwargs)

            obj.__init__ = new_init
            return obj

        raise TypeError("此装饰器仅适用于函数和类")

    return decorator
|
|
@@ -1,105 +1,105 @@
|
|
|
1
|
-
import datetime
|
|
2
|
-
import gzip
|
|
3
|
-
import io
|
|
4
|
-
import time
|
|
5
|
-
|
|
6
|
-
import joblib
|
|
7
|
-
from hdfs import InsecureClient
|
|
8
|
-
|
|
9
|
-
from re_common.v2.baselibrary.helpers.search_packge.fit_text_match import FitTextMatcher
|
|
10
|
-
|
|
11
|
-
if __name__ == '__main__':
|
|
12
|
-
matcher = FitTextMatcher(
|
|
13
|
-
algorithm='auto', # 对于小数据集,暴力搜索足够快 brute
|
|
14
|
-
n_jobs=2 # 使用所有CPU核心
|
|
15
|
-
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
client = InsecureClient("http://VIP-DC-MASTER-2:9870", user="xujiang")
|
|
19
|
-
|
|
20
|
-
lists = []
|
|
21
|
-
for i in ["vectorizer", "nn_model", "corpus", "idx"]:
|
|
22
|
-
fit_file_path = f"/b_task_data/class_smi/fit_file/t_23600_{i}.joblib.gz"
|
|
23
|
-
with client.read(fit_file_path) as reader:
|
|
24
|
-
tp = io.BytesIO(reader.read())
|
|
25
|
-
tp.seek(0)
|
|
26
|
-
lists.append(tp)
|
|
27
|
-
|
|
28
|
-
with gzip.GzipFile(fileobj=lists[2], mode='rb') as gz:
|
|
29
|
-
matcher.corpus = joblib.load(gz)
|
|
30
|
-
|
|
31
|
-
with gzip.GzipFile(fileobj=lists[3], mode='rb') as gz:
|
|
32
|
-
matcher.idx = joblib.load(gz)
|
|
33
|
-
matcher.corpus_size = max(len(matcher.corpus), len(matcher.idx))
|
|
34
|
-
print(f"加载bytes完成,共 {matcher.corpus_size} 篇文献")
|
|
35
|
-
|
|
36
|
-
matcher.fit(matcher.corpus)
|
|
37
|
-
|
|
38
|
-
print(matcher.nn._fit_method)
|
|
39
|
-
|
|
40
|
-
print("fit 训练完成")
|
|
41
|
-
|
|
42
|
-
count = 0
|
|
43
|
-
bacth_list = []
|
|
44
|
-
n = min(100, matcher.corpus_size)
|
|
45
|
-
for i in matcher.corpus:
|
|
46
|
-
count = count + 1
|
|
47
|
-
bacth_list.append(i)
|
|
48
|
-
if count % 10000 == 0:
|
|
49
|
-
t1 = time.time()
|
|
50
|
-
index, similarities = matcher.batch_search(bacth_list, n=n)
|
|
51
|
-
for rank, (idxs, sims) in enumerate(zip(index, similarities)):
|
|
52
|
-
print({"keyid": matcher.idx[rank],
|
|
53
|
-
"search_list": [(matcher.idx[idx], sim) for idx, sim in zip(idxs, sims)]})
|
|
54
|
-
|
|
55
|
-
t2 = time.time()
|
|
56
|
-
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
|
|
57
|
-
print(now, count, t2 - t1)
|
|
58
|
-
bacth_list.clear()
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
# with open("t_8220_corpus.joblib","rb") as f:
|
|
67
|
-
# buf = io.BytesIO(f.read())
|
|
68
|
-
# buf.seek(0)
|
|
69
|
-
# corpus = buf
|
|
70
|
-
#
|
|
71
|
-
# matcher.corpus = joblib.load(corpus)
|
|
72
|
-
# print(len(matcher.corpus))
|
|
73
|
-
|
|
74
|
-
# matcher.load_bytes(vec, nn, corpus)
|
|
75
|
-
|
|
76
|
-
# with open(r"C:\Users\Administrator\Desktop\update\part-02440\part-02440_1", "r", encoding="utf-8") as f:
|
|
77
|
-
# lists = [line.strip() for line in f if line]
|
|
78
|
-
#
|
|
79
|
-
# matcher.fit(lists)
|
|
80
|
-
|
|
81
|
-
# matcher.load("./","test")
|
|
82
|
-
|
|
83
|
-
# query = r"herbdrug interaction in the protective effect of alpinia officinarum against gastric injury induced by indomethacin based on pharmacokinetic tissue distribution and excretion studies in rats"
|
|
84
|
-
# result = matcher.search(query, n=100)
|
|
85
|
-
# print("query", query)
|
|
86
|
-
# for rank, (idx, sim) in enumerate(result):
|
|
87
|
-
# print(f"\nTop {rank + 1} [相似度: {sim:.4f}]:")
|
|
88
|
-
# print(f"文献 #{idx}: {lists[idx]}")
|
|
89
|
-
# print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
|
|
90
|
-
# time.sleep(100)
|
|
91
|
-
|
|
92
|
-
# for query in lists[:1000]:
|
|
93
|
-
# indices, similarities = matcher.search(query, n=100)
|
|
94
|
-
# print("query", query)
|
|
95
|
-
# for rank, (idx, sim) in enumerate(zip(indices, similarities)):
|
|
96
|
-
# print(f"\nTop {rank + 1} [相似度: {sim:.4f}]:")
|
|
97
|
-
# print(f"文献 #{idx}: {lists[idx]}")
|
|
98
|
-
# print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
|
|
99
|
-
# time.sleep(100)
|
|
100
|
-
#
|
|
101
|
-
# idx,sim = matcher.batch_search(lists[:1000], n=100)
|
|
102
|
-
# for rank, (idxs, sims) in enumerate(zip(idx,sim)):
|
|
103
|
-
# tp = (lists[rank],[(lists[idx], sim) for idx,sim in zip(idxs,sims)])
|
|
104
|
-
# print(tp)
|
|
105
|
-
# time.sleep(100)
|
|
1
|
+
import datetime
import gzip
import io
import time

import joblib
from hdfs import InsecureClient

from re_common.v2.baselibrary.helpers.search_packge.fit_text_match import FitTextMatcher

# Manual driver script: loads a pre-built FitTextMatcher state from HDFS,
# re-fits the model on the stored corpus, then runs batched self-similarity
# searches over the corpus, printing timings every 10,000 documents.
if __name__ == '__main__':
    matcher = FitTextMatcher(
        algorithm='auto',  # for small datasets brute-force search is fast enough ('brute')
        n_jobs=2  # parallel workers for the neighbor search

    )

    # NOTE(review): hard-coded HDFS namenode and user — environment-specific.
    client = InsecureClient("http://VIP-DC-MASTER-2:9870", user="xujiang")

    # Download the four gzipped joblib artifacts into in-memory buffers,
    # in this fixed order: vectorizer, nn_model, corpus, idx.
    lists = []
    for i in ["vectorizer", "nn_model", "corpus", "idx"]:
        fit_file_path = f"/b_task_data/class_smi/fit_file/t_23600_{i}.joblib.gz"
        with client.read(fit_file_path) as reader:
            tp = io.BytesIO(reader.read())
            tp.seek(0)
            lists.append(tp)

    # Only corpus (index 2) and idx (index 3) are loaded; the model is
    # re-fit below instead of restoring vectorizer/nn_model.
    with gzip.GzipFile(fileobj=lists[2], mode='rb') as gz:
        matcher.corpus = joblib.load(gz)

    with gzip.GzipFile(fileobj=lists[3], mode='rb') as gz:
        matcher.idx = joblib.load(gz)
    matcher.corpus_size = max(len(matcher.corpus), len(matcher.idx))
    print(f"加载bytes完成,共 {matcher.corpus_size} 篇文献")

    # Rebuild the vectorizer / nearest-neighbor index from the corpus.
    matcher.fit(matcher.corpus)

    # Show which fit method sklearn actually chose (e.g. 'brute').
    print(matcher.nn._fit_method)

    print("fit 训练完成")

    # Self-search the corpus in batches of 10,000, printing each batch's
    # wall-clock time; any trailing partial batch is not searched.
    count = 0
    bacth_list = []
    n = min(100, matcher.corpus_size)
    for i in matcher.corpus:
        count = count + 1
        bacth_list.append(i)
        if count % 10000 == 0:
            t1 = time.time()
            index, similarities = matcher.batch_search(bacth_list, n=n)
            # NOTE(review): `rank` restarts at 0 per batch while matcher.idx
            # is global — presumably only valid for the first batch; confirm.
            for rank, (idxs, sims) in enumerate(zip(index, similarities)):
                print({"keyid": matcher.idx[rank],
                       "search_list": [(matcher.idx[idx], sim) for idx, sim in zip(idxs, sims)]})

            t2 = time.time()
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
            print(now, count, t2 - t1)
            bacth_list.clear()






    # with open("t_8220_corpus.joblib","rb") as f:
    #     buf = io.BytesIO(f.read())
    #     buf.seek(0)
    #     corpus = buf
    #
    # matcher.corpus = joblib.load(corpus)
    # print(len(matcher.corpus))

    # matcher.load_bytes(vec, nn, corpus)

    # with open(r"C:\Users\Administrator\Desktop\update\part-02440\part-02440_1", "r", encoding="utf-8") as f:
    #     lists = [line.strip() for line in f if line]
    #
    # matcher.fit(lists)

    # matcher.load("./","test")

    # query = r"herbdrug interaction in the protective effect of alpinia officinarum against gastric injury induced by indomethacin based on pharmacokinetic tissue distribution and excretion studies in rats"
    # result = matcher.search(query, n=100)
    # print("query", query)
    # for rank, (idx, sim) in enumerate(result):
    #     print(f"\nTop {rank + 1} [相似度: {sim:.4f}]:")
    #     print(f"文献 #{idx}: {lists[idx]}")
    # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    # time.sleep(100)

    # for query in lists[:1000]:
    #     indices, similarities = matcher.search(query, n=100)
    #     print("query", query)
    #     for rank, (idx, sim) in enumerate(zip(indices, similarities)):
    #         print(f"\nTop {rank + 1} [相似度: {sim:.4f}]:")
    #         print(f"文献 #{idx}: {lists[idx]}")
    #     print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    #     time.sleep(100)
    #
    # idx,sim = matcher.batch_search(lists[:1000], n=100)
    # for rank, (idxs, sims) in enumerate(zip(idx,sim)):
    #     tp = (lists[rank],[(lists[idx], sim) for idx,sim in zip(idxs,sims)])
    #     print(tp)
    # time.sleep(100)
|